Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 105
-rw-r--r--  mm/Kconfig.debug | 2
-rw-r--r--  mm/Makefile | 8
-rw-r--r--  mm/backing-dev.c | 139
-rw-r--r--  mm/cma_debug.c | 5
-rw-r--r--  mm/compaction.c | 71
-rw-r--r--  mm/damon/Kconfig | 3
-rw-r--r--  mm/damon/Makefile | 6
-rw-r--r--  mm/damon/core-test.h | 29
-rw-r--r--  mm/damon/core.c | 560
-rw-r--r--  mm/damon/dbgfs.c | 78
-rw-r--r--  mm/damon/lru_sort.c | 451
-rw-r--r--  mm/damon/modules-common.c | 42
-rw-r--r--  mm/damon/modules-common.h | 49
-rw-r--r--  mm/damon/ops-common.c | 50
-rw-r--r--  mm/damon/ops-common.h | 2
-rw-r--r--  mm/damon/paddr.c | 43
-rw-r--r--  mm/damon/reclaim.c | 362
-rw-r--r--  mm/damon/sysfs-common.c | 107
-rw-r--r--  mm/damon/sysfs-common.h | 56
-rw-r--r--  mm/damon/sysfs-schemes.c | 1338
-rw-r--r--  mm/damon/sysfs.c | 1326
-rw-r--r--  mm/damon/vaddr-test.h | 36
-rw-r--r--  mm/damon/vaddr.c | 106
-rw-r--r--  mm/debug.c | 19
-rw-r--r--  mm/debug_vm_pgtable.c | 40
-rw-r--r--  mm/fadvise.c | 2
-rw-r--r--  mm/failslab.c | 12
-rw-r--r--  mm/filemap.c | 220
-rw-r--r--  mm/folio-compat.c | 25
-rw-r--r--  mm/frontswap.c | 3
-rw-r--r--  mm/gup.c | 605
-rw-r--r--  mm/gup_test.c | 145
-rw-r--r--  mm/gup_test.h | 12
-rw-r--r--  mm/highmem.c | 43
-rw-r--r--  mm/hmm.c | 5
-rw-r--r--  mm/huge_memory.c | 396
-rw-r--r--  mm/hugetlb.c | 1576
-rw-r--r--  mm/hugetlb_cgroup.c | 90
-rw-r--r--  mm/hugetlb_vmemmap.c | 56
-rw-r--r--  mm/hwpoison-inject.c | 4
-rw-r--r--  mm/init-mm.c | 4
-rw-r--r--  mm/internal.h | 60
-rw-r--r--  mm/kasan/Makefile | 8
-rw-r--r--  mm/kasan/common.c | 177
-rw-r--r--  mm/kasan/generic.c | 161
-rw-r--r--  mm/kasan/hw_tags.c | 39
-rw-r--r--  mm/kasan/kasan.h | 185
-rw-r--r--  mm/kasan/kasan_test.c | 1570
-rw-r--r--  mm/kasan/kasan_test_module.c | 81
-rw-r--r--  mm/kasan/report.c | 206
-rw-r--r--  mm/kasan/report_generic.c | 46
-rw-r--r--  mm/kasan/report_tags.c | 123
-rw-r--r--  mm/kasan/shadow.c | 2
-rw-r--r--  mm/kasan/sw_tags.c | 5
-rw-r--r--  mm/kasan/tags.c | 143
-rw-r--r--  mm/kfence/core.c | 40
-rw-r--r--  mm/kfence/kfence_test.c | 4
-rw-r--r--  mm/kfence/report.c | 15
-rw-r--r--  mm/khugepaged.c | 1294
-rw-r--r--  mm/kmemleak.c | 130
-rw-r--r--  mm/kmsan/Makefile | 28
-rw-r--r--  mm/kmsan/core.c | 450
-rw-r--r--  mm/kmsan/hooks.c | 385
-rw-r--r--  mm/kmsan/init.c | 235
-rw-r--r--  mm/kmsan/instrumentation.c | 310
-rw-r--r--  mm/kmsan/kmsan.h | 211
-rw-r--r--  mm/kmsan/kmsan_test.c | 585
-rw-r--r--  mm/kmsan/report.c | 219
-rw-r--r--  mm/kmsan/shadow.c | 299
-rw-r--r--  mm/ksm.c | 462
-rw-r--r--  mm/maccess.c | 2
-rw-r--r--  mm/madvise.c | 148
-rw-r--r--  mm/mapping_dirty_helpers.c | 2
-rw-r--r--  mm/memblock.c | 6
-rw-r--r--  mm/memcontrol.c | 435
-rw-r--r--  mm/memory-failure.c | 316
-rw-r--r--  mm/memory-tiers.c | 732
-rw-r--r--  mm/memory.c | 581
-rw-r--r--  mm/memory_hotplug.c | 11
-rw-r--r--  mm/mempolicy.c | 66
-rw-r--r--  mm/mempool.c | 18
-rw-r--r--  mm/memremap.c | 33
-rw-r--r--  mm/migrate.c | 943
-rw-r--r--  mm/migrate_device.c | 280
-rw-r--r--  mm/mincore.c | 14
-rw-r--r--  mm/mlock.c | 37
-rw-r--r--  mm/mm_init.c | 14
-rw-r--r--  mm/mm_slot.h | 55
-rw-r--r--  mm/mmap.c | 2261
-rw-r--r--  mm/mmu_gather.c | 52
-rw-r--r--  mm/mmzone.c | 2
-rw-r--r--  mm/mprotect.c | 72
-rw-r--r--  mm/mremap.c | 42
-rw-r--r--  mm/msync.c | 2
-rw-r--r--  mm/nommu.c | 260
-rw-r--r--  mm/oom_kill.c | 11
-rw-r--r--  mm/page-writeback.c | 131
-rw-r--r--  mm/page_alloc.c | 472
-rw-r--r--  mm/page_counter.c | 15
-rw-r--r--  mm/page_ext.c | 119
-rw-r--r--  mm/page_io.c | 47
-rw-r--r--  mm/page_isolation.c | 42
-rw-r--r--  mm/page_owner.c | 103
-rw-r--r--  mm/page_reporting.c | 50
-rw-r--r--  mm/page_table_check.c | 17
-rw-r--r--  mm/page_vma_mapped.c | 6
-rw-r--r--  mm/pagewalk.c | 39
-rw-r--r--  mm/percpu.c | 44
-rw-r--r--  mm/process_vm_access.c | 2
-rw-r--r--  mm/readahead.c | 22
-rw-r--r--  mm/rmap.c | 516
-rw-r--r--  mm/rodata_test.c | 8
-rw-r--r--  mm/secretmem.c | 10
-rw-r--r--  mm/shmem.c | 523
-rw-r--r--  mm/shuffle.c | 21
-rw-r--r--  mm/slab.c | 457
-rw-r--r--  mm/slab.h | 97
-rw-r--r--  mm/slab_common.c | 309
-rw-r--r--  mm/slob.c | 45
-rw-r--r--  mm/slub.c | 1341
-rw-r--r--  mm/sparse-vmemmap.c | 73
-rw-r--r--  mm/sparse.c | 2
-rw-r--r--  mm/swap.c | 114
-rw-r--r--  mm/swap.h | 26
-rw-r--r--  mm/swap_cgroup.c | 6
-rw-r--r--  mm/swap_slots.c | 2
-rw-r--r--  mm/swap_state.c | 152
-rw-r--r--  mm/swapfile.c | 195
-rw-r--r--  mm/truncate.c | 34
-rw-r--r--  mm/usercopy.c | 3
-rw-r--r--  mm/userfaultfd.c | 63
-rw-r--r--  mm/util.c | 117
-rw-r--r--  mm/vmacache.c | 117
-rw-r--r--  mm/vmalloc.c | 55
-rw-r--r--  mm/vmscan.c | 3472
-rw-r--r--  mm/vmstat.c | 49
-rw-r--r--  mm/workingset.c | 117
-rw-r--r--  mm/z3fold.c | 36
-rw-r--r--  mm/zbud.c | 32
-rw-r--r--  mm/zpool.c | 17
-rw-r--r--  mm/zsmalloc.c | 384
-rw-r--r--  mm/zswap.c | 41
143 files changed, 21129 insertions, 10513 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 0331f1461f81..ff7b209dec05 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -23,7 +23,7 @@ menuconfig SWAP
in your computer. If unsure say Y.
config ZSWAP
- bool "Compressed cache for swap pages (EXPERIMENTAL)"
+ bool "Compressed cache for swap pages"
depends on SWAP
select FRONTSWAP
select CRYPTO
@@ -36,12 +36,6 @@ config ZSWAP
in the case where decompressing from RAM is faster than swap device
reads, can also improve workload performance.
- This is marked experimental because it is a new feature (as of
- v3.11) that interacts heavily with memory reclaim. While these
- interactions don't cause any known issues on simple memory setups,
- they have not be fully explored on the large set of potential
- configurations and workloads that exist.
-
config ZSWAP_DEFAULT_ON
bool "Enable the compressed cache for swap pages by default"
depends on ZSWAP
@@ -225,17 +219,43 @@ config SLUB
and has enhanced diagnostics. SLUB is the default choice for
a slab allocator.
-config SLOB
+config SLOB_DEPRECATED
depends on EXPERT
- bool "SLOB (Simple Allocator)"
+ bool "SLOB (Simple Allocator - DEPRECATED)"
depends on !PREEMPT_RT
help
+ Deprecated and scheduled for removal in a few cycles. SLUB
+ recommended as replacement. CONFIG_SLUB_TINY can be considered
+ on systems with 16MB or less RAM.
+
+ If you need SLOB to stay, please contact linux-mm@kvack.org and
+ people listed in the SLAB ALLOCATOR section of MAINTAINERS file,
+ with your use case.
+
SLOB replaces the stock allocator with a drastically simpler
allocator. SLOB is generally more space efficient but
does not perform as well on large systems.
endchoice
+config SLOB
+ bool
+ default y
+ depends on SLOB_DEPRECATED
+
+config SLUB_TINY
+ bool "Configure SLUB for minimal memory footprint"
+ depends on SLUB && EXPERT
+ select SLAB_MERGE_DEFAULT
+ help
+ Configures the SLUB allocator in a way to achieve minimal memory
+ footprint, sacrificing scalability, debugging and other features.
+ This is intended only for the smallest system that had used the
+ SLOB allocator and is not recommended for systems with more than
+ 16MB RAM.
+
+ If unsure, say N.
+
config SLAB_MERGE_DEFAULT
bool "Allow slab caches to be merged"
default y
@@ -253,7 +273,7 @@ config SLAB_MERGE_DEFAULT
config SLAB_FREELIST_RANDOM
bool "Randomize slab freelist"
- depends on SLAB || SLUB
+ depends on SLAB || (SLUB && !SLUB_TINY)
help
Randomizes the freelist order used on creating new pages. This
security feature reduces the predictability of the kernel slab
@@ -261,7 +281,7 @@ config SLAB_FREELIST_RANDOM
config SLAB_FREELIST_HARDENED
bool "Harden slab freelist metadata"
- depends on SLAB || SLUB
+ depends on SLAB || (SLUB && !SLUB_TINY)
help
Many kernel heap attacks try to target slab cache metadata and
other infrastructure. This options makes minor performance
@@ -273,7 +293,7 @@ config SLAB_FREELIST_HARDENED
config SLUB_STATS
default n
bool "Enable SLUB performance statistics"
- depends on SLUB && SYSFS
+ depends on SLUB && SYSFS && !SLUB_TINY
help
SLUB statistics are useful to debug SLUBs allocation behavior in
order find ways to optimize the allocator. This should never be
@@ -285,7 +305,7 @@ config SLUB_STATS
config SLUB_CPU_PARTIAL
default y
- depends on SLUB && SMP
+ depends on SLUB && SMP && !SLUB_TINY
bool "SLUB per cpu partial cache"
help
Per cpu partial caches accelerate objects allocation and freeing
@@ -579,6 +599,12 @@ config COMPACTION
it and then we would be really interested to hear about that at
linux-mm@kvack.org.
+config COMPACT_UNEVICTABLE_DEFAULT
+ int
+ depends on COMPACTION
+ default 0 if PREEMPT_RT
+ default 1
+
#
# support for free page reporting
config PAGE_REPORTING
@@ -775,7 +801,7 @@ endchoice
config THP_SWAP
def_bool y
- depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
+ depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT
help
Swap transparent huge pages in one piece, without splitting.
XXX: For now, swap cluster backing transparent huge page
@@ -1005,6 +1031,14 @@ config ARCH_USES_HIGH_VMA_FLAGS
config ARCH_HAS_PKEYS
bool
+config ARCH_USES_PG_ARCH_X
+ bool
+ help
+ Enable the definition of PG_arch_x page flags with x > 1. Only
+ suitable for 64-bit architectures with CONFIG_FLATMEM or
+ CONFIG_SPARSEMEM_VMEMMAP enabled, otherwise there may not be
+ enough room for additional bits in page->flags.
+
config VM_EVENT_COUNTERS
default y
bool "Enable VM event counters for /proc/vmstat" if EXPERT
@@ -1044,7 +1078,7 @@ config GUP_TEST
comment "GUP_TEST needs to have DEBUG_FS enabled"
depends on !GUP_TEST && !DEBUG_FS
-config GUP_GET_PTE_LOW_HIGH
+config GUP_GET_PXX_LOW_HIGH
bool
config ARCH_HAS_PTE_SPECIAL
@@ -1074,7 +1108,13 @@ config IO_MAPPING
bool
config SECRETMEM
- def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
+ default y
+ bool "Enable memfd_secret() system call" if EXPERT
+ depends on ARCH_HAS_SET_DIRECT_MAP
+ help
+ Enable the memfd_secret() system call with the ability to create
+ memory areas visible only in the context of the owning process and
+ not mapped to other processes and other kernel page tables.
config ANON_VMA_NAME
bool "Anonymous VMA name support"
@@ -1107,23 +1147,42 @@ config HAVE_ARCH_USERFAULTFD_MINOR
help
Arch has userfaultfd minor fault support
-config PTE_MARKER
- bool
-
- help
- Allows to create marker PTEs for file-backed memory.
-
config PTE_MARKER_UFFD_WP
bool "Userfaultfd write protection support for shmem/hugetlbfs"
default y
depends on HAVE_ARCH_USERFAULTFD_WP
- select PTE_MARKER
help
Allows to create marker PTEs for userfaultfd write protection
purposes. It is required to enable userfaultfd write protection on
file-backed memory types like shmem and hugetlbfs.
+# multi-gen LRU {
+config LRU_GEN
+ bool "Multi-Gen LRU"
+ depends on MMU
+ # make sure folio->flags has enough spare bits
+ depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
+ help
+ A high performance LRU implementation to overcommit memory. See
+ Documentation/admin-guide/mm/multigen_lru.rst for details.
+
+config LRU_GEN_ENABLED
+ bool "Enable by default"
+ depends on LRU_GEN
+ help
+ This option enables the multi-gen LRU by default.
+
+config LRU_GEN_STATS
+ bool "Full stats for debugging"
+ depends on LRU_GEN
+ help
+ Do not enable this option unless you plan to look at historical stats
+ from evicted generations for debugging purpose.
+
+ This option has a per-memcg and per-node memory overhead.
+# }
+
source "mm/damon/Kconfig"
endmenu
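
Note (not part of the patch): the SECRETMEM hunk above turns the option into a user-visible prompt for the memfd_secret() system call. A minimal user-space sketch of what that call provides, assuming kernel headers that define __NR_memfd_secret (there is no dedicated libc wrapper assumed here):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* flags may be 0 or FD_CLOEXEC; the fd backs "secret" memory */
	int fd = syscall(__NR_memfd_secret, 0);
	char *p;

	if (fd < 0) {
		perror("memfd_secret");
		return 1;
	}
	if (ftruncate(fd, 4096) < 0) {
		perror("ftruncate");
		return 1;
	}
	/* pages mapped from this fd are removed from the kernel direct map */
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	strcpy(p, "visible only to this process");
	munmap(p, 4096);
	close(fd);
	return 0;
}
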
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index ce8dded36de9..fca699ad1fb0 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -56,7 +56,7 @@ config DEBUG_SLAB
config SLUB_DEBUG
default y
bool "Enable SLUB debugging support" if EXPERT
- depends on SLUB && SYSFS
+ depends on SLUB && SYSFS && !SLUB_TINY
select STACKDEPOT if STACKTRACE_SUPPORT
help
SLUB has extensive debug support features. Disabling these can
diff --git a/mm/Makefile b/mm/Makefile
index 9a564f836403..8e105e5b3e29 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -52,7 +52,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o percpu.o slab_common.o \
- compaction.o vmacache.o \
+ compaction.o \
interval_tree.o list_lru.o workingset.o \
debug.o gup.o mmap_lock.o $(mmu-y)
@@ -89,14 +89,18 @@ obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KASAN) += kasan/
obj-$(CONFIG_KFENCE) += kfence/
+obj-$(CONFIG_KMSAN) += kmsan/
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
+obj-$(CONFIG_NUMA) += memory-tiers.o
obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
-obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
+ifdef CONFIG_SWAP
+obj-$(CONFIG_MEMCG) += swap_cgroup.o
+endif
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_GUP_TEST) += gup_test.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index de65cb1e5f76..a53b9360b72e 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -178,7 +178,26 @@ static ssize_t min_ratio_store(struct device *dev,
return ret;
}
-BDI_SHOW(min_ratio, bdi->min_ratio)
+BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE)
+
+static ssize_t min_ratio_fine_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned int ratio;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &ratio);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_min_ratio_no_scale(bdi, ratio);
+ if (!ret)
+ ret = count;
+
+ return ret;
+}
+BDI_SHOW(min_ratio_fine, bdi->min_ratio)
static ssize_t max_ratio_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
@@ -197,7 +216,82 @@ static ssize_t max_ratio_store(struct device *dev,
return ret;
}
-BDI_SHOW(max_ratio, bdi->max_ratio)
+BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE)
+
+static ssize_t max_ratio_fine_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned int ratio;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &ratio);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_max_ratio_no_scale(bdi, ratio);
+ if (!ret)
+ ret = count;
+
+ return ret;
+}
+BDI_SHOW(max_ratio_fine, bdi->max_ratio)
+
+static ssize_t min_bytes_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+
+ return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi));
+}
+
+static ssize_t min_bytes_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ u64 bytes;
+ ssize_t ret;
+
+ ret = kstrtoull(buf, 10, &bytes);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_min_bytes(bdi, bytes);
+ if (!ret)
+ ret = count;
+
+ return ret;
+}
+DEVICE_ATTR_RW(min_bytes);
+
+static ssize_t max_bytes_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+
+ return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi));
+}
+
+static ssize_t max_bytes_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ u64 bytes;
+ ssize_t ret;
+
+ ret = kstrtoull(buf, 10, &bytes);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_max_bytes(bdi, bytes);
+ if (!ret)
+ ret = count;
+
+ return ret;
+}
+DEVICE_ATTR_RW(max_bytes);
static ssize_t stable_pages_required_show(struct device *dev,
struct device_attribute *attr,
@@ -209,11 +303,44 @@ static ssize_t stable_pages_required_show(struct device *dev,
}
static DEVICE_ATTR_RO(stable_pages_required);
+static ssize_t strict_limit_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned int strict_limit;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &strict_limit);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_strict_limit(bdi, strict_limit);
+ if (!ret)
+ ret = count;
+
+ return ret;
+}
+
+static ssize_t strict_limit_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+
+ return sysfs_emit(buf, "%d\n",
+ !!(bdi->capabilities & BDI_CAP_STRICTLIMIT));
+}
+static DEVICE_ATTR_RW(strict_limit);
+
static struct attribute *bdi_dev_attrs[] = {
&dev_attr_read_ahead_kb.attr,
&dev_attr_min_ratio.attr,
+ &dev_attr_min_ratio_fine.attr,
&dev_attr_max_ratio.attr,
+ &dev_attr_max_ratio_fine.attr,
+ &dev_attr_min_bytes.attr,
+ &dev_attr_max_bytes.attr,
&dev_attr_stable_pages_required.attr,
+ &dev_attr_strict_limit.attr,
NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);
@@ -776,21 +903,17 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
int bdi_init(struct backing_dev_info *bdi)
{
- int ret;
-
bdi->dev = NULL;
kref_init(&bdi->refcnt);
bdi->min_ratio = 0;
- bdi->max_ratio = 100;
+ bdi->max_ratio = 100 * BDI_RATIO_SCALE;
bdi->max_prop_frac = FPROP_FRAC_BASE;
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
- ret = cgwb_bdi_init(bdi);
-
- return ret;
+ return cgwb_bdi_init(bdi);
}
struct backing_dev_info *bdi_alloc(int node_id)
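
Note (not part of the patch): the attributes added to bdi_dev_attrs[] above surface as files under /sys/class/bdi/<device>/. A rough user-space sketch of driving the new knobs; the device name "8:0" is only an illustrative example:

#include <stdio.h>

static int bdi_write(const char *attr, const char *val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/class/bdi/8:0/%s", attr);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	bdi_write("max_bytes", "104857600"); /* cap dirty memory for this BDI at 100 MiB */
	bdi_write("strict_limit", "1");      /* sets BDI_CAP_STRICTLIMIT, per strict_limit_store() */
	return 0;
}
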
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index c3ffe253e055..602fff89b15f 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -163,11 +163,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
{
struct dentry *tmp;
- char name[CMA_MAX_NAME];
- scnprintf(name, sizeof(name), "cma-%s", cma->name);
-
- tmp = debugfs_create_dir(name, root_dentry);
+ tmp = debugfs_create_dir(cma->name, root_dentry);
debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops);
debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops);
diff --git a/mm/compaction.c b/mm/compaction.c
index 640fa76228dd..ca1603524bbe 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -52,8 +52,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
#define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order))
#define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order))
-#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order)
-#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order)
/*
* Page order with-respect-to which proactive compaction
@@ -404,7 +402,7 @@ static bool test_and_set_skip(struct compact_control *cc, struct page *page,
if (cc->ignore_skip_hint)
return false;
- if (!IS_ALIGNED(pfn, pageblock_nr_pages))
+ if (!pageblock_aligned(pfn))
return false;
skip = get_pageblock_skip(page);
@@ -886,7 +884,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* COMPACT_CLUSTER_MAX at a time so the second call must
* not falsely conclude that the block should be skipped.
*/
- if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
+ if (!valid_page && pageblock_aligned(low_pfn)) {
if (!isolation_suitable(cc, page)) {
low_pfn = end_pfn;
page = NULL;
@@ -987,28 +985,28 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
/*
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
+ */
+ if (unlikely(!get_page_unless_zero(page)))
+ goto isolate_fail;
+
+ /*
* Migration will fail if an anonymous page is pinned in memory,
* so avoid taking lru_lock and isolating it unnecessarily in an
* admittedly racy check.
*/
mapping = page_mapping(page);
- if (!mapping && page_count(page) > page_mapcount(page))
- goto isolate_fail;
+ if (!mapping && (page_count(page) - 1) > total_mapcount(page))
+ goto isolate_fail_put;
/*
* Only allow to migrate anonymous pages in GFP_NOFS context
* because those do not depend on fs locks.
*/
if (!(cc->gfp_mask & __GFP_FS) && mapping)
- goto isolate_fail;
-
- /*
- * Be careful not to clear PageLRU until after we're
- * sure the page is not being freed elsewhere -- the
- * page release code relies on it.
- */
- if (unlikely(!get_page_unless_zero(page)))
- goto isolate_fail;
+ goto isolate_fail_put;
/* Only take pages on LRU: a check now makes later tests safe */
if (!PageLRU(page))
@@ -1346,7 +1344,7 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage)
}
static void
-fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated)
+fast_isolate_around(struct compact_control *cc, unsigned long pfn)
{
unsigned long start_pfn, end_pfn;
struct page *page;
@@ -1367,21 +1365,13 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long
if (!page)
return;
- /* Scan before */
- if (start_pfn != pfn) {
- isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false);
- if (cc->nr_freepages >= cc->nr_migratepages)
- return;
- }
-
- /* Scan after */
- start_pfn = pfn + nr_isolated;
- if (start_pfn < end_pfn)
- isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
+ isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
/* Skip this pageblock in the future as it's full or nearly full */
if (cc->nr_freepages < cc->nr_migratepages)
set_pageblock_skip(page);
+
+ return;
}
/* Search orders in round-robin fashion */
@@ -1558,7 +1548,7 @@ fast_isolate_freepages(struct compact_control *cc)
return cc->free_pfn;
low_pfn = page_to_pfn(page);
- fast_isolate_around(cc, low_pfn, nr_isolated);
+ fast_isolate_around(cc, low_pfn);
return low_pfn;
}
@@ -1727,11 +1717,7 @@ typedef enum {
* Allow userspace to control policy on scanning the unevictable LRU for
* compactable pages.
*/
-#ifdef CONFIG_PREEMPT_RT
-int sysctl_compact_unevictable_allowed __read_mostly = 0;
-#else
-int sysctl_compact_unevictable_allowed __read_mostly = 1;
-#endif
+int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT;
static inline void
update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
@@ -1853,7 +1839,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
pfn = cc->zone->zone_start_pfn;
cc->fast_search_fail = 0;
found_block = true;
- set_pageblock_skip(freepage);
break;
}
}
@@ -1939,7 +1924,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
* before making it "skip" so other compaction instances do
* not scan the same block.
*/
- if (IS_ALIGNED(low_pfn, pageblock_nr_pages) &&
+ if (pageblock_aligned(low_pfn) &&
!fast_find_block && !isolation_suitable(cc, page))
continue;
@@ -1981,9 +1966,21 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
+/*
+ * Determine whether kswapd is (or recently was!) running on this node.
+ *
+ * pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't
+ * zero it.
+ */
static bool kswapd_is_running(pg_data_t *pgdat)
{
- return pgdat->kswapd && task_is_running(pgdat->kswapd);
+ bool running;
+
+ pgdat_kswapd_lock(pgdat);
+ running = pgdat->kswapd && task_is_running(pgdat->kswapd);
+ pgdat_kswapd_unlock(pgdat);
+
+ return running;
}
/*
@@ -2113,7 +2110,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
* migration source is unmovable/reclaimable but it's not worth
* special casing.
*/
- if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
+ if (!pageblock_aligned(cc->migrate_pfn))
return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
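
Note (not part of the patch): the conversions above replace open-coded alignment checks with generic pageblock helpers, and drop the local pageblock_start_pfn()/pageblock_end_pfn() defines removed at the top of this file. Roughly, as a sketch (not copied from include/linux/pageblock-flags.h), the generic helpers are equivalent to:

#define pageblock_aligned(pfn)    IS_ALIGNED((pfn), pageblock_nr_pages)
#define pageblock_start_pfn(pfn)  round_down((pfn), pageblock_nr_pages)
#define pageblock_end_pfn(pfn)    ALIGN((pfn) + 1, pageblock_nr_pages)
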
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 66265e3a9c65..7821fcb3f258 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -68,6 +68,9 @@ config DAMON_DBGFS
If unsure, say N.
+ This will be removed after >5.15.y LTS kernel is released, so users
+ should move to the sysfs interface (DAMON_SYSFS).
+
config DAMON_DBGFS_KUNIT_TEST
bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS
depends on DAMON_DBGFS && KUNIT=y
diff --git a/mm/damon/Makefile b/mm/damon/Makefile
index 3e6b8ad73858..f7add3f4aa79 100644
--- a/mm/damon/Makefile
+++ b/mm/damon/Makefile
@@ -3,7 +3,7 @@
obj-y := core.o
obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o
obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o
-obj-$(CONFIG_DAMON_SYSFS) += sysfs.o
+obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o
obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o
-obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o
-obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o
+obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o
+obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index 573669566f84..3db9b7368756 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -126,7 +126,7 @@ static void damon_test_split_at(struct kunit *test)
t = damon_new_target();
r = damon_new_region(0, 100);
damon_add_region(r, t);
- damon_split_region_at(c, t, r, 25);
+ damon_split_region_at(t, r, 25);
KUNIT_EXPECT_EQ(test, r->ar.start, 0ul);
KUNIT_EXPECT_EQ(test, r->ar.end, 25ul);
@@ -219,14 +219,14 @@ static void damon_test_split_regions_of(struct kunit *test)
t = damon_new_target();
r = damon_new_region(0, 22);
damon_add_region(r, t);
- damon_split_regions_of(c, t, 2);
+ damon_split_regions_of(t, 2);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u);
damon_free_target(t);
t = damon_new_target();
r = damon_new_region(0, 220);
damon_add_region(r, t);
- damon_split_regions_of(c, t, 4);
+ damon_split_regions_of(t, 4);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u);
damon_free_target(t);
damon_destroy_ctx(c);
@@ -267,6 +267,28 @@ static void damon_test_ops_registration(struct kunit *test)
KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL);
}
+static void damon_test_set_regions(struct kunit *test)
+{
+ struct damon_target *t = damon_new_target();
+ struct damon_region *r1 = damon_new_region(4, 16);
+ struct damon_region *r2 = damon_new_region(24, 32);
+ struct damon_addr_range range = {.start = 8, .end = 28};
+ unsigned long expects[] = {8, 16, 16, 24, 24, 28};
+ int expect_idx = 0;
+ struct damon_region *r;
+
+ damon_add_region(r1, t);
+ damon_add_region(r2, t);
+ damon_set_regions(t, &range, 1);
+
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3);
+ damon_for_each_region(r, t) {
+ KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]);
+ KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]);
+ }
+ damon_destroy_target(t);
+}
+
static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_target),
KUNIT_CASE(damon_test_regions),
@@ -276,6 +298,7 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_merge_regions_of),
KUNIT_CASE(damon_test_split_regions_of),
KUNIT_CASE(damon_test_ops_registration),
+ KUNIT_CASE(damon_test_set_regions),
{},
};
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 7d25dc582fe3..ceec75b88ef9 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -29,6 +29,8 @@ static bool running_exclusive_ctxs;
static DEFINE_MUTEX(damon_ops_lock);
static struct damon_operations damon_registered_ops[NR_DAMON_OPS];
+static struct kmem_cache *damon_region_cache __ro_after_init;
+
/* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */
static bool __damon_is_registered_ops(enum damon_ops_id id)
{
@@ -119,7 +121,7 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end)
{
struct damon_region *region;
- region = kmalloc(sizeof(*region), GFP_KERNEL);
+ region = kmem_cache_alloc(damon_region_cache, GFP_KERNEL);
if (!region)
return NULL;
@@ -148,7 +150,7 @@ static void damon_del_region(struct damon_region *r, struct damon_target *t)
static void damon_free_region(struct damon_region *r)
{
- kfree(r);
+ kmem_cache_free(damon_region_cache, r);
}
void damon_destroy_region(struct damon_region *r, struct damon_target *t)
@@ -169,6 +171,30 @@ static bool damon_intersect(struct damon_region *r,
}
/*
+ * Fill holes in regions with new regions.
+ */
+static int damon_fill_regions_holes(struct damon_region *first,
+ struct damon_region *last, struct damon_target *t)
+{
+ struct damon_region *r = first;
+
+ damon_for_each_region_from(r, t) {
+ struct damon_region *next, *newr;
+
+ if (r == last)
+ break;
+ next = damon_next_region(r);
+ if (r->ar.end != next->ar.start) {
+ newr = damon_new_region(r->ar.end, next->ar.start);
+ if (!newr)
+ return -ENOMEM;
+ damon_insert_region(newr, r, next, t);
+ }
+ }
+ return 0;
+}
+
+/*
* damon_set_regions() - Set regions of a target for given address ranges.
* @t: the given target.
* @ranges: array of new monitoring target ranges.
@@ -184,6 +210,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
{
struct damon_region *r, *next;
unsigned int i;
+ int err;
/* Remove regions which are not in the new ranges */
damon_for_each_region_safe(r, next, t) {
@@ -195,6 +222,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
damon_destroy_region(r, t);
}
+ r = damon_first_region(t);
/* Add new regions or resize existing regions to fit in the ranges */
for (i = 0; i < nr_ranges; i++) {
struct damon_region *first = NULL, *last, *newr;
@@ -202,7 +230,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
range = &ranges[i];
/* Get the first/last regions intersecting with the range */
- damon_for_each_region(r, t) {
+ damon_for_each_region_from(r, t) {
if (damon_intersect(r, range)) {
if (!first)
first = r;
@@ -225,52 +253,46 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
first->ar.start = ALIGN_DOWN(range->start,
DAMON_MIN_REGION);
last->ar.end = ALIGN(range->end, DAMON_MIN_REGION);
+
+ /* fill possible holes in the range */
+ err = damon_fill_regions_holes(first, last, t);
+ if (err)
+ return err;
}
}
return 0;
}
-struct damos *damon_new_scheme(
- unsigned long min_sz_region, unsigned long max_sz_region,
- unsigned int min_nr_accesses, unsigned int max_nr_accesses,
- unsigned int min_age_region, unsigned int max_age_region,
- enum damos_action action, struct damos_quota *quota,
- struct damos_watermarks *wmarks)
+/* initialize private fields of damos_quota and return the pointer */
+static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota)
+{
+ quota->total_charged_sz = 0;
+ quota->total_charged_ns = 0;
+ quota->esz = 0;
+ quota->charged_sz = 0;
+ quota->charged_from = 0;
+ quota->charge_target_from = NULL;
+ quota->charge_addr_from = 0;
+ return quota;
+}
+
+struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
+ enum damos_action action, struct damos_quota *quota,
+ struct damos_watermarks *wmarks)
{
struct damos *scheme;
scheme = kmalloc(sizeof(*scheme), GFP_KERNEL);
if (!scheme)
return NULL;
- scheme->min_sz_region = min_sz_region;
- scheme->max_sz_region = max_sz_region;
- scheme->min_nr_accesses = min_nr_accesses;
- scheme->max_nr_accesses = max_nr_accesses;
- scheme->min_age_region = min_age_region;
- scheme->max_age_region = max_age_region;
+ scheme->pattern = *pattern;
scheme->action = action;
scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
- scheme->quota.ms = quota->ms;
- scheme->quota.sz = quota->sz;
- scheme->quota.reset_interval = quota->reset_interval;
- scheme->quota.weight_sz = quota->weight_sz;
- scheme->quota.weight_nr_accesses = quota->weight_nr_accesses;
- scheme->quota.weight_age = quota->weight_age;
- scheme->quota.total_charged_sz = 0;
- scheme->quota.total_charged_ns = 0;
- scheme->quota.esz = 0;
- scheme->quota.charged_sz = 0;
- scheme->quota.charged_from = 0;
- scheme->quota.charge_target_from = NULL;
- scheme->quota.charge_addr_from = 0;
-
- scheme->wmarks.metric = wmarks->metric;
- scheme->wmarks.interval = wmarks->interval;
- scheme->wmarks.high = wmarks->high;
- scheme->wmarks.mid = wmarks->mid;
- scheme->wmarks.low = wmarks->low;
+ scheme->quota = *(damos_quota_init_priv(quota));
+
+ scheme->wmarks = *wmarks;
scheme->wmarks.activated = true;
return scheme;
@@ -313,6 +335,7 @@ struct damon_target *damon_new_target(void)
t->pid = NULL;
t->nr_regions = 0;
INIT_LIST_HEAD(&t->regions_list);
+ INIT_LIST_HEAD(&t->list);
return t;
}
@@ -360,17 +383,17 @@ struct damon_ctx *damon_new_ctx(void)
if (!ctx)
return NULL;
- ctx->sample_interval = 5 * 1000;
- ctx->aggr_interval = 100 * 1000;
- ctx->ops_update_interval = 60 * 1000 * 1000;
+ ctx->attrs.sample_interval = 5 * 1000;
+ ctx->attrs.aggr_interval = 100 * 1000;
+ ctx->attrs.ops_update_interval = 60 * 1000 * 1000;
ktime_get_coarse_ts64(&ctx->last_aggregation);
ctx->last_ops_update = ctx->last_aggregation;
mutex_init(&ctx->kdamond_lock);
- ctx->min_nr_regions = 10;
- ctx->max_nr_regions = 1000;
+ ctx->attrs.min_nr_regions = 10;
+ ctx->attrs.max_nr_regions = 1000;
INIT_LIST_HEAD(&ctx->adaptive_targets);
INIT_LIST_HEAD(&ctx->schemes);
@@ -406,32 +429,21 @@ void damon_destroy_ctx(struct damon_ctx *ctx)
/**
* damon_set_attrs() - Set attributes for the monitoring.
* @ctx: monitoring context
- * @sample_int: time interval between samplings
- * @aggr_int: time interval between aggregations
- * @ops_upd_int: time interval between monitoring operations updates
- * @min_nr_reg: minimal number of regions
- * @max_nr_reg: maximum number of regions
+ * @attrs: monitoring attributes
*
* This function should not be called while the kdamond is running.
* Every time interval is in micro-seconds.
*
* Return: 0 on success, negative error code otherwise.
*/
-int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
- unsigned long aggr_int, unsigned long ops_upd_int,
- unsigned long min_nr_reg, unsigned long max_nr_reg)
+int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
{
- if (min_nr_reg < 3)
+ if (attrs->min_nr_regions < 3)
return -EINVAL;
- if (min_nr_reg > max_nr_reg)
+ if (attrs->min_nr_regions > attrs->max_nr_regions)
return -EINVAL;
- ctx->sample_interval = sample_int;
- ctx->aggr_interval = aggr_int;
- ctx->ops_update_interval = ops_upd_int;
- ctx->min_nr_regions = min_nr_reg;
- ctx->max_nr_regions = max_nr_reg;
-
+ ctx->attrs = *attrs;
return 0;
}
@@ -443,10 +455,8 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
*
* This function should not be called while the kdamond of the context is
* running.
- *
- * Return: 0 if success, or negative error code otherwise.
*/
-int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes,
+void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes,
ssize_t nr_schemes)
{
struct damos *s, *next;
@@ -456,7 +466,6 @@ int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes,
damon_destroy_scheme(s);
for (i = 0; i < nr_schemes; i++)
damon_add_scheme(ctx, schemes[i]);
- return 0;
}
/**
@@ -482,11 +491,11 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t)
- sz += r->ar.end - r->ar.start;
+ sz += damon_sz_region(r);
}
- if (ctx->min_nr_regions)
- sz /= ctx->min_nr_regions;
+ if (ctx->attrs.min_nr_regions)
+ sz /= ctx->attrs.min_nr_regions;
if (sz < DAMON_MIN_REGION)
sz = DAMON_MIN_REGION;
@@ -635,7 +644,7 @@ static bool damon_check_reset_time_interval(struct timespec64 *baseline,
static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx)
{
return damon_check_reset_time_interval(&ctx->last_aggregation,
- ctx->aggr_interval);
+ ctx->attrs.aggr_interval);
}
/*
@@ -658,19 +667,20 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
}
}
-static void damon_split_region_at(struct damon_ctx *ctx,
- struct damon_target *t, struct damon_region *r,
- unsigned long sz_r);
+static void damon_split_region_at(struct damon_target *t,
+ struct damon_region *r, unsigned long sz_r);
static bool __damos_valid_target(struct damon_region *r, struct damos *s)
{
unsigned long sz;
- sz = r->ar.end - r->ar.start;
- return s->min_sz_region <= sz && sz <= s->max_sz_region &&
- s->min_nr_accesses <= r->nr_accesses &&
- r->nr_accesses <= s->max_nr_accesses &&
- s->min_age_region <= r->age && r->age <= s->max_age_region;
+ sz = damon_sz_region(r);
+ return s->pattern.min_sz_region <= sz &&
+ sz <= s->pattern.max_sz_region &&
+ s->pattern.min_nr_accesses <= r->nr_accesses &&
+ r->nr_accesses <= s->pattern.max_nr_accesses &&
+ s->pattern.min_age_region <= r->age &&
+ r->age <= s->pattern.max_age_region;
}
static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
@@ -684,6 +694,115 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score;
}
+/*
+ * damos_skip_charged_region() - Check if the given region or starting part of
+ * it is already charged for the DAMOS quota.
+ * @t: The target of the region.
+ * @rp: The pointer to the region.
+ * @s: The scheme to be applied.
+ *
+ * If a quota of a scheme has exceeded in a quota charge window, the scheme's
+ * action would applied to only a part of the target access pattern fulfilling
+ * regions. To avoid applying the scheme action to only already applied
+ * regions, DAMON skips applying the scheme action to the regions that charged
+ * in the previous charge window.
+ *
+ * This function checks if a given region should be skipped or not for the
+ * reason. If only the starting part of the region has previously charged,
+ * this function splits the region into two so that the second one covers the
+ * area that not charged in the previous charge window and saves the second

+ * region in *rp and returns false, so that the caller can apply DAMON action
+ * to the second one.
+ *
+ * Return: true if the region should be entirely skipped, false otherwise.
+ */
+static bool damos_skip_charged_region(struct damon_target *t,
+ struct damon_region **rp, struct damos *s)
+{
+ struct damon_region *r = *rp;
+ struct damos_quota *quota = &s->quota;
+ unsigned long sz_to_skip;
+
+ /* Skip previously charged regions */
+ if (quota->charge_target_from) {
+ if (t != quota->charge_target_from)
+ return true;
+ if (r == damon_last_region(t)) {
+ quota->charge_target_from = NULL;
+ quota->charge_addr_from = 0;
+ return true;
+ }
+ if (quota->charge_addr_from &&
+ r->ar.end <= quota->charge_addr_from)
+ return true;
+
+ if (quota->charge_addr_from && r->ar.start <
+ quota->charge_addr_from) {
+ sz_to_skip = ALIGN_DOWN(quota->charge_addr_from -
+ r->ar.start, DAMON_MIN_REGION);
+ if (!sz_to_skip) {
+ if (damon_sz_region(r) <= DAMON_MIN_REGION)
+ return true;
+ sz_to_skip = DAMON_MIN_REGION;
+ }
+ damon_split_region_at(t, r, sz_to_skip);
+ r = damon_next_region(r);
+ *rp = r;
+ }
+ quota->charge_target_from = NULL;
+ quota->charge_addr_from = 0;
+ }
+ return false;
+}
+
+static void damos_update_stat(struct damos *s,
+ unsigned long sz_tried, unsigned long sz_applied)
+{
+ s->stat.nr_tried++;
+ s->stat.sz_tried += sz_tried;
+ if (sz_applied)
+ s->stat.nr_applied++;
+ s->stat.sz_applied += sz_applied;
+}
+
+static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
+ struct damon_region *r, struct damos *s)
+{
+ struct damos_quota *quota = &s->quota;
+ unsigned long sz = damon_sz_region(r);
+ struct timespec64 begin, end;
+ unsigned long sz_applied = 0;
+ int err = 0;
+
+ if (c->ops.apply_scheme) {
+ if (quota->esz && quota->charged_sz + sz > quota->esz) {
+ sz = ALIGN_DOWN(quota->esz - quota->charged_sz,
+ DAMON_MIN_REGION);
+ if (!sz)
+ goto update_stat;
+ damon_split_region_at(t, r, sz);
+ }
+ ktime_get_coarse_ts64(&begin);
+ if (c->callback.before_damos_apply)
+ err = c->callback.before_damos_apply(c, t, r, s);
+ if (!err)
+ sz_applied = c->ops.apply_scheme(c, t, r, s);
+ ktime_get_coarse_ts64(&end);
+ quota->total_charged_ns += timespec64_to_ns(&end) -
+ timespec64_to_ns(&begin);
+ quota->charged_sz += sz;
+ if (quota->esz && quota->charged_sz >= quota->esz) {
+ quota->charge_target_from = t;
+ quota->charge_addr_from = r->ar.end + 1;
+ }
+ }
+ if (s->action != DAMOS_STAT)
+ r->age = 0;
+
+update_stat:
+ damos_update_stat(s, sz, sz_applied);
+}
+
static void damon_do_apply_schemes(struct damon_ctx *c,
struct damon_target *t,
struct damon_region *r)
@@ -692,9 +811,6 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
damon_for_each_scheme(s, c) {
struct damos_quota *quota = &s->quota;
- unsigned long sz = r->ar.end - r->ar.start;
- struct timespec64 begin, end;
- unsigned long sz_applied = 0;
if (!s->wmarks.activated)
continue;
@@ -703,70 +819,13 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
if (quota->esz && quota->charged_sz >= quota->esz)
continue;
- /* Skip previously charged regions */
- if (quota->charge_target_from) {
- if (t != quota->charge_target_from)
- continue;
- if (r == damon_last_region(t)) {
- quota->charge_target_from = NULL;
- quota->charge_addr_from = 0;
- continue;
- }
- if (quota->charge_addr_from &&
- r->ar.end <= quota->charge_addr_from)
- continue;
-
- if (quota->charge_addr_from && r->ar.start <
- quota->charge_addr_from) {
- sz = ALIGN_DOWN(quota->charge_addr_from -
- r->ar.start, DAMON_MIN_REGION);
- if (!sz) {
- if (r->ar.end - r->ar.start <=
- DAMON_MIN_REGION)
- continue;
- sz = DAMON_MIN_REGION;
- }
- damon_split_region_at(c, t, r, sz);
- r = damon_next_region(r);
- sz = r->ar.end - r->ar.start;
- }
- quota->charge_target_from = NULL;
- quota->charge_addr_from = 0;
- }
+ if (damos_skip_charged_region(t, &r, s))
+ continue;
if (!damos_valid_target(c, t, r, s))
continue;
- /* Apply the scheme */
- if (c->ops.apply_scheme) {
- if (quota->esz &&
- quota->charged_sz + sz > quota->esz) {
- sz = ALIGN_DOWN(quota->esz - quota->charged_sz,
- DAMON_MIN_REGION);
- if (!sz)
- goto update_stat;
- damon_split_region_at(c, t, r, sz);
- }
- ktime_get_coarse_ts64(&begin);
- sz_applied = c->ops.apply_scheme(c, t, r, s);
- ktime_get_coarse_ts64(&end);
- quota->total_charged_ns += timespec64_to_ns(&end) -
- timespec64_to_ns(&begin);
- quota->charged_sz += sz;
- if (quota->esz && quota->charged_sz >= quota->esz) {
- quota->charge_target_from = t;
- quota->charge_addr_from = r->ar.end + 1;
- }
- }
- if (s->action != DAMOS_STAT)
- r->age = 0;
-
-update_stat:
- s->stat.nr_tried++;
- s->stat.sz_tried += sz;
- if (sz_applied)
- s->stat.nr_applied++;
- s->stat.sz_applied += sz_applied;
+ damos_apply_scheme(c, t, r, s);
}
}
@@ -793,60 +852,64 @@ static void damos_set_effective_quota(struct damos_quota *quota)
quota->esz = esz;
}
-static void kdamond_apply_schemes(struct damon_ctx *c)
+static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
{
+ struct damos_quota *quota = &s->quota;
struct damon_target *t;
- struct damon_region *r, *next_r;
- struct damos *s;
+ struct damon_region *r;
+ unsigned long cumulated_sz;
+ unsigned int score, max_score = 0;
- damon_for_each_scheme(s, c) {
- struct damos_quota *quota = &s->quota;
- unsigned long cumulated_sz;
- unsigned int score, max_score = 0;
+ if (!quota->ms && !quota->sz)
+ return;
- if (!s->wmarks.activated)
- continue;
+ /* New charge window starts */
+ if (time_after_eq(jiffies, quota->charged_from +
+ msecs_to_jiffies(quota->reset_interval))) {
+ if (quota->esz && quota->charged_sz >= quota->esz)
+ s->stat.qt_exceeds++;
+ quota->total_charged_sz += quota->charged_sz;
+ quota->charged_from = jiffies;
+ quota->charged_sz = 0;
+ damos_set_effective_quota(quota);
+ }
- if (!quota->ms && !quota->sz)
- continue;
+ if (!c->ops.get_scheme_score)
+ return;
- /* New charge window starts */
- if (time_after_eq(jiffies, quota->charged_from +
- msecs_to_jiffies(
- quota->reset_interval))) {
- if (quota->esz && quota->charged_sz >= quota->esz)
- s->stat.qt_exceeds++;
- quota->total_charged_sz += quota->charged_sz;
- quota->charged_from = jiffies;
- quota->charged_sz = 0;
- damos_set_effective_quota(quota);
+ /* Fill up the score histogram */
+ memset(quota->histogram, 0, sizeof(quota->histogram));
+ damon_for_each_target(t, c) {
+ damon_for_each_region(r, t) {
+ if (!__damos_valid_target(r, s))
+ continue;
+ score = c->ops.get_scheme_score(c, t, r, s);
+ quota->histogram[score] += damon_sz_region(r);
+ if (score > max_score)
+ max_score = score;
}
+ }
- if (!c->ops.get_scheme_score)
- continue;
+ /* Set the min score limit */
+ for (cumulated_sz = 0, score = max_score; ; score--) {
+ cumulated_sz += quota->histogram[score];
+ if (cumulated_sz >= quota->esz || !score)
+ break;
+ }
+ quota->min_score = score;
+}
- /* Fill up the score histogram */
- memset(quota->histogram, 0, sizeof(quota->histogram));
- damon_for_each_target(t, c) {
- damon_for_each_region(r, t) {
- if (!__damos_valid_target(r, s))
- continue;
- score = c->ops.get_scheme_score(
- c, t, r, s);
- quota->histogram[score] +=
- r->ar.end - r->ar.start;
- if (score > max_score)
- max_score = score;
- }
- }
+static void kdamond_apply_schemes(struct damon_ctx *c)
+{
+ struct damon_target *t;
+ struct damon_region *r, *next_r;
+ struct damos *s;
- /* Set the min score limit */
- for (cumulated_sz = 0, score = max_score; ; score--) {
- cumulated_sz += quota->histogram[score];
- if (cumulated_sz >= quota->esz || !score)
- break;
- }
- quota->min_score = score;
+ damon_for_each_scheme(s, c) {
+ if (!s->wmarks.activated)
+ continue;
+
+ damos_adjust_quota(c, s);
}
damon_for_each_target(t, c) {
@@ -855,18 +918,13 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
}
}
-static inline unsigned long sz_damon_region(struct damon_region *r)
-{
- return r->ar.end - r->ar.start;
-}
-
/*
* Merge two adjacent regions into one region
*/
static void damon_merge_two_regions(struct damon_target *t,
struct damon_region *l, struct damon_region *r)
{
- unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r);
+ unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r);
l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
(sz_l + sz_r);
@@ -895,7 +953,7 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
if (prev && prev->ar.end == r->ar.start &&
abs(prev->nr_accesses - r->nr_accesses) <= thres &&
- sz_damon_region(prev) + sz_damon_region(r) <= sz_limit)
+ damon_sz_region(prev) + damon_sz_region(r) <= sz_limit)
damon_merge_two_regions(t, prev, r);
else
prev = r;
@@ -928,9 +986,8 @@ static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold,
* r the region to be split
* sz_r size of the first sub-region that will be made
*/
-static void damon_split_region_at(struct damon_ctx *ctx,
- struct damon_target *t, struct damon_region *r,
- unsigned long sz_r)
+static void damon_split_region_at(struct damon_target *t,
+ struct damon_region *r, unsigned long sz_r)
{
struct damon_region *new;
@@ -947,15 +1004,14 @@ static void damon_split_region_at(struct damon_ctx *ctx,
}
/* Split every region in the given target into 'nr_subs' regions */
-static void damon_split_regions_of(struct damon_ctx *ctx,
- struct damon_target *t, int nr_subs)
+static void damon_split_regions_of(struct damon_target *t, int nr_subs)
{
struct damon_region *r, *next;
unsigned long sz_region, sz_sub = 0;
int i;
damon_for_each_region_safe(r, next, t) {
- sz_region = r->ar.end - r->ar.start;
+ sz_region = damon_sz_region(r);
for (i = 0; i < nr_subs - 1 &&
sz_region > 2 * DAMON_MIN_REGION; i++) {
@@ -969,7 +1025,7 @@ static void damon_split_regions_of(struct damon_ctx *ctx,
if (sz_sub == 0 || sz_sub >= sz_region)
continue;
- damon_split_region_at(ctx, t, r, sz_sub);
+ damon_split_region_at(t, r, sz_sub);
sz_region = sz_sub;
}
}
@@ -995,16 +1051,16 @@ static void kdamond_split_regions(struct damon_ctx *ctx)
damon_for_each_target(t, ctx)
nr_regions += damon_nr_regions(t);
- if (nr_regions > ctx->max_nr_regions / 2)
+ if (nr_regions > ctx->attrs.max_nr_regions / 2)
return;
/* Maybe the middle of the region has different access frequency */
if (last_nr_regions == nr_regions &&
- nr_regions < ctx->max_nr_regions / 3)
+ nr_regions < ctx->attrs.max_nr_regions / 3)
nr_subregions = 3;
damon_for_each_target(t, ctx)
- damon_split_regions_of(ctx, t, nr_subregions);
+ damon_split_regions_of(t, nr_subregions);
last_nr_regions = nr_regions;
}
@@ -1018,7 +1074,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx)
static bool kdamond_need_update_operations(struct damon_ctx *ctx)
{
return damon_check_reset_time_interval(&ctx->last_ops_update,
- ctx->ops_update_interval);
+ ctx->attrs.ops_update_interval);
}
/*
@@ -1142,32 +1198,27 @@ static int kdamond_fn(void *data)
struct damon_region *r, *next;
unsigned int max_nr_accesses = 0;
unsigned long sz_limit = 0;
- bool done = false;
pr_debug("kdamond (%d) starts\n", current->pid);
if (ctx->ops.init)
ctx->ops.init(ctx);
if (ctx->callback.before_start && ctx->callback.before_start(ctx))
- done = true;
+ goto done;
sz_limit = damon_region_sz_limit(ctx);
- while (!kdamond_need_stop(ctx) && !done) {
- if (kdamond_wait_activation(ctx)) {
- done = true;
- continue;
- }
+ while (!kdamond_need_stop(ctx)) {
+ if (kdamond_wait_activation(ctx))
+ break;
if (ctx->ops.prepare_access_checks)
ctx->ops.prepare_access_checks(ctx);
if (ctx->callback.after_sampling &&
- ctx->callback.after_sampling(ctx)) {
- done = true;
- continue;
- }
+ ctx->callback.after_sampling(ctx))
+ break;
- kdamond_usleep(ctx->sample_interval);
+ kdamond_usleep(ctx->attrs.sample_interval);
if (ctx->ops.check_accesses)
max_nr_accesses = ctx->ops.check_accesses(ctx);
@@ -1177,10 +1228,8 @@ static int kdamond_fn(void *data)
max_nr_accesses / 10,
sz_limit);
if (ctx->callback.after_aggregation &&
- ctx->callback.after_aggregation(ctx)) {
- done = true;
- continue;
- }
+ ctx->callback.after_aggregation(ctx))
+ break;
kdamond_apply_schemes(ctx);
kdamond_reset_aggregated(ctx);
kdamond_split_regions(ctx);
@@ -1194,6 +1243,7 @@ static int kdamond_fn(void *data)
sz_limit = damon_region_sz_limit(ctx);
}
}
+done:
damon_for_each_target(t, ctx) {
damon_for_each_region_safe(r, next, t)
damon_destroy_region(r, t);
@@ -1218,4 +1268,90 @@ static int kdamond_fn(void *data)
return 0;
}
+/*
+ * struct damon_system_ram_region - System RAM resource address region of
+ * [@start, @end).
+ * @start: Start address of the region (inclusive).
+ * @end: End address of the region (exclusive).
+ */
+struct damon_system_ram_region {
+ unsigned long start;
+ unsigned long end;
+};
+
+static int walk_system_ram(struct resource *res, void *arg)
+{
+ struct damon_system_ram_region *a = arg;
+
+ if (a->end - a->start < resource_size(res)) {
+ a->start = res->start;
+ a->end = res->end;
+ }
+ return 0;
+}
+
+/*
+ * Find biggest 'System RAM' resource and store its start and end address in
+ * @start and @end, respectively. If no System RAM is found, returns false.
+ */
+static bool damon_find_biggest_system_ram(unsigned long *start,
+ unsigned long *end)
+
+{
+ struct damon_system_ram_region arg = {};
+
+ walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram);
+ if (arg.end <= arg.start)
+ return false;
+
+ *start = arg.start;
+ *end = arg.end;
+ return true;
+}
+
+/**
+ * damon_set_region_biggest_system_ram_default() - Set the region of the given
+ * monitoring target as requested, or biggest 'System RAM'.
+ * @t: The monitoring target to set the region.
+ * @start: The pointer to the start address of the region.
+ * @end: The pointer to the end address of the region.
+ *
+ * This function sets the region of @t as requested by @start and @end. If the
+ * values of @start and @end are zero, however, this function finds the biggest
+ * 'System RAM' resource and sets the region to cover the resource. In the
+ * latter case, this function saves the start and end addresses of the resource
+ * in @start and @end, respectively.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damon_set_region_biggest_system_ram_default(struct damon_target *t,
+ unsigned long *start, unsigned long *end)
+{
+ struct damon_addr_range addr_range;
+
+ if (*start > *end)
+ return -EINVAL;
+
+ if (!*start && !*end &&
+ !damon_find_biggest_system_ram(start, end))
+ return -EINVAL;
+
+ addr_range.start = *start;
+ addr_range.end = *end;
+ return damon_set_regions(t, &addr_range, 1);
+}
+
+static int __init damon_init(void)
+{
+ damon_region_cache = KMEM_CACHE(damon_region, 0);
+ if (unlikely(!damon_region_cache)) {
+ pr_err("creating damon_region_cache fails\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+subsys_initcall(damon_init);
+
#include "core-test.h"
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index cfdf63132d5a..b3f454a5c682 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -55,9 +55,9 @@ static ssize_t dbgfs_attrs_read(struct file *file,
mutex_lock(&ctx->kdamond_lock);
ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n",
- ctx->sample_interval, ctx->aggr_interval,
- ctx->ops_update_interval, ctx->min_nr_regions,
- ctx->max_nr_regions);
+ ctx->attrs.sample_interval, ctx->attrs.aggr_interval,
+ ctx->attrs.ops_update_interval,
+ ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions);
mutex_unlock(&ctx->kdamond_lock);
return simple_read_from_buffer(buf, count, ppos, kbuf, ret);
@@ -67,7 +67,7 @@ static ssize_t dbgfs_attrs_write(struct file *file,
const char __user *buf, size_t count, loff_t *ppos)
{
struct damon_ctx *ctx = file->private_data;
- unsigned long s, a, r, minr, maxr;
+ struct damon_attrs attrs;
char *kbuf;
ssize_t ret;
@@ -76,7 +76,10 @@ static ssize_t dbgfs_attrs_write(struct file *file,
return PTR_ERR(kbuf);
if (sscanf(kbuf, "%lu %lu %lu %lu %lu",
- &s, &a, &r, &minr, &maxr) != 5) {
+ &attrs.sample_interval, &attrs.aggr_interval,
+ &attrs.ops_update_interval,
+ &attrs.min_nr_regions,
+ &attrs.max_nr_regions) != 5) {
ret = -EINVAL;
goto out;
}
@@ -87,7 +90,7 @@ static ssize_t dbgfs_attrs_write(struct file *file,
goto unlock_out;
}
- ret = damon_set_attrs(ctx, s, a, r, minr, maxr);
+ ret = damon_set_attrs(ctx, &attrs);
if (!ret)
ret = count;
unlock_out:
@@ -131,9 +134,12 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
damon_for_each_scheme(s, c) {
rc = scnprintf(&buf[written], len - written,
"%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
- s->min_sz_region, s->max_sz_region,
- s->min_nr_accesses, s->max_nr_accesses,
- s->min_age_region, s->max_age_region,
+ s->pattern.min_sz_region,
+ s->pattern.max_sz_region,
+ s->pattern.min_nr_accesses,
+ s->pattern.max_nr_accesses,
+ s->pattern.min_age_region,
+ s->pattern.max_age_region,
damos_action_to_dbgfs_scheme_action(s->action),
s->quota.ms, s->quota.sz,
s->quota.reset_interval,
@@ -221,8 +227,6 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
struct damos *scheme, **schemes;
const int max_nr_schemes = 256;
int pos = 0, parsed, ret;
- unsigned long min_sz, max_sz;
- unsigned int min_nr_a, max_nr_a, min_age, max_age;
unsigned int action_input;
enum damos_action action;
@@ -233,13 +237,18 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
*nr_schemes = 0;
while (pos < len && *nr_schemes < max_nr_schemes) {
+ struct damos_access_pattern pattern = {};
struct damos_quota quota = {};
struct damos_watermarks wmarks;
ret = sscanf(&str[pos],
"%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n",
- &min_sz, &max_sz, &min_nr_a, &max_nr_a,
- &min_age, &max_age, &action_input, &quota.ms,
+ &pattern.min_sz_region, &pattern.max_sz_region,
+ &pattern.min_nr_accesses,
+ &pattern.max_nr_accesses,
+ &pattern.min_age_region,
+ &pattern.max_age_region,
+ &action_input, &quota.ms,
&quota.sz, &quota.reset_interval,
&quota.weight_sz, &quota.weight_nr_accesses,
&quota.weight_age, &wmarks.metric,
@@ -251,7 +260,9 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
if ((int)action < 0)
goto fail;
- if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age)
+ if (pattern.min_sz_region > pattern.max_sz_region ||
+ pattern.min_nr_accesses > pattern.max_nr_accesses ||
+ pattern.min_age_region > pattern.max_age_region)
goto fail;
if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low ||
@@ -259,8 +270,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
goto fail;
pos += parsed;
- scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a,
- min_age, max_age, action, &quota, &wmarks);
+ scheme = damon_new_scheme(&pattern, action, &quota, &wmarks);
if (!scheme)
goto fail;
@@ -297,11 +307,9 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf,
goto unlock_out;
}
- ret = damon_set_schemes(ctx, schemes, nr_schemes);
- if (!ret) {
- ret = count;
- nr_schemes = 0;
- }
+ damon_set_schemes(ctx, schemes, nr_schemes);
+ ret = count;
+ nr_schemes = 0;
unlock_out:
mutex_unlock(&ctx->kdamond_lock);
@@ -882,8 +890,10 @@ out:
static int dbgfs_rm_context(char *name)
{
struct dentry *root, *dir, **new_dirs;
+ struct inode *inode;
struct damon_ctx **new_ctxs;
int i, j;
+ int ret = 0;
if (damon_nr_running_ctxs())
return -EBUSY;
@@ -896,16 +906,24 @@ static int dbgfs_rm_context(char *name)
if (!dir)
return -ENOENT;
+ inode = d_inode(dir);
+ if (!S_ISDIR(inode->i_mode)) {
+ ret = -EINVAL;
+ goto out_dput;
+ }
+
new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs),
GFP_KERNEL);
- if (!new_dirs)
- return -ENOMEM;
+ if (!new_dirs) {
+ ret = -ENOMEM;
+ goto out_dput;
+ }
new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs),
GFP_KERNEL);
if (!new_ctxs) {
- kfree(new_dirs);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_new_dirs;
}
for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) {
@@ -925,7 +943,13 @@ static int dbgfs_rm_context(char *name)
dbgfs_ctxs = new_ctxs;
dbgfs_nr_ctxs--;
- return 0;
+ goto out_dput;
+
+out_new_dirs:
+ kfree(new_dirs);
+out_dput:
+ dput(dir);
+ return ret;
}
static ssize_t dbgfs_rm_context_write(struct file *file,
@@ -1044,7 +1068,7 @@ static int __init __damon_dbgfs_init(void)
fops[i]);
dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]);
- dbgfs_dirs = kmalloc_array(1, sizeof(dbgfs_root), GFP_KERNEL);
+ dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL);
if (!dbgfs_dirs) {
debugfs_remove(dbgfs_root);
return -ENOMEM;
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 9de6f00a71c5..7b8fce2f67a8 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -8,10 +8,10 @@
#define pr_fmt(fmt) "damon-lru-sort: " fmt
#include <linux/damon.h>
-#include <linux/ioport.h>
+#include <linux/kstrtox.h>
#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/workqueue.h>
+
+#include "modules-common.h"
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
@@ -63,109 +63,35 @@ module_param(hot_thres_access_freq, ulong, 0600);
static unsigned long cold_min_age __read_mostly = 120000000;
module_param(cold_min_age, ulong, 0600);
-/*
- * Limit of time for trying the LRU lists sorting in milliseconds.
- *
- * DAMON_LRU_SORT tries to use only up to this time within a time window
- * (quota_reset_interval_ms) for trying LRU lists sorting. This can be used
- * for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the
- * limit is disabled.
- *
- * 10 ms by default.
- */
-static unsigned long quota_ms __read_mostly = 10;
-module_param(quota_ms, ulong, 0600);
-
-/*
- * The time quota charge reset interval in milliseconds.
- *
- * The charge reset interval for the quota of time (quota_ms). That is,
- * DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms
- * milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds.
- *
- * 1 second by default.
- */
-static unsigned long quota_reset_interval_ms __read_mostly = 1000;
-module_param(quota_reset_interval_ms, ulong, 0600);
-
-/*
- * The watermarks check time interval in microseconds.
- *
- * Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is
- * enabled but inactive due to its watermarks rule. 5 seconds by default.
- */
-static unsigned long wmarks_interval __read_mostly = 5000000;
-module_param(wmarks_interval, ulong, 0600);
-
-/*
- * Free memory rate (per thousand) for the high watermark.
- *
- * If free memory of the system in bytes per thousand bytes is higher than
- * this, DAMON_LRU_SORT becomes inactive, so it does nothing but periodically
- * checks the watermarks. 200 (20%) by default.
- */
-static unsigned long wmarks_high __read_mostly = 200;
-module_param(wmarks_high, ulong, 0600);
-
-/*
- * Free memory rate (per thousand) for the middle watermark.
- *
- * If free memory of the system in bytes per thousand bytes is between this and
- * the low watermark, DAMON_LRU_SORT becomes active, so starts the monitoring
- * and the LRU-lists sorting. 150 (15%) by default.
- */
-static unsigned long wmarks_mid __read_mostly = 150;
-module_param(wmarks_mid, ulong, 0600);
-
-/*
- * Free memory rate (per thousand) for the low watermark.
- *
- * If free memory of the system in bytes per thousand bytes is lower than this,
- * DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks
- * the watermarks. 50 (5%) by default.
- */
-static unsigned long wmarks_low __read_mostly = 50;
-module_param(wmarks_low, ulong, 0600);
-
-/*
- * Sampling interval for the monitoring in microseconds.
- *
- * The sampling interval of DAMON for the hot/cold memory monitoring. Please
- * refer to the DAMON documentation for more detail. 5 ms by default.
- */
-static unsigned long sample_interval __read_mostly = 5000;
-module_param(sample_interval, ulong, 0600);
-
-/*
- * Aggregation interval for the monitoring in microseconds.
- *
- * The aggregation interval of DAMON for the hot/cold memory monitoring.
- * Please refer to the DAMON documentation for more detail. 100 ms by default.
- */
-static unsigned long aggr_interval __read_mostly = 100000;
-module_param(aggr_interval, ulong, 0600);
-
-/*
- * Minimum number of monitoring regions.
- *
- * The minimal number of monitoring regions of DAMON for the hot/cold memory
- * monitoring. This can be used to set lower-bound of the monitoring quality.
- * But, setting this too high could result in increased monitoring overhead.
- * Please refer to the DAMON documentation for more detail. 10 by default.
- */
-static unsigned long min_nr_regions __read_mostly = 10;
-module_param(min_nr_regions, ulong, 0600);
-
-/*
- * Maximum number of monitoring regions.
- *
- * The maximum number of monitoring regions of DAMON for the hot/cold memory
- * monitoring. This can be used to set upper-bound of the monitoring overhead.
- * However, setting this too low could result in bad monitoring quality.
- * Please refer to the DAMON documentation for more detail. 1000 by default.
- */
-static unsigned long max_nr_regions __read_mostly = 1000;
-module_param(max_nr_regions, ulong, 0600);
+static struct damos_quota damon_lru_sort_quota = {
+ /* Use up to 10 ms per 1 sec, by default */
+ .ms = 10,
+ .sz = 0,
+ .reset_interval = 1000,
+ /* Within the quota, mark hotter regions accessed first. */
+ .weight_sz = 0,
+ .weight_nr_accesses = 1,
+ .weight_age = 0,
+};
+DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota);
+
+static struct damos_watermarks damon_lru_sort_wmarks = {
+ .metric = DAMOS_WMARK_FREE_MEM_RATE,
+ .interval = 5000000, /* 5 seconds */
+ .high = 200, /* 20 percent */
+ .mid = 150, /* 15 percent */
+ .low = 50, /* 5 percent */
+};
+DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_lru_sort_wmarks);
+
+static struct damon_attrs damon_lru_sort_mon_attrs = {
+ .sample_interval = 5000, /* 5 ms */
+ .aggr_interval = 100000, /* 100 ms */
+ .ops_update_interval = 0,
+ .min_nr_regions = 10,
+ .max_nr_regions = 1000,
+};
+DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs);
/*
* Start of the target memory region in physical address.
@@ -194,222 +120,97 @@ module_param(monitor_region_end, ulong, 0600);
static int kdamond_pid __read_mostly = -1;
module_param(kdamond_pid, int, 0400);
-/*
- * Number of hot memory regions that tried to be LRU-sorted.
- */
-static unsigned long nr_lru_sort_tried_hot_regions __read_mostly;
-module_param(nr_lru_sort_tried_hot_regions, ulong, 0400);
-
-/*
- * Total bytes of hot memory regions that tried to be LRU-sorted.
- */
-static unsigned long bytes_lru_sort_tried_hot_regions __read_mostly;
-module_param(bytes_lru_sort_tried_hot_regions, ulong, 0400);
-
-/*
- * Number of hot memory regions that successfully be LRU-sorted.
- */
-static unsigned long nr_lru_sorted_hot_regions __read_mostly;
-module_param(nr_lru_sorted_hot_regions, ulong, 0400);
-
-/*
- * Total bytes of hot memory regions that successfully be LRU-sorted.
- */
-static unsigned long bytes_lru_sorted_hot_regions __read_mostly;
-module_param(bytes_lru_sorted_hot_regions, ulong, 0400);
-
-/*
- * Number of times that the time quota limit for hot regions have exceeded
- */
-static unsigned long nr_hot_quota_exceeds __read_mostly;
-module_param(nr_hot_quota_exceeds, ulong, 0400);
-
-/*
- * Number of cold memory regions that tried to be LRU-sorted.
- */
-static unsigned long nr_lru_sort_tried_cold_regions __read_mostly;
-module_param(nr_lru_sort_tried_cold_regions, ulong, 0400);
-
-/*
- * Total bytes of cold memory regions that tried to be LRU-sorted.
- */
-static unsigned long bytes_lru_sort_tried_cold_regions __read_mostly;
-module_param(bytes_lru_sort_tried_cold_regions, ulong, 0400);
-
-/*
- * Number of cold memory regions that successfully be LRU-sorted.
- */
-static unsigned long nr_lru_sorted_cold_regions __read_mostly;
-module_param(nr_lru_sorted_cold_regions, ulong, 0400);
-
-/*
- * Total bytes of cold memory regions that successfully be LRU-sorted.
- */
-static unsigned long bytes_lru_sorted_cold_regions __read_mostly;
-module_param(bytes_lru_sorted_cold_regions, ulong, 0400);
-
-/*
- * Number of times that the time quota limit for cold regions have exceeded
- */
-static unsigned long nr_cold_quota_exceeds __read_mostly;
-module_param(nr_cold_quota_exceeds, ulong, 0400);
+static struct damos_stat damon_lru_sort_hot_stat;
+DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_hot_stat,
+ lru_sort_tried_hot_regions, lru_sorted_hot_regions,
+ hot_quota_exceeds);
+
+static struct damos_stat damon_lru_sort_cold_stat;
+DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat,
+ lru_sort_tried_cold_regions, lru_sorted_cold_regions,
+ cold_quota_exceeds);
+
+static struct damos_access_pattern damon_lru_sort_stub_pattern = {
+ /* Find regions having PAGE_SIZE or larger size */
+ .min_sz_region = PAGE_SIZE,
+ .max_sz_region = ULONG_MAX,
+ /* no matter its access frequency */
+ .min_nr_accesses = 0,
+ .max_nr_accesses = UINT_MAX,
+ /* no matter its age */
+ .min_age_region = 0,
+ .max_age_region = UINT_MAX,
+};
static struct damon_ctx *ctx;
static struct damon_target *target;
-struct damon_lru_sort_ram_walk_arg {
- unsigned long start;
- unsigned long end;
-};
-
-static int walk_system_ram(struct resource *res, void *arg)
+static struct damos *damon_lru_sort_new_scheme(
+ struct damos_access_pattern *pattern, enum damos_action action)
{
- struct damon_lru_sort_ram_walk_arg *a = arg;
-
- if (a->end - a->start < resource_size(res)) {
- a->start = res->start;
- a->end = res->end;
- }
- return 0;
-}
+ struct damos_quota quota = damon_lru_sort_quota;
-/*
- * Find biggest 'System RAM' resource and store its start and end address in
- * @start and @end, respectively. If no System RAM is found, returns false.
- */
-static bool get_monitoring_region(unsigned long *start, unsigned long *end)
-{
- struct damon_lru_sort_ram_walk_arg arg = {};
+ /* Use half of total quota for hot/cold pages sorting */
+ quota.ms = quota.ms / 2;
- walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram);
- if (arg.end <= arg.start)
- return false;
-
- *start = arg.start;
- *end = arg.end;
- return true;
+ return damon_new_scheme(
+ /* find the pattern, and */
+ pattern,
+ /* (de)prioritize on LRU-lists */
+ action,
+ /* under the quota. */
+ &quota,
+ /* (De)activate this according to the watermarks. */
+ &damon_lru_sort_wmarks);
}
/* Create a DAMON-based operation scheme for hot memory regions */
static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres)
{
- struct damos_watermarks wmarks = {
- .metric = DAMOS_WMARK_FREE_MEM_RATE,
- .interval = wmarks_interval,
- .high = wmarks_high,
- .mid = wmarks_mid,
- .low = wmarks_low,
- };
- struct damos_quota quota = {
- /*
- * Do not try LRU-lists sorting of hot pages for more than half
- * of quota_ms milliseconds within quota_reset_interval_ms.
- */
- .ms = quota_ms / 2,
- .sz = 0,
- .reset_interval = quota_reset_interval_ms,
- /* Within the quota, mark hotter regions accessed first. */
- .weight_sz = 0,
- .weight_nr_accesses = 1,
- .weight_age = 0,
- };
- struct damos *scheme = damon_new_scheme(
- /* Find regions having PAGE_SIZE or larger size */
- PAGE_SIZE, ULONG_MAX,
- /* and accessed for more than the threshold */
- hot_thres, UINT_MAX,
- /* no matter its age */
- 0, UINT_MAX,
- /* prioritize those on LRU lists, as soon as found */
- DAMOS_LRU_PRIO,
- /* under the quota. */
- &quota,
- /* (De)activate this according to the watermarks. */
- &wmarks);
+ struct damos_access_pattern pattern = damon_lru_sort_stub_pattern;
- return scheme;
+ pattern.min_nr_accesses = hot_thres;
+ return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_PRIO);
}
/* Create a DAMON-based operation scheme for cold memory regions */
static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres)
{
- struct damos_watermarks wmarks = {
- .metric = DAMOS_WMARK_FREE_MEM_RATE,
- .interval = wmarks_interval,
- .high = wmarks_high,
- .mid = wmarks_mid,
- .low = wmarks_low,
- };
- struct damos_quota quota = {
- /*
- * Do not try LRU-lists sorting of cold pages for more than
- * half of quota_ms milliseconds within
- * quota_reset_interval_ms.
- */
- .ms = quota_ms / 2,
- .sz = 0,
- .reset_interval = quota_reset_interval_ms,
- /* Within the quota, mark colder regions not accessed first. */
- .weight_sz = 0,
- .weight_nr_accesses = 0,
- .weight_age = 1,
- };
- struct damos *scheme = damon_new_scheme(
- /* Find regions having PAGE_SIZE or larger size */
- PAGE_SIZE, ULONG_MAX,
- /* and not accessed at all */
- 0, 0,
- /* for cold_thres or more micro-seconds, and */
- cold_thres, UINT_MAX,
- /* mark those as not accessed, as soon as found */
- DAMOS_LRU_DEPRIO,
- /* under the quota. */
- &quota,
- /* (De)activate this according to the watermarks. */
- &wmarks);
+ struct damos_access_pattern pattern = damon_lru_sort_stub_pattern;
- return scheme;
+ pattern.max_nr_accesses = 0;
+ pattern.min_age_region = cold_thres;
+ return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO);
}
static int damon_lru_sort_apply_parameters(void)
{
- struct damos *scheme, *next_scheme;
- struct damon_addr_range addr_range;
+ struct damos *scheme;
unsigned int hot_thres, cold_thres;
int err = 0;
- err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0,
- min_nr_regions, max_nr_regions);
+ err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs);
if (err)
return err;
- /* free previously set schemes */
- damon_for_each_scheme_safe(scheme, next_scheme, ctx)
- damon_destroy_scheme(scheme);
-
/* aggr_interval / sample_interval is the maximum nr_accesses */
- hot_thres = aggr_interval / sample_interval * hot_thres_access_freq /
- 1000;
+ hot_thres = damon_lru_sort_mon_attrs.aggr_interval /
+ damon_lru_sort_mon_attrs.sample_interval *
+ hot_thres_access_freq / 1000;
scheme = damon_lru_sort_new_hot_scheme(hot_thres);
if (!scheme)
return -ENOMEM;
- damon_add_scheme(ctx, scheme);
+ damon_set_schemes(ctx, &scheme, 1);
- cold_thres = cold_min_age / aggr_interval;
+ cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval;
scheme = damon_lru_sort_new_cold_scheme(cold_thres);
if (!scheme)
return -ENOMEM;
damon_add_scheme(ctx, scheme);
- if (monitor_region_start > monitor_region_end)
- return -EINVAL;
- if (!monitor_region_start && !monitor_region_end &&
- !get_monitoring_region(&monitor_region_start,
- &monitor_region_end))
- return -EINVAL;
- addr_range.start = monitor_region_start;
- addr_range.end = monitor_region_end;
- return damon_set_regions(target, &addr_range, 1);
+ return damon_set_region_biggest_system_ram_default(target,
+ &monitor_region_start,
+ &monitor_region_end);
}
static int damon_lru_sort_turn(bool on)
@@ -434,38 +235,31 @@ static int damon_lru_sort_turn(bool on)
return 0;
}
-static struct delayed_work damon_lru_sort_timer;
-static void damon_lru_sort_timer_fn(struct work_struct *work)
-{
- static bool last_enabled;
- bool now_enabled;
-
- now_enabled = enabled;
- if (last_enabled != now_enabled) {
- if (!damon_lru_sort_turn(now_enabled))
- last_enabled = now_enabled;
- else
- enabled = last_enabled;
- }
-}
-static DECLARE_DELAYED_WORK(damon_lru_sort_timer, damon_lru_sort_timer_fn);
-
-static bool damon_lru_sort_initialized;
-
static int damon_lru_sort_enabled_store(const char *val,
const struct kernel_param *kp)
{
- int rc = param_set_bool(val, kp);
+ bool is_enabled = enabled;
+ bool enable;
+ int err;
- if (rc < 0)
- return rc;
+ err = kstrtobool(val, &enable);
+ if (err)
+ return err;
+
+ if (is_enabled == enable)
+ return 0;
- if (!damon_lru_sort_initialized)
- return rc;
+	/* Called before the init function; it will handle this case. */
+ if (!ctx)
+ goto set_param_out;
- schedule_delayed_work(&damon_lru_sort_timer, 0);
+ err = damon_lru_sort_turn(enable);
+ if (err)
+ return err;
- return 0;
+set_param_out:
+ enabled = enable;
+ return err;
}
static const struct kernel_param_ops enabled_param_ops = {
@@ -495,19 +289,10 @@ static int damon_lru_sort_after_aggregation(struct damon_ctx *c)
/* update the stats parameter */
damon_for_each_scheme(s, c) {
- if (s->action == DAMOS_LRU_PRIO) {
- nr_lru_sort_tried_hot_regions = s->stat.nr_tried;
- bytes_lru_sort_tried_hot_regions = s->stat.sz_tried;
- nr_lru_sorted_hot_regions = s->stat.nr_applied;
- bytes_lru_sorted_hot_regions = s->stat.sz_applied;
- nr_hot_quota_exceeds = s->stat.qt_exceeds;
- } else if (s->action == DAMOS_LRU_DEPRIO) {
- nr_lru_sort_tried_cold_regions = s->stat.nr_tried;
- bytes_lru_sort_tried_cold_regions = s->stat.sz_tried;
- nr_lru_sorted_cold_regions = s->stat.nr_applied;
- bytes_lru_sorted_cold_regions = s->stat.sz_applied;
- nr_cold_quota_exceeds = s->stat.qt_exceeds;
- }
+ if (s->action == DAMOS_LRU_PRIO)
+ damon_lru_sort_hot_stat = s->stat;
+ else if (s->action == DAMOS_LRU_DEPRIO)
+ damon_lru_sort_cold_stat = s->stat;
}
return damon_lru_sort_handle_commit_inputs();
@@ -520,29 +305,19 @@ static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c)
static int __init damon_lru_sort_init(void)
{
- ctx = damon_new_ctx();
- if (!ctx)
- return -ENOMEM;
+ int err = damon_modules_new_paddr_ctx_target(&ctx, &target);
- if (damon_select_ops(ctx, DAMON_OPS_PADDR)) {
- damon_destroy_ctx(ctx);
- return -EINVAL;
- }
+ if (err)
+ return err;
ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check;
ctx->callback.after_aggregation = damon_lru_sort_after_aggregation;
- target = damon_new_target();
- if (!target) {
- damon_destroy_ctx(ctx);
- return -ENOMEM;
- }
- damon_add_target(ctx, target);
+	/* 'enabled' has been set before this function, probably via the command line */
+ if (enabled)
+ err = damon_lru_sort_turn(true);
- schedule_delayed_work(&damon_lru_sort_timer, 0);
-
- damon_lru_sort_initialized = true;
- return 0;
+ return err;
}
module_init(damon_lru_sort_init);
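A worked example of the thresholds computed in damon_lru_sort_apply_parameters() above (illustrative only, not part of the patch): with the damon_lru_sort_mon_attrs defaults (sample_interval = 5000 us, aggr_interval = 100000 us), the maximum nr_accesses per aggregation interval is 100000 / 5000 = 20, so hot_thres = 20 * hot_thres_access_freq / 1000; a hot_thres_access_freq of, say, 500 per-thousand would give hot_thres = 10 accesses. Likewise, with the default cold_min_age of 120000000 us, cold_thres = 120000000 / 100000 = 1200 aggregation intervals.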
diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c
new file mode 100644
index 000000000000..b2381a8466ec
--- /dev/null
+++ b/mm/damon/modules-common.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common Primitives for DAMON Modules
+ *
+ * Author: SeongJae Park <sjpark@amazon.de>
+ */
+
+#include <linux/damon.h>
+
+#include "modules-common.h"
+
+/*
+ * Allocate, set, and return a DAMON context for the physical address space.
+ * @ctxp: Pointer to save the pointer to the newly created context
+ * @targetp: Pointer to save the pointer to the newly created target
+ */
+int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp,
+ struct damon_target **targetp)
+{
+ struct damon_ctx *ctx;
+ struct damon_target *target;
+
+ ctx = damon_new_ctx();
+ if (!ctx)
+ return -ENOMEM;
+
+ if (damon_select_ops(ctx, DAMON_OPS_PADDR)) {
+ damon_destroy_ctx(ctx);
+ return -EINVAL;
+ }
+
+ target = damon_new_target();
+ if (!target) {
+ damon_destroy_ctx(ctx);
+ return -ENOMEM;
+ }
+ damon_add_target(ctx, target);
+
+ *ctxp = ctx;
+ *targetp = target;
+ return 0;
+}
diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h
new file mode 100644
index 000000000000..f49cdb417005
--- /dev/null
+++ b/mm/damon/modules-common.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common Primitives for DAMON Modules
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/moduleparam.h>
+
+#define DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(attrs) \
+ module_param_named(sample_interval, attrs.sample_interval, \
+ ulong, 0600); \
+ module_param_named(aggr_interval, attrs.aggr_interval, ulong, \
+ 0600); \
+ module_param_named(min_nr_regions, attrs.min_nr_regions, ulong, \
+ 0600); \
+ module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \
+ 0600);
+
+#define DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \
+ module_param_named(quota_ms, quota.ms, ulong, 0600); \
+ module_param_named(quota_reset_interval_ms, \
+ quota.reset_interval, ulong, 0600);
+
+#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \
+ DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \
+ module_param_named(quota_sz, quota.sz, ulong, 0600);
+
+#define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \
+ module_param_named(wmarks_interval, wmarks.interval, ulong, \
+ 0600); \
+ module_param_named(wmarks_high, wmarks.high, ulong, 0600); \
+ module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \
+ module_param_named(wmarks_low, wmarks.low, ulong, 0600);
+
+#define DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(stat, try_name, \
+ succ_name, qt_exceed_name) \
+ module_param_named(nr_##try_name, stat.nr_tried, ulong, 0400); \
+ module_param_named(bytes_##try_name, stat.sz_tried, ulong, \
+ 0400); \
+ module_param_named(nr_##succ_name, stat.nr_applied, ulong, \
+ 0400); \
+ module_param_named(bytes_##succ_name, stat.sz_applied, ulong, \
+ 0400); \
+ module_param_named(nr_##qt_exceed_name, stat.qt_exceeds, ulong, \
+ 0400);
+
+int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp,
+ struct damon_target **targetp);
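
A minimal sketch (illustrative only, not part of this patch) of how a DAMON module is expected to combine damon_modules_new_paddr_ctx_target() with the parameter macros above, following the pattern of lru_sort.c and reclaim.c in this series. The module name "damon_sample" and the quota values are hypothetical; the interfaces used are the ones introduced in this diff.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical mm/damon/sample.c, for illustration only. */

#include <linux/damon.h>
#include <linux/module.h>

#include "modules-common.h"

/* Time quota: up to 10 ms of work per 1 second, tunable via module params. */
static struct damos_quota damon_sample_quota = {
	.ms = 10,
	.sz = 0,
	.reset_interval = 1000,
};
/* Expands to module_param_named() for quota_ms, quota_sz and quota_reset_interval_ms. */
DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_sample_quota);

static struct damon_ctx *ctx;
static struct damon_target *target;

static int __init damon_sample_init(void)
{
	/* One physical-address-space context with a single target. */
	return damon_modules_new_paddr_ctx_target(&ctx, &target);
}
module_init(damon_sample_init);

MODULE_LICENSE("GPL");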
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index b1335de200e7..75409601f934 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -88,7 +88,7 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr)
#define DAMON_MAX_SUBSCORE (100)
#define DAMON_MAX_AGE_IN_LOG (32)
-int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
+int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s)
{
unsigned int max_nr_accesses;
@@ -99,10 +99,10 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
unsigned int age_weight = s->quota.weight_age;
int hotness;
- max_nr_accesses = c->aggr_interval / c->sample_interval;
+ max_nr_accesses = c->attrs.aggr_interval / c->attrs.sample_interval;
freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses;
- age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000;
+ age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000;
for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
age_in_log++, age_in_sec >>= 1)
;
@@ -127,48 +127,14 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
*/
hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE;
- /* Return coldness of the region */
- return DAMOS_MAX_SCORE - hotness;
+ return hotness;
}
-int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
+int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s)
{
- unsigned int max_nr_accesses;
- int freq_subscore;
- unsigned int age_in_sec;
- int age_in_log, age_subscore;
- unsigned int freq_weight = s->quota.weight_nr_accesses;
- unsigned int age_weight = s->quota.weight_age;
- int hotness;
-
- max_nr_accesses = c->aggr_interval / c->sample_interval;
- freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses;
-
- age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000;
- for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
- age_in_log++, age_in_sec >>= 1)
- ;
+ int hotness = damon_hot_score(c, r, s);
- /* If frequency is 0, higher age means it's colder */
- if (freq_subscore == 0)
- age_in_log *= -1;
-
- /*
- * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG].
- * Scale it to be in [0, 100] and set it as age subscore.
- */
- age_in_log += DAMON_MAX_AGE_IN_LOG;
- age_subscore = age_in_log * DAMON_MAX_SUBSCORE /
- DAMON_MAX_AGE_IN_LOG / 2;
-
- hotness = (freq_weight * freq_subscore + age_weight * age_subscore);
- if (freq_weight + age_weight)
- hotness /= freq_weight + age_weight;
- /*
- * Transform it to fit in [0, DAMOS_MAX_SCORE]
- */
- hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE;
-
- return hotness;
+ /* Return coldness of the region */
+ return DAMOS_MAX_SCORE - hotness;
}
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index 52329ff361cd..8d82d3722204 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -12,7 +12,7 @@ struct page *damon_get_page(unsigned long pfn);
void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr);
void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr);
-int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
+int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s);
int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index dc131c6a5403..e1a4315c4be6 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -63,8 +63,7 @@ out:
folio_put(folio);
}
-static void __damon_pa_prepare_access_check(struct damon_ctx *ctx,
- struct damon_region *r)
+static void __damon_pa_prepare_access_check(struct damon_region *r)
{
r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
@@ -78,7 +77,7 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t)
- __damon_pa_prepare_access_check(ctx, r);
+ __damon_pa_prepare_access_check(r);
}
}
@@ -166,8 +165,7 @@ out:
return result.accessed;
}
-static void __damon_pa_check_access(struct damon_ctx *ctx,
- struct damon_region *r)
+static void __damon_pa_check_access(struct damon_region *r)
{
static unsigned long last_addr;
static unsigned long last_page_sz = PAGE_SIZE;
@@ -196,7 +194,7 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t) {
- __damon_pa_check_access(ctx, r);
+ __damon_pa_check_access(r);
max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
}
}
@@ -233,7 +231,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r)
return applied * PAGE_SIZE;
}
-static unsigned long damon_pa_mark_accessed(struct damon_region *r)
+static inline unsigned long damon_pa_mark_accessed_or_deactivate(
+ struct damon_region *r, bool mark_accessed)
{
unsigned long addr, applied = 0;
@@ -242,27 +241,24 @@ static unsigned long damon_pa_mark_accessed(struct damon_region *r)
if (!page)
continue;
- mark_page_accessed(page);
+ if (mark_accessed)
+ mark_page_accessed(page);
+ else
+ deactivate_page(page);
put_page(page);
applied++;
}
return applied * PAGE_SIZE;
}
-static unsigned long damon_pa_deactivate_pages(struct damon_region *r)
+static unsigned long damon_pa_mark_accessed(struct damon_region *r)
{
- unsigned long addr, applied = 0;
-
- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct page *page = damon_get_page(PHYS_PFN(addr));
+ return damon_pa_mark_accessed_or_deactivate(r, true);
+}
- if (!page)
- continue;
- deactivate_page(page);
- put_page(page);
- applied++;
- }
- return applied * PAGE_SIZE;
+static unsigned long damon_pa_deactivate_pages(struct damon_region *r)
+{
+ return damon_pa_mark_accessed_or_deactivate(r, false);
}
static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
@@ -276,7 +272,10 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
return damon_pa_mark_accessed(r);
case DAMOS_LRU_DEPRIO:
return damon_pa_deactivate_pages(r);
+ case DAMOS_STAT:
+ break;
default:
+		/* DAMOS actions that are not yet supported by 'paddr'. */
break;
}
return 0;
@@ -288,11 +287,11 @@ static int damon_pa_scheme_score(struct damon_ctx *context,
{
switch (scheme->action) {
case DAMOS_PAGEOUT:
- return damon_pageout_score(context, r, scheme);
+ return damon_cold_score(context, r, scheme);
case DAMOS_LRU_PRIO:
return damon_hot_score(context, r, scheme);
case DAMOS_LRU_DEPRIO:
- return damon_pageout_score(context, r, scheme);
+ return damon_cold_score(context, r, scheme);
default:
break;
}
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index a7faf51b4bd4..e82631f39481 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -8,10 +8,10 @@
#define pr_fmt(fmt) "damon-reclaim: " fmt
#include <linux/damon.h>
-#include <linux/ioport.h>
+#include <linux/kstrtox.h>
#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/workqueue.h>
+
+#include "modules-common.h"
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
@@ -50,124 +50,35 @@ module_param(commit_inputs, bool, 0600);
static unsigned long min_age __read_mostly = 120000000;
module_param(min_age, ulong, 0600);
-/*
- * Limit of time for trying the reclamation in milliseconds.
- *
- * DAMON_RECLAIM tries to use only up to this time within a time window
- * (quota_reset_interval_ms) for trying reclamation of cold pages. This can be
- * used for limiting CPU consumption of DAMON_RECLAIM. If the value is zero,
- * the limit is disabled.
- *
- * 10 ms by default.
- */
-static unsigned long quota_ms __read_mostly = 10;
-module_param(quota_ms, ulong, 0600);
-
-/*
- * Limit of size of memory for the reclamation in bytes.
- *
- * DAMON_RECLAIM charges amount of memory which it tried to reclaim within a
- * time window (quota_reset_interval_ms) and makes no more than this limit is
- * tried. This can be used for limiting consumption of CPU and IO. If this
- * value is zero, the limit is disabled.
- *
- * 128 MiB by default.
- */
-static unsigned long quota_sz __read_mostly = 128 * 1024 * 1024;
-module_param(quota_sz, ulong, 0600);
-
-/*
- * The time/size quota charge reset interval in milliseconds.
- *
- * The charge reset interval for the quota of time (quota_ms) and size
- * (quota_sz). That is, DAMON_RECLAIM does not try reclamation for more than
- * quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms
- * milliseconds.
- *
- * 1 second by default.
- */
-static unsigned long quota_reset_interval_ms __read_mostly = 1000;
-module_param(quota_reset_interval_ms, ulong, 0600);
-
-/*
- * The watermarks check time interval in microseconds.
- *
- * Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is
- * enabled but inactive due to its watermarks rule. 5 seconds by default.
- */
-static unsigned long wmarks_interval __read_mostly = 5000000;
-module_param(wmarks_interval, ulong, 0600);
-
-/*
- * Free memory rate (per thousand) for the high watermark.
- *
- * If free memory of the system in bytes per thousand bytes is higher than
- * this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically
- * checks the watermarks. 500 (50%) by default.
- */
-static unsigned long wmarks_high __read_mostly = 500;
-module_param(wmarks_high, ulong, 0600);
-
-/*
- * Free memory rate (per thousand) for the middle watermark.
- *
- * If free memory of the system in bytes per thousand bytes is between this and
- * the low watermark, DAMON_RECLAIM becomes active, so starts the monitoring
- * and the reclaiming. 400 (40%) by default.
- */
-static unsigned long wmarks_mid __read_mostly = 400;
-module_param(wmarks_mid, ulong, 0600);
-
-/*
- * Free memory rate (per thousand) for the low watermark.
- *
- * If free memory of the system in bytes per thousand bytes is lower than this,
- * DAMON_RECLAIM becomes inactive, so it does nothing but periodically checks
- * the watermarks. In the case, the system falls back to the LRU-based page
- * granularity reclamation logic. 200 (20%) by default.
- */
-static unsigned long wmarks_low __read_mostly = 200;
-module_param(wmarks_low, ulong, 0600);
-
-/*
- * Sampling interval for the monitoring in microseconds.
- *
- * The sampling interval of DAMON for the cold memory monitoring. Please refer
- * to the DAMON documentation for more detail. 5 ms by default.
- */
-static unsigned long sample_interval __read_mostly = 5000;
-module_param(sample_interval, ulong, 0600);
-
-/*
- * Aggregation interval for the monitoring in microseconds.
- *
- * The aggregation interval of DAMON for the cold memory monitoring. Please
- * refer to the DAMON documentation for more detail. 100 ms by default.
- */
-static unsigned long aggr_interval __read_mostly = 100000;
-module_param(aggr_interval, ulong, 0600);
-
-/*
- * Minimum number of monitoring regions.
- *
- * The minimal number of monitoring regions of DAMON for the cold memory
- * monitoring. This can be used to set lower-bound of the monitoring quality.
- * But, setting this too high could result in increased monitoring overhead.
- * Please refer to the DAMON documentation for more detail. 10 by default.
- */
-static unsigned long min_nr_regions __read_mostly = 10;
-module_param(min_nr_regions, ulong, 0600);
-
-/*
- * Maximum number of monitoring regions.
- *
- * The maximum number of monitoring regions of DAMON for the cold memory
- * monitoring. This can be used to set upper-bound of the monitoring overhead.
- * However, setting this too low could result in bad monitoring quality.
- * Please refer to the DAMON documentation for more detail. 1000 by default.
- */
-static unsigned long max_nr_regions __read_mostly = 1000;
-module_param(max_nr_regions, ulong, 0600);
+static struct damos_quota damon_reclaim_quota = {
+	/* use up to 10 ms of time, reclaim up to 128 MiB per 1 sec by default */
+ .ms = 10,
+ .sz = 128 * 1024 * 1024,
+ .reset_interval = 1000,
+ /* Within the quota, page out older regions first. */
+ .weight_sz = 0,
+ .weight_nr_accesses = 0,
+ .weight_age = 1
+};
+DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota);
+
+static struct damos_watermarks damon_reclaim_wmarks = {
+ .metric = DAMOS_WMARK_FREE_MEM_RATE,
+ .interval = 5000000, /* 5 seconds */
+ .high = 500, /* 50 percent */
+ .mid = 400, /* 40 percent */
+ .low = 200, /* 20 percent */
+};
+DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_reclaim_wmarks);
+
+static struct damon_attrs damon_reclaim_mon_attrs = {
+ .sample_interval = 5000, /* 5 ms */
+ .aggr_interval = 100000, /* 100 ms */
+ .ops_update_interval = 0,
+ .min_nr_regions = 10,
+ .max_nr_regions = 1000,
+};
+DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_reclaim_mon_attrs);
/*
* Start of the target memory region in physical address.
@@ -196,119 +107,44 @@ module_param(monitor_region_end, ulong, 0600);
static int kdamond_pid __read_mostly = -1;
module_param(kdamond_pid, int, 0400);
-/*
- * Number of memory regions that tried to be reclaimed.
- */
-static unsigned long nr_reclaim_tried_regions __read_mostly;
-module_param(nr_reclaim_tried_regions, ulong, 0400);
-
-/*
- * Total bytes of memory regions that tried to be reclaimed.
- */
-static unsigned long bytes_reclaim_tried_regions __read_mostly;
-module_param(bytes_reclaim_tried_regions, ulong, 0400);
-
-/*
- * Number of memory regions that successfully be reclaimed.
- */
-static unsigned long nr_reclaimed_regions __read_mostly;
-module_param(nr_reclaimed_regions, ulong, 0400);
-
-/*
- * Total bytes of memory regions that successfully be reclaimed.
- */
-static unsigned long bytes_reclaimed_regions __read_mostly;
-module_param(bytes_reclaimed_regions, ulong, 0400);
-
-/*
- * Number of times that the time/space quota limits have exceeded
- */
-static unsigned long nr_quota_exceeds __read_mostly;
-module_param(nr_quota_exceeds, ulong, 0400);
+static struct damos_stat damon_reclaim_stat;
+DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat,
+ reclaim_tried_regions, reclaimed_regions, quota_exceeds);
static struct damon_ctx *ctx;
static struct damon_target *target;
-struct damon_reclaim_ram_walk_arg {
- unsigned long start;
- unsigned long end;
-};
-
-static int walk_system_ram(struct resource *res, void *arg)
-{
- struct damon_reclaim_ram_walk_arg *a = arg;
-
- if (a->end - a->start < resource_size(res)) {
- a->start = res->start;
- a->end = res->end;
- }
- return 0;
-}
-
-/*
- * Find biggest 'System RAM' resource and store its start and end address in
- * @start and @end, respectively. If no System RAM is found, returns false.
- */
-static bool get_monitoring_region(unsigned long *start, unsigned long *end)
-{
- struct damon_reclaim_ram_walk_arg arg = {};
-
- walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram);
- if (arg.end <= arg.start)
- return false;
-
- *start = arg.start;
- *end = arg.end;
- return true;
-}
-
static struct damos *damon_reclaim_new_scheme(void)
{
- struct damos_watermarks wmarks = {
- .metric = DAMOS_WMARK_FREE_MEM_RATE,
- .interval = wmarks_interval,
- .high = wmarks_high,
- .mid = wmarks_mid,
- .low = wmarks_low,
+ struct damos_access_pattern pattern = {
+ /* Find regions having PAGE_SIZE or larger size */
+ .min_sz_region = PAGE_SIZE,
+ .max_sz_region = ULONG_MAX,
+ /* and not accessed at all */
+ .min_nr_accesses = 0,
+ .max_nr_accesses = 0,
+ /* for min_age or more micro-seconds */
+ .min_age_region = min_age /
+ damon_reclaim_mon_attrs.aggr_interval,
+ .max_age_region = UINT_MAX,
};
- struct damos_quota quota = {
- /*
- * Do not try reclamation for more than quota_ms milliseconds
- * or quota_sz bytes within quota_reset_interval_ms.
- */
- .ms = quota_ms,
- .sz = quota_sz,
- .reset_interval = quota_reset_interval_ms,
- /* Within the quota, page out older regions first. */
- .weight_sz = 0,
- .weight_nr_accesses = 0,
- .weight_age = 1
- };
- struct damos *scheme = damon_new_scheme(
- /* Find regions having PAGE_SIZE or larger size */
- PAGE_SIZE, ULONG_MAX,
- /* and not accessed at all */
- 0, 0,
- /* for min_age or more micro-seconds, and */
- min_age / aggr_interval, UINT_MAX,
+
+ return damon_new_scheme(
+ &pattern,
/* page out those, as soon as found */
DAMOS_PAGEOUT,
/* under the quota. */
- &quota,
+ &damon_reclaim_quota,
/* (De)activate this according to the watermarks. */
- &wmarks);
-
- return scheme;
+ &damon_reclaim_wmarks);
}
static int damon_reclaim_apply_parameters(void)
{
struct damos *scheme;
- struct damon_addr_range addr_range;
int err = 0;
- err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0,
- min_nr_regions, max_nr_regions);
+ err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs);
if (err)
return err;
@@ -316,19 +152,11 @@ static int damon_reclaim_apply_parameters(void)
scheme = damon_reclaim_new_scheme();
if (!scheme)
return -ENOMEM;
- err = damon_set_schemes(ctx, &scheme, 1);
- if (err)
- return err;
+ damon_set_schemes(ctx, &scheme, 1);
- if (monitor_region_start > monitor_region_end)
- return -EINVAL;
- if (!monitor_region_start && !monitor_region_end &&
- !get_monitoring_region(&monitor_region_start,
- &monitor_region_end))
- return -EINVAL;
- addr_range.start = monitor_region_start;
- addr_range.end = monitor_region_end;
- return damon_set_regions(target, &addr_range, 1);
+ return damon_set_region_biggest_system_ram_default(target,
+ &monitor_region_start,
+ &monitor_region_end);
}
static int damon_reclaim_turn(bool on)
@@ -353,38 +181,31 @@ static int damon_reclaim_turn(bool on)
return 0;
}
-static struct delayed_work damon_reclaim_timer;
-static void damon_reclaim_timer_fn(struct work_struct *work)
-{
- static bool last_enabled;
- bool now_enabled;
-
- now_enabled = enabled;
- if (last_enabled != now_enabled) {
- if (!damon_reclaim_turn(now_enabled))
- last_enabled = now_enabled;
- else
- enabled = last_enabled;
- }
-}
-static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn);
-
-static bool damon_reclaim_initialized;
-
static int damon_reclaim_enabled_store(const char *val,
const struct kernel_param *kp)
{
- int rc = param_set_bool(val, kp);
+ bool is_enabled = enabled;
+ bool enable;
+ int err;
+
+ err = kstrtobool(val, &enable);
+ if (err)
+ return err;
- if (rc < 0)
- return rc;
+ if (is_enabled == enable)
+ return 0;
- /* system_wq might not initialized yet */
- if (!damon_reclaim_initialized)
- return rc;
+	/* Called before the init function; it will handle this case. */
+ if (!ctx)
+ goto set_param_out;
- schedule_delayed_work(&damon_reclaim_timer, 0);
- return 0;
+ err = damon_reclaim_turn(enable);
+ if (err)
+ return err;
+
+set_param_out:
+ enabled = enable;
+ return err;
}
static const struct kernel_param_ops enabled_param_ops = {
@@ -413,13 +234,8 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c)
struct damos *s;
/* update the stats parameter */
- damon_for_each_scheme(s, c) {
- nr_reclaim_tried_regions = s->stat.nr_tried;
- bytes_reclaim_tried_regions = s->stat.sz_tried;
- nr_reclaimed_regions = s->stat.nr_applied;
- bytes_reclaimed_regions = s->stat.sz_applied;
- nr_quota_exceeds = s->stat.qt_exceeds;
- }
+ damon_for_each_scheme(s, c)
+ damon_reclaim_stat = s->stat;
return damon_reclaim_handle_commit_inputs();
}
@@ -431,29 +247,19 @@ static int damon_reclaim_after_wmarks_check(struct damon_ctx *c)
static int __init damon_reclaim_init(void)
{
- ctx = damon_new_ctx();
- if (!ctx)
- return -ENOMEM;
+ int err = damon_modules_new_paddr_ctx_target(&ctx, &target);
- if (damon_select_ops(ctx, DAMON_OPS_PADDR)) {
- damon_destroy_ctx(ctx);
- return -EINVAL;
- }
+ if (err)
+ return err;
ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check;
ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
- target = damon_new_target();
- if (!target) {
- damon_destroy_ctx(ctx);
- return -ENOMEM;
- }
- damon_add_target(ctx, target);
-
- schedule_delayed_work(&damon_reclaim_timer, 0);
+	/* 'enabled' has been set before this function, probably via the command line */
+ if (enabled)
+ err = damon_reclaim_turn(true);
- damon_reclaim_initialized = true;
- return 0;
+ return err;
}
module_init(damon_reclaim_init);
diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c
new file mode 100644
index 000000000000..52bebf242f74
--- /dev/null
+++ b/mm/damon/sysfs-common.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common Primitives for DAMON Sysfs Interface
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/slab.h>
+
+#include "sysfs-common.h"
+
+DEFINE_MUTEX(damon_sysfs_lock);
+
+/*
+ * unsigned long range directory
+ */
+
+struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc(
+ unsigned long min,
+ unsigned long max)
+{
+ struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range),
+ GFP_KERNEL);
+
+ if (!range)
+ return NULL;
+ range->kobj = (struct kobject){};
+ range->min = min;
+ range->max = max;
+
+ return range;
+}
+
+static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_ul_range *range = container_of(kobj,
+ struct damon_sysfs_ul_range, kobj);
+
+ return sysfs_emit(buf, "%lu\n", range->min);
+}
+
+static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_ul_range *range = container_of(kobj,
+ struct damon_sysfs_ul_range, kobj);
+ unsigned long min;
+ int err;
+
+ err = kstrtoul(buf, 0, &min);
+ if (err)
+ return err;
+
+ range->min = min;
+ return count;
+}
+
+static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_ul_range *range = container_of(kobj,
+ struct damon_sysfs_ul_range, kobj);
+
+ return sysfs_emit(buf, "%lu\n", range->max);
+}
+
+static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_ul_range *range = container_of(kobj,
+ struct damon_sysfs_ul_range, kobj);
+ unsigned long max;
+ int err;
+
+ err = kstrtoul(buf, 0, &max);
+ if (err)
+ return err;
+
+ range->max = max;
+ return count;
+}
+
+void damon_sysfs_ul_range_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_ul_range_min_attr =
+ __ATTR_RW_MODE(min, 0600);
+
+static struct kobj_attribute damon_sysfs_ul_range_max_attr =
+ __ATTR_RW_MODE(max, 0600);
+
+static struct attribute *damon_sysfs_ul_range_attrs[] = {
+ &damon_sysfs_ul_range_min_attr.attr,
+ &damon_sysfs_ul_range_max_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_ul_range);
+
+struct kobj_type damon_sysfs_ul_range_ktype = {
+ .release = damon_sysfs_ul_range_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_ul_range_groups,
+};
+
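
A minimal usage sketch (illustrative only, not part of this patch) of the ul_range helpers above, assuming a caller such as sysfs-schemes.c that already holds a parent kobject. The function name and the "sample" directory name are hypothetical; kobject_init_and_add()/kobject_put() are the standard kobject API.

static int damon_sysfs_example_add_range(struct kobject *parent)
{
	struct damon_sysfs_ul_range *range;
	int err;

	/* Start from a "match everything" range: [0, ULONG_MAX]. */
	range = damon_sysfs_ul_range_alloc(0, ULONG_MAX);
	if (!range)
		return -ENOMEM;

	/* Creates the directory, with 'min' and 'max' files coming from the ktype. */
	err = kobject_init_and_add(&range->kobj,
			&damon_sysfs_ul_range_ktype, parent, "sample");
	if (err)
		/* The ktype's release() callback frees @range. */
		kobject_put(&range->kobj);
	return err;
}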
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
new file mode 100644
index 000000000000..604a6cbc3ede
--- /dev/null
+++ b/mm/damon/sysfs-common.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common Primitives for DAMON Sysfs Interface
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/damon.h>
+#include <linux/kobject.h>
+
+extern struct mutex damon_sysfs_lock;
+
+struct damon_sysfs_ul_range {
+ struct kobject kobj;
+ unsigned long min;
+ unsigned long max;
+};
+
+struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc(
+ unsigned long min,
+ unsigned long max);
+void damon_sysfs_ul_range_release(struct kobject *kobj);
+
+extern struct kobj_type damon_sysfs_ul_range_ktype;
+
+/*
+ * schemes directory
+ */
+
+struct damon_sysfs_schemes {
+ struct kobject kobj;
+ struct damon_sysfs_scheme **schemes_arr;
+ int nr;
+};
+
+struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void);
+void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes);
+
+extern struct kobj_type damon_sysfs_schemes_ktype;
+
+int damon_sysfs_set_schemes(struct damon_ctx *ctx,
+ struct damon_sysfs_schemes *sysfs_schemes);
+
+void damon_sysfs_schemes_update_stats(
+ struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx);
+
+int damon_sysfs_schemes_update_regions_start(
+ struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx);
+
+int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx);
+
+int damon_sysfs_schemes_clear_regions(
+ struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx);
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
new file mode 100644
index 000000000000..81fc4d27f4e4
--- /dev/null
+++ b/mm/damon/sysfs-schemes.c
@@ -0,0 +1,1338 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON sysfs Interface
+ *
+ * Copyright (c) 2022 SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/slab.h>
+
+#include "sysfs-common.h"
+
+/*
+ * scheme region directory
+ */
+
+struct damon_sysfs_scheme_region {
+ struct kobject kobj;
+ struct damon_addr_range ar;
+ unsigned int nr_accesses;
+ unsigned int age;
+ struct list_head list;
+};
+
+static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc(
+ struct damon_region *region)
+{
+ struct damon_sysfs_scheme_region *sysfs_region = kmalloc(
+ sizeof(*sysfs_region), GFP_KERNEL);
+
+ if (!sysfs_region)
+ return NULL;
+ sysfs_region->kobj = (struct kobject){};
+ sysfs_region->ar = region->ar;
+ sysfs_region->nr_accesses = region->nr_accesses;
+ sysfs_region->age = region->age;
+ INIT_LIST_HEAD(&sysfs_region->list);
+ return sysfs_region;
+}
+
+static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_scheme_region *region = container_of(kobj,
+ struct damon_sysfs_scheme_region, kobj);
+
+ return sysfs_emit(buf, "%lu\n", region->ar.start);
+}
+
+static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_scheme_region *region = container_of(kobj,
+ struct damon_sysfs_scheme_region, kobj);
+
+ return sysfs_emit(buf, "%lu\n", region->ar.end);
+}
+
+static ssize_t nr_accesses_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_region *region = container_of(kobj,
+ struct damon_sysfs_scheme_region, kobj);
+
+ return sysfs_emit(buf, "%u\n", region->nr_accesses);
+}
+
+static ssize_t age_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_scheme_region *region = container_of(kobj,
+ struct damon_sysfs_scheme_region, kobj);
+
+ return sysfs_emit(buf, "%u\n", region->age);
+}
+
+static void damon_sysfs_scheme_region_release(struct kobject *kobj)
+{
+ struct damon_sysfs_scheme_region *region = container_of(kobj,
+ struct damon_sysfs_scheme_region, kobj);
+
+ list_del(&region->list);
+ kfree(region);
+}
+
+static struct kobj_attribute damon_sysfs_scheme_region_start_attr =
+ __ATTR_RO_MODE(start, 0400);
+
+static struct kobj_attribute damon_sysfs_scheme_region_end_attr =
+ __ATTR_RO_MODE(end, 0400);
+
+static struct kobj_attribute damon_sysfs_scheme_region_nr_accesses_attr =
+ __ATTR_RO_MODE(nr_accesses, 0400);
+
+static struct kobj_attribute damon_sysfs_scheme_region_age_attr =
+ __ATTR_RO_MODE(age, 0400);
+
+static struct attribute *damon_sysfs_scheme_region_attrs[] = {
+ &damon_sysfs_scheme_region_start_attr.attr,
+ &damon_sysfs_scheme_region_end_attr.attr,
+ &damon_sysfs_scheme_region_nr_accesses_attr.attr,
+ &damon_sysfs_scheme_region_age_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_scheme_region);
+
+static struct kobj_type damon_sysfs_scheme_region_ktype = {
+ .release = damon_sysfs_scheme_region_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_scheme_region_groups,
+};
+
+/*
+ * scheme regions directory
+ */
+
+struct damon_sysfs_scheme_regions {
+ struct kobject kobj;
+ struct list_head regions_list;
+ int nr_regions;
+};
+
+static struct damon_sysfs_scheme_regions *
+damon_sysfs_scheme_regions_alloc(void)
+{
+ struct damon_sysfs_scheme_regions *regions = kmalloc(sizeof(*regions),
+ GFP_KERNEL);
+
+ regions->kobj = (struct kobject){};
+ INIT_LIST_HEAD(&regions->regions_list);
+ regions->nr_regions = 0;
+ return regions;
+}
+
+static void damon_sysfs_scheme_regions_rm_dirs(
+ struct damon_sysfs_scheme_regions *regions)
+{
+ struct damon_sysfs_scheme_region *r, *next;
+
+ list_for_each_entry_safe(r, next, &regions->regions_list, list) {
+ /* release function deletes it from the list */
+ kobject_put(&r->kobj);
+ regions->nr_regions--;
+ }
+}
+
+static void damon_sysfs_scheme_regions_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_scheme_regions, kobj));
+}
+
+static struct attribute *damon_sysfs_scheme_regions_attrs[] = {
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions);
+
+static struct kobj_type damon_sysfs_scheme_regions_ktype = {
+ .release = damon_sysfs_scheme_regions_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_scheme_regions_groups,
+};
+
+/*
+ * schemes/stats directory
+ */
+
+struct damon_sysfs_stats {
+ struct kobject kobj;
+ unsigned long nr_tried;
+ unsigned long sz_tried;
+ unsigned long nr_applied;
+ unsigned long sz_applied;
+ unsigned long qt_exceeds;
+};
+
+static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL);
+}
+
+static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->nr_tried);
+}
+
+static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->sz_tried);
+}
+
+static ssize_t nr_applied_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->nr_applied);
+}
+
+static ssize_t sz_applied_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->sz_applied);
+}
+
+static ssize_t qt_exceeds_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->qt_exceeds);
+}
+
+static void damon_sysfs_stats_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_stats, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_stats_nr_tried_attr =
+ __ATTR_RO_MODE(nr_tried, 0400);
+
+static struct kobj_attribute damon_sysfs_stats_sz_tried_attr =
+ __ATTR_RO_MODE(sz_tried, 0400);
+
+static struct kobj_attribute damon_sysfs_stats_nr_applied_attr =
+ __ATTR_RO_MODE(nr_applied, 0400);
+
+static struct kobj_attribute damon_sysfs_stats_sz_applied_attr =
+ __ATTR_RO_MODE(sz_applied, 0400);
+
+static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr =
+ __ATTR_RO_MODE(qt_exceeds, 0400);
+
+static struct attribute *damon_sysfs_stats_attrs[] = {
+ &damon_sysfs_stats_nr_tried_attr.attr,
+ &damon_sysfs_stats_sz_tried_attr.attr,
+ &damon_sysfs_stats_nr_applied_attr.attr,
+ &damon_sysfs_stats_sz_applied_attr.attr,
+ &damon_sysfs_stats_qt_exceeds_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_stats);
+
+static struct kobj_type damon_sysfs_stats_ktype = {
+ .release = damon_sysfs_stats_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_stats_groups,
+};
+
+/*
+ * watermarks directory
+ */
+
+struct damon_sysfs_watermarks {
+ struct kobject kobj;
+ enum damos_wmark_metric metric;
+ unsigned long interval_us;
+ unsigned long high;
+ unsigned long mid;
+ unsigned long low;
+};
+
+static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc(
+ enum damos_wmark_metric metric, unsigned long interval_us,
+ unsigned long high, unsigned long mid, unsigned long low)
+{
+ struct damon_sysfs_watermarks *watermarks = kmalloc(
+ sizeof(*watermarks), GFP_KERNEL);
+
+ if (!watermarks)
+ return NULL;
+ watermarks->kobj = (struct kobject){};
+ watermarks->metric = metric;
+ watermarks->interval_us = interval_us;
+ watermarks->high = high;
+ watermarks->mid = mid;
+ watermarks->low = low;
+ return watermarks;
+}
+
+/* Should match with enum damos_wmark_metric */
+static const char * const damon_sysfs_wmark_metric_strs[] = {
+ "none",
+ "free_mem_rate",
+};
+
+static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+
+ return sysfs_emit(buf, "%s\n",
+ damon_sysfs_wmark_metric_strs[watermarks->metric]);
+}
+
+static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+ enum damos_wmark_metric metric;
+
+ for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) {
+ if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) {
+ watermarks->metric = metric;
+ return count;
+ }
+ }
+ return -EINVAL;
+}
+
+static ssize_t interval_us_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+
+ return sysfs_emit(buf, "%lu\n", watermarks->interval_us);
+}
+
+static ssize_t interval_us_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+ int err = kstrtoul(buf, 0, &watermarks->interval_us);
+
+ return err ? err : count;
+}
+
+static ssize_t high_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+
+ return sysfs_emit(buf, "%lu\n", watermarks->high);
+}
+
+static ssize_t high_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+ int err = kstrtoul(buf, 0, &watermarks->high);
+
+ return err ? err : count;
+}
+
+static ssize_t mid_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+
+ return sysfs_emit(buf, "%lu\n", watermarks->mid);
+}
+
+static ssize_t mid_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+ int err = kstrtoul(buf, 0, &watermarks->mid);
+
+ return err ? err : count;
+}
+
+static ssize_t low_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+
+ return sysfs_emit(buf, "%lu\n", watermarks->low);
+}
+
+static ssize_t low_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+ int err = kstrtoul(buf, 0, &watermarks->low);
+
+ return err ? err : count;
+}
+
+static void damon_sysfs_watermarks_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_watermarks_metric_attr =
+ __ATTR_RW_MODE(metric, 0600);
+
+static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr =
+ __ATTR_RW_MODE(interval_us, 0600);
+
+static struct kobj_attribute damon_sysfs_watermarks_high_attr =
+ __ATTR_RW_MODE(high, 0600);
+
+static struct kobj_attribute damon_sysfs_watermarks_mid_attr =
+ __ATTR_RW_MODE(mid, 0600);
+
+static struct kobj_attribute damon_sysfs_watermarks_low_attr =
+ __ATTR_RW_MODE(low, 0600);
+
+static struct attribute *damon_sysfs_watermarks_attrs[] = {
+ &damon_sysfs_watermarks_metric_attr.attr,
+ &damon_sysfs_watermarks_interval_us_attr.attr,
+ &damon_sysfs_watermarks_high_attr.attr,
+ &damon_sysfs_watermarks_mid_attr.attr,
+ &damon_sysfs_watermarks_low_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_watermarks);
+
+static struct kobj_type damon_sysfs_watermarks_ktype = {
+ .release = damon_sysfs_watermarks_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_watermarks_groups,
+};
+
+/*
+ * scheme/weights directory
+ */
+
+struct damon_sysfs_weights {
+ struct kobject kobj;
+ unsigned int sz;
+ unsigned int nr_accesses;
+ unsigned int age;
+};
+
+static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz,
+ unsigned int nr_accesses, unsigned int age)
+{
+ struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights),
+ GFP_KERNEL);
+
+ if (!weights)
+ return NULL;
+ weights->kobj = (struct kobject){};
+ weights->sz = sz;
+ weights->nr_accesses = nr_accesses;
+ weights->age = age;
+ return weights;
+}
+
+static ssize_t sz_permil_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+
+ return sysfs_emit(buf, "%u\n", weights->sz);
+}
+
+static ssize_t sz_permil_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+ int err = kstrtouint(buf, 0, &weights->sz);
+
+ return err ? err : count;
+}
+
+static ssize_t nr_accesses_permil_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+
+ return sysfs_emit(buf, "%u\n", weights->nr_accesses);
+}
+
+static ssize_t nr_accesses_permil_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+ int err = kstrtouint(buf, 0, &weights->nr_accesses);
+
+ return err ? err : count;
+}
+
+static ssize_t age_permil_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+
+ return sysfs_emit(buf, "%u\n", weights->age);
+}
+
+static ssize_t age_permil_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+ int err = kstrtouint(buf, 0, &weights->age);
+
+ return err ? err : count;
+}
+
+static void damon_sysfs_weights_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_weights, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_weights_sz_attr =
+ __ATTR_RW_MODE(sz_permil, 0600);
+
+static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr =
+ __ATTR_RW_MODE(nr_accesses_permil, 0600);
+
+static struct kobj_attribute damon_sysfs_weights_age_attr =
+ __ATTR_RW_MODE(age_permil, 0600);
+
+static struct attribute *damon_sysfs_weights_attrs[] = {
+ &damon_sysfs_weights_sz_attr.attr,
+ &damon_sysfs_weights_nr_accesses_attr.attr,
+ &damon_sysfs_weights_age_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_weights);
+
+static struct kobj_type damon_sysfs_weights_ktype = {
+ .release = damon_sysfs_weights_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_weights_groups,
+};
+
+/*
+ * quotas directory
+ */
+
+struct damon_sysfs_quotas {
+ struct kobject kobj;
+ struct damon_sysfs_weights *weights;
+ unsigned long ms;
+ unsigned long sz;
+ unsigned long reset_interval_ms;
+};
+
+static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL);
+}
+
+static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas)
+{
+ struct damon_sysfs_weights *weights;
+ int err;
+
+ weights = damon_sysfs_weights_alloc(0, 0, 0);
+ if (!weights)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype,
+ &quotas->kobj, "weights");
+ if (err)
+ kobject_put(&weights->kobj);
+ else
+ quotas->weights = weights;
+ return err;
+}
+
+static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas)
+{
+ kobject_put(&quotas->weights->kobj);
+}
+
+static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+
+ return sysfs_emit(buf, "%lu\n", quotas->ms);
+}
+
+static ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+ int err = kstrtoul(buf, 0, &quotas->ms);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+
+ return sysfs_emit(buf, "%lu\n", quotas->sz);
+}
+
+static ssize_t bytes_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+ int err = kstrtoul(buf, 0, &quotas->sz);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t reset_interval_ms_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+
+ return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms);
+}
+
+static ssize_t reset_interval_ms_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+ int err = kstrtoul(buf, 0, &quotas->reset_interval_ms);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static void damon_sysfs_quotas_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_quotas, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_quotas_ms_attr =
+ __ATTR_RW_MODE(ms, 0600);
+
+static struct kobj_attribute damon_sysfs_quotas_sz_attr =
+ __ATTR_RW_MODE(bytes, 0600);
+
+static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr =
+ __ATTR_RW_MODE(reset_interval_ms, 0600);
+
+static struct attribute *damon_sysfs_quotas_attrs[] = {
+ &damon_sysfs_quotas_ms_attr.attr,
+ &damon_sysfs_quotas_sz_attr.attr,
+ &damon_sysfs_quotas_reset_interval_ms_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_quotas);
+
+static struct kobj_type damon_sysfs_quotas_ktype = {
+ .release = damon_sysfs_quotas_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_quotas_groups,
+};
+
+/*
+ * access_pattern directory
+ */
+
+struct damon_sysfs_access_pattern {
+ struct kobject kobj;
+ struct damon_sysfs_ul_range *sz;
+ struct damon_sysfs_ul_range *nr_accesses;
+ struct damon_sysfs_ul_range *age;
+};
+
+static
+struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void)
+{
+ struct damon_sysfs_access_pattern *access_pattern =
+ kmalloc(sizeof(*access_pattern), GFP_KERNEL);
+
+ if (!access_pattern)
+ return NULL;
+ access_pattern->kobj = (struct kobject){};
+ return access_pattern;
+}
+
+static int damon_sysfs_access_pattern_add_range_dir(
+ struct damon_sysfs_access_pattern *access_pattern,
+ struct damon_sysfs_ul_range **range_dir_ptr,
+ char *name)
+{
+ struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0);
+ int err;
+
+ if (!range)
+ return -ENOMEM;
+ err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype,
+ &access_pattern->kobj, name);
+ if (err)
+ kobject_put(&range->kobj);
+ else
+ *range_dir_ptr = range;
+ return err;
+}
+
+static int damon_sysfs_access_pattern_add_dirs(
+ struct damon_sysfs_access_pattern *access_pattern)
+{
+ int err;
+
+ err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
+ &access_pattern->sz, "sz");
+ if (err)
+ goto put_sz_out;
+
+ err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
+ &access_pattern->nr_accesses, "nr_accesses");
+ if (err)
+ goto put_nr_accesses_sz_out;
+
+ err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
+ &access_pattern->age, "age");
+ if (err)
+ goto put_age_nr_accesses_sz_out;
+ return 0;
+
+put_age_nr_accesses_sz_out:
+ kobject_put(&access_pattern->age->kobj);
+ access_pattern->age = NULL;
+put_nr_accesses_sz_out:
+ kobject_put(&access_pattern->nr_accesses->kobj);
+ access_pattern->nr_accesses = NULL;
+put_sz_out:
+ kobject_put(&access_pattern->sz->kobj);
+ access_pattern->sz = NULL;
+ return err;
+}
+
+static void damon_sysfs_access_pattern_rm_dirs(
+ struct damon_sysfs_access_pattern *access_pattern)
+{
+ kobject_put(&access_pattern->sz->kobj);
+ kobject_put(&access_pattern->nr_accesses->kobj);
+ kobject_put(&access_pattern->age->kobj);
+}
+
+static void damon_sysfs_access_pattern_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj));
+}
+
+static struct attribute *damon_sysfs_access_pattern_attrs[] = {
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_access_pattern);
+
+static struct kobj_type damon_sysfs_access_pattern_ktype = {
+ .release = damon_sysfs_access_pattern_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_access_pattern_groups,
+};
+
+/*
+ * scheme directory
+ */
+
+struct damon_sysfs_scheme {
+ struct kobject kobj;
+ enum damos_action action;
+ struct damon_sysfs_access_pattern *access_pattern;
+ struct damon_sysfs_quotas *quotas;
+ struct damon_sysfs_watermarks *watermarks;
+ struct damon_sysfs_stats *stats;
+ struct damon_sysfs_scheme_regions *tried_regions;
+};
+
+/* This should match with enum damos_action */
+static const char * const damon_sysfs_damos_action_strs[] = {
+ "willneed",
+ "cold",
+ "pageout",
+ "hugepage",
+ "nohugepage",
+ "lru_prio",
+ "lru_deprio",
+ "stat",
+};
+
+static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc(
+ enum damos_action action)
+{
+ struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme),
+ GFP_KERNEL);
+
+ if (!scheme)
+ return NULL;
+ scheme->kobj = (struct kobject){};
+ scheme->action = action;
+ return scheme;
+}
+
+static int damon_sysfs_scheme_set_access_pattern(
+ struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_access_pattern *access_pattern;
+ int err;
+
+ access_pattern = damon_sysfs_access_pattern_alloc();
+ if (!access_pattern)
+ return -ENOMEM;
+ err = kobject_init_and_add(&access_pattern->kobj,
+ &damon_sysfs_access_pattern_ktype, &scheme->kobj,
+ "access_pattern");
+ if (err)
+ goto out;
+ err = damon_sysfs_access_pattern_add_dirs(access_pattern);
+ if (err)
+ goto out;
+ scheme->access_pattern = access_pattern;
+ return 0;
+
+out:
+ kobject_put(&access_pattern->kobj);
+ return err;
+}
+
+static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc();
+ int err;
+
+ if (!quotas)
+ return -ENOMEM;
+ err = kobject_init_and_add(&quotas->kobj, &damon_sysfs_quotas_ktype,
+ &scheme->kobj, "quotas");
+ if (err)
+ goto out;
+ err = damon_sysfs_quotas_add_dirs(quotas);
+ if (err)
+ goto out;
+ scheme->quotas = quotas;
+ return 0;
+
+out:
+ kobject_put(&quotas->kobj);
+ return err;
+}
+
+static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_watermarks *watermarks =
+ damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0);
+ int err;
+
+ if (!watermarks)
+ return -ENOMEM;
+ err = kobject_init_and_add(&watermarks->kobj,
+ &damon_sysfs_watermarks_ktype, &scheme->kobj,
+ "watermarks");
+ if (err)
+ kobject_put(&watermarks->kobj);
+ else
+ scheme->watermarks = watermarks;
+ return err;
+}
+
+static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc();
+ int err;
+
+ if (!stats)
+ return -ENOMEM;
+ err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype,
+ &scheme->kobj, "stats");
+ if (err)
+ kobject_put(&stats->kobj);
+ else
+ scheme->stats = stats;
+ return err;
+}
+
+static int damon_sysfs_scheme_set_tried_regions(
+ struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_scheme_regions *tried_regions =
+ damon_sysfs_scheme_regions_alloc();
+ int err;
+
+ if (!tried_regions)
+ return -ENOMEM;
+ err = kobject_init_and_add(&tried_regions->kobj,
+ &damon_sysfs_scheme_regions_ktype, &scheme->kobj,
+ "tried_regions");
+ if (err)
+ kobject_put(&tried_regions->kobj);
+ else
+ scheme->tried_regions = tried_regions;
+ return err;
+}
+
+static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
+{
+ int err;
+
+ err = damon_sysfs_scheme_set_access_pattern(scheme);
+ if (err)
+ return err;
+ err = damon_sysfs_scheme_set_quotas(scheme);
+ if (err)
+ goto put_access_pattern_out;
+ err = damon_sysfs_scheme_set_watermarks(scheme);
+ if (err)
+ goto put_quotas_access_pattern_out;
+ err = damon_sysfs_scheme_set_stats(scheme);
+ if (err)
+ goto put_watermarks_quotas_access_pattern_out;
+ err = damon_sysfs_scheme_set_tried_regions(scheme);
+ if (err)
+ goto put_tried_regions_out;
+ return 0;
+
+put_tried_regions_out:
+ kobject_put(&scheme->tried_regions->kobj);
+ scheme->tried_regions = NULL;
+put_watermarks_quotas_access_pattern_out:
+ kobject_put(&scheme->watermarks->kobj);
+ scheme->watermarks = NULL;
+put_quotas_access_pattern_out:
+ kobject_put(&scheme->quotas->kobj);
+ scheme->quotas = NULL;
+put_access_pattern_out:
+ kobject_put(&scheme->access_pattern->kobj);
+ scheme->access_pattern = NULL;
+ return err;
+}
+
+static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme)
+{
+ damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern);
+ kobject_put(&scheme->access_pattern->kobj);
+ damon_sysfs_quotas_rm_dirs(scheme->quotas);
+ kobject_put(&scheme->quotas->kobj);
+ kobject_put(&scheme->watermarks->kobj);
+ kobject_put(&scheme->stats->kobj);
+ damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions);
+ kobject_put(&scheme->tried_regions->kobj);
+}
+
+static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_scheme *scheme = container_of(kobj,
+ struct damon_sysfs_scheme, kobj);
+
+ return sysfs_emit(buf, "%s\n",
+ damon_sysfs_damos_action_strs[scheme->action]);
+}
+
+static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme *scheme = container_of(kobj,
+ struct damon_sysfs_scheme, kobj);
+ enum damos_action action;
+
+ for (action = 0; action < NR_DAMOS_ACTIONS; action++) {
+ if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) {
+ scheme->action = action;
+ return count;
+ }
+ }
+ return -EINVAL;
+}
+
+static void damon_sysfs_scheme_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_scheme, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_scheme_action_attr =
+ __ATTR_RW_MODE(action, 0600);
+
+static struct attribute *damon_sysfs_scheme_attrs[] = {
+ &damon_sysfs_scheme_action_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_scheme);
+
+static struct kobj_type damon_sysfs_scheme_ktype = {
+ .release = damon_sysfs_scheme_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_scheme_groups,
+};
+
+/*
+ * schemes directory
+ */
+
+struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL);
+}
+
+void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes)
+{
+ struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr;
+ int i;
+
+ for (i = 0; i < schemes->nr; i++) {
+ damon_sysfs_scheme_rm_dirs(schemes_arr[i]);
+ kobject_put(&schemes_arr[i]->kobj);
+ }
+ schemes->nr = 0;
+ kfree(schemes_arr);
+ schemes->schemes_arr = NULL;
+}
+
+static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes,
+ int nr_schemes)
+{
+ struct damon_sysfs_scheme **schemes_arr, *scheme;
+ int err, i;
+
+ damon_sysfs_schemes_rm_dirs(schemes);
+ if (!nr_schemes)
+ return 0;
+
+ schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!schemes_arr)
+ return -ENOMEM;
+ schemes->schemes_arr = schemes_arr;
+
+ for (i = 0; i < nr_schemes; i++) {
+ scheme = damon_sysfs_scheme_alloc(DAMOS_STAT);
+ if (!scheme) {
+ damon_sysfs_schemes_rm_dirs(schemes);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&scheme->kobj,
+ &damon_sysfs_scheme_ktype, &schemes->kobj,
+ "%d", i);
+ if (err)
+ goto out;
+ err = damon_sysfs_scheme_add_dirs(scheme);
+ if (err)
+ goto out;
+
+ schemes_arr[i] = scheme;
+ schemes->nr++;
+ }
+ return 0;
+
+out:
+ damon_sysfs_schemes_rm_dirs(schemes);
+ kobject_put(&scheme->kobj);
+ return err;
+}
+
+static ssize_t nr_schemes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_schemes *schemes = container_of(kobj,
+ struct damon_sysfs_schemes, kobj);
+
+ return sysfs_emit(buf, "%d\n", schemes->nr);
+}
+
+static ssize_t nr_schemes_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_schemes *schemes;
+ int nr, err = kstrtoint(buf, 0, &nr);
+
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ schemes = container_of(kobj, struct damon_sysfs_schemes, kobj);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_schemes_add_dirs(schemes, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+ return count;
+}
+
+static void damon_sysfs_schemes_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_schemes, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_schemes_nr_attr =
+ __ATTR_RW_MODE(nr_schemes, 0600);
+
+static struct attribute *damon_sysfs_schemes_attrs[] = {
+ &damon_sysfs_schemes_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_schemes);
+
+struct kobj_type damon_sysfs_schemes_ktype = {
+ .release = damon_sysfs_schemes_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_schemes_groups,
+};
+
+static struct damos *damon_sysfs_mk_scheme(
+ struct damon_sysfs_scheme *sysfs_scheme)
+{
+ struct damon_sysfs_access_pattern *access_pattern =
+ sysfs_scheme->access_pattern;
+ struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
+ struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
+ struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+
+ struct damos_access_pattern pattern = {
+ .min_sz_region = access_pattern->sz->min,
+ .max_sz_region = access_pattern->sz->max,
+ .min_nr_accesses = access_pattern->nr_accesses->min,
+ .max_nr_accesses = access_pattern->nr_accesses->max,
+ .min_age_region = access_pattern->age->min,
+ .max_age_region = access_pattern->age->max,
+ };
+ struct damos_quota quota = {
+ .ms = sysfs_quotas->ms,
+ .sz = sysfs_quotas->sz,
+ .reset_interval = sysfs_quotas->reset_interval_ms,
+ .weight_sz = sysfs_weights->sz,
+ .weight_nr_accesses = sysfs_weights->nr_accesses,
+ .weight_age = sysfs_weights->age,
+ };
+ struct damos_watermarks wmarks = {
+ .metric = sysfs_wmarks->metric,
+ .interval = sysfs_wmarks->interval_us,
+ .high = sysfs_wmarks->high,
+ .mid = sysfs_wmarks->mid,
+ .low = sysfs_wmarks->low,
+ };
+
+ return damon_new_scheme(&pattern, sysfs_scheme->action, &quota,
+ &wmarks);
+}
+
+static void damon_sysfs_update_scheme(struct damos *scheme,
+ struct damon_sysfs_scheme *sysfs_scheme)
+{
+ struct damon_sysfs_access_pattern *access_pattern =
+ sysfs_scheme->access_pattern;
+ struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
+ struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
+ struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+
+ scheme->pattern.min_sz_region = access_pattern->sz->min;
+ scheme->pattern.max_sz_region = access_pattern->sz->max;
+ scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min;
+ scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max;
+ scheme->pattern.min_age_region = access_pattern->age->min;
+ scheme->pattern.max_age_region = access_pattern->age->max;
+
+ scheme->action = sysfs_scheme->action;
+
+ scheme->quota.ms = sysfs_quotas->ms;
+ scheme->quota.sz = sysfs_quotas->sz;
+ scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms;
+ scheme->quota.weight_sz = sysfs_weights->sz;
+ scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses;
+ scheme->quota.weight_age = sysfs_weights->age;
+
+ scheme->wmarks.metric = sysfs_wmarks->metric;
+ scheme->wmarks.interval = sysfs_wmarks->interval_us;
+ scheme->wmarks.high = sysfs_wmarks->high;
+ scheme->wmarks.mid = sysfs_wmarks->mid;
+ scheme->wmarks.low = sysfs_wmarks->low;
+}
+
+int damon_sysfs_set_schemes(struct damon_ctx *ctx,
+ struct damon_sysfs_schemes *sysfs_schemes)
+{
+ struct damos *scheme, *next;
+ int i = 0;
+
+ damon_for_each_scheme_safe(scheme, next, ctx) {
+ if (i < sysfs_schemes->nr)
+ damon_sysfs_update_scheme(scheme,
+ sysfs_schemes->schemes_arr[i]);
+ else
+ damon_destroy_scheme(scheme);
+ i++;
+ }
+
+ for (; i < sysfs_schemes->nr; i++) {
+ struct damos *scheme, *next;
+
+ scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]);
+ if (!scheme) {
+ damon_for_each_scheme_safe(scheme, next, ctx)
+ damon_destroy_scheme(scheme);
+ return -ENOMEM;
+ }
+ damon_add_scheme(ctx, scheme);
+ }
+ return 0;
+}
+
+void damon_sysfs_schemes_update_stats(
+ struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx)
+{
+ struct damos *scheme;
+ int schemes_idx = 0;
+
+ damon_for_each_scheme(scheme, ctx) {
+ struct damon_sysfs_stats *sysfs_stats;
+
+ /* user could have removed the scheme sysfs dir */
+ if (schemes_idx >= sysfs_schemes->nr)
+ break;
+
+ sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats;
+ sysfs_stats->nr_tried = scheme->stat.nr_tried;
+ sysfs_stats->sz_tried = scheme->stat.sz_tried;
+ sysfs_stats->nr_applied = scheme->stat.nr_applied;
+ sysfs_stats->sz_applied = scheme->stat.sz_applied;
+ sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds;
+ }
+}
+
+/*
+ * The damon_sysfs_schemes object whose schemes' tried_regions directories
+ * need updating.  Protected by damon_sysfs_lock.
+ */
+static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback;
+static int damon_sysfs_schemes_region_idx;
+
+/*
+ * DAMON callback that is called before damos apply.  While this callback is
+ * registered, damon_sysfs_lock should be held to ensure the regions
+ * directories exist.
+ */
+static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
+ struct damon_target *t, struct damon_region *r,
+ struct damos *s)
+{
+ struct damos *scheme;
+ struct damon_sysfs_scheme_regions *sysfs_regions;
+ struct damon_sysfs_scheme_region *region;
+ struct damon_sysfs_schemes *sysfs_schemes =
+ damon_sysfs_schemes_for_damos_callback;
+ int schemes_idx = 0;
+
+ damon_for_each_scheme(scheme, ctx) {
+ if (scheme == s)
+ break;
+ schemes_idx++;
+ }
+
+ /* user could have removed the scheme sysfs dir */
+ if (schemes_idx >= sysfs_schemes->nr)
+ return 0;
+
+ sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions;
+ region = damon_sysfs_scheme_region_alloc(r);
+ list_add_tail(&region->list, &sysfs_regions->regions_list);
+ sysfs_regions->nr_regions++;
+ if (kobject_init_and_add(&region->kobj,
+ &damon_sysfs_scheme_region_ktype,
+ &sysfs_regions->kobj, "%d",
+ damon_sysfs_schemes_region_idx++)) {
+ kobject_put(&region->kobj);
+ }
+ return 0;
+}
+
+/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
+int damon_sysfs_schemes_clear_regions(
+ struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx)
+{
+ struct damos *scheme;
+ int schemes_idx = 0;
+
+ damon_for_each_scheme(scheme, ctx) {
+ struct damon_sysfs_scheme *sysfs_scheme;
+
+ /* user could have removed the scheme sysfs dir */
+ if (schemes_idx >= sysfs_schemes->nr)
+ break;
+
+ sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++];
+ damon_sysfs_scheme_regions_rm_dirs(
+ sysfs_scheme->tried_regions);
+ }
+ return 0;
+}
+
+/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
+int damon_sysfs_schemes_update_regions_start(
+ struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx)
+{
+ damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx);
+ damon_sysfs_schemes_for_damos_callback = sysfs_schemes;
+ ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply;
+ return 0;
+}
+
+/*
+ * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller
+ * should unlock damon_sysfs_lock, which has been held since before
+ * damon_sysfs_schemes_update_regions_start().
+ */
+int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx)
+{
+ damon_sysfs_schemes_for_damos_callback = NULL;
+ ctx->callback.before_damos_apply = NULL;
+ damon_sysfs_schemes_region_idx = 0;
+ return 0;
+}
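[Editor's aside] The comments above describe a small protocol: damon_sysfs_schemes_update_regions_start() installs the before_damos_apply callback while damon_sysfs_lock is held, the callback then creates one numbered directory under tried_regions/ for every region a scheme is about to be applied to, and damon_sysfs_schemes_update_regions_stop() removes the callback before the lock is finally released. As a minimal illustrative sketch (not part of this patch), a caller such as damon_sysfs_cmd_request_callback() would be expected to drive it roughly as below. It assumes the declarations from the new sysfs-common.h, and kick_and_wait_for_damos_apply() is a hypothetical placeholder for the real "wait until DAMON applied the schemes once" logic in sysfs.c, which is not shown in these hunks:

/* Illustrative sketch only -- not part of this patch. */
static int example_update_schemes_tried_regions(struct damon_ctx *ctx,
		struct damon_sysfs_schemes *sysfs_schemes)
{
	int err;

	/* hold damon_sysfs_lock so the tried_regions dirs cannot vanish */
	mutex_lock(&damon_sysfs_lock);

	err = damon_sysfs_schemes_update_regions_start(sysfs_schemes, ctx);
	if (err)
		goto out;

	/*
	 * Let DAMON apply the schemes once; damon_sysfs_before_damos_apply()
	 * fills tried_regions/<N>/ for each region the schemes try.
	 */
	kick_and_wait_for_damos_apply(ctx);	/* hypothetical placeholder */

	err = damon_sysfs_schemes_update_regions_stop(ctx);
out:
	mutex_unlock(&damon_sysfs_lock);
	return err;
}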
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 7488e27c87c3..aeb0beb1da91 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
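[Editor's aside] The hunks below delete the scheme-directory code from sysfs.c (it now lives in sysfs-schemes.c, above) and pull the shared pieces in via the new sysfs-common.h instead. That header's content is not shown in this diff, so the following is only a rough, hypothetical sketch of what it presumably declares, inferred from the symbols sysfs-schemes.c exports above and the ones the remaining sysfs.c code keeps using (damon_sysfs_lock, the ul_range directory, and the schemes directory entry points); the real mm/damon/sysfs-common.h may differ in detail:

/* Hypothetical reconstruction of mm/damon/sysfs-common.h -- illustrative only. */
#include <linux/damon.h>
#include <linux/kobject.h>

extern struct mutex damon_sysfs_lock;

/* unsigned long range directory (e.g. nr_regions and access_pattern ranges) */
struct damon_sysfs_ul_range {
	struct kobject kobj;
	unsigned long min;
	unsigned long max;
};

struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc(unsigned long min,
		unsigned long max);
extern struct kobj_type damon_sysfs_ul_range_ktype;

/* schemes directory, implemented in sysfs-schemes.c */
struct damon_sysfs_schemes {
	struct kobject kobj;
	struct damon_sysfs_scheme **schemes_arr;
	int nr;
};

struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void);
void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes);
extern struct kobj_type damon_sysfs_schemes_ktype;

int damon_sysfs_set_schemes(struct damon_ctx *ctx,
		struct damon_sysfs_schemes *sysfs_schemes);
void damon_sysfs_schemes_update_stats(
		struct damon_sysfs_schemes *sysfs_schemes,
		struct damon_ctx *ctx);
int damon_sysfs_schemes_clear_regions(
		struct damon_sysfs_schemes *sysfs_schemes,
		struct damon_ctx *ctx);
int damon_sysfs_schemes_update_regions_start(
		struct damon_sysfs_schemes *sysfs_schemes,
		struct damon_ctx *ctx);
int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx);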
@@ -5,1069 +5,11 @@
* Copyright (c) 2022 SeongJae Park <sj@kernel.org>
*/
-#include <linux/damon.h>
-#include <linux/kobject.h>
#include <linux/pid.h>
#include <linux/sched.h>
#include <linux/slab.h>
-static DEFINE_MUTEX(damon_sysfs_lock);
-
-/*
- * unsigned long range directory
- */
-
-struct damon_sysfs_ul_range {
- struct kobject kobj;
- unsigned long min;
- unsigned long max;
-};
-
-static struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc(
- unsigned long min,
- unsigned long max)
-{
- struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range),
- GFP_KERNEL);
-
- if (!range)
- return NULL;
- range->kobj = (struct kobject){};
- range->min = min;
- range->max = max;
-
- return range;
-}
-
-static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr,
- char *buf)
-{
- struct damon_sysfs_ul_range *range = container_of(kobj,
- struct damon_sysfs_ul_range, kobj);
-
- return sysfs_emit(buf, "%lu\n", range->min);
-}
-
-static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- struct damon_sysfs_ul_range *range = container_of(kobj,
- struct damon_sysfs_ul_range, kobj);
- unsigned long min;
- int err;
-
- err = kstrtoul(buf, 0, &min);
- if (err)
- return -EINVAL;
-
- range->min = min;
- return count;
-}
-
-static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr,
- char *buf)
-{
- struct damon_sysfs_ul_range *range = container_of(kobj,
- struct damon_sysfs_ul_range, kobj);
-
- return sysfs_emit(buf, "%lu\n", range->max);
-}
-
-static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- struct damon_sysfs_ul_range *range = container_of(kobj,
- struct damon_sysfs_ul_range, kobj);
- unsigned long max;
- int err;
-
- err = kstrtoul(buf, 0, &max);
- if (err)
- return -EINVAL;
-
- range->max = max;
- return count;
-}
-
-static void damon_sysfs_ul_range_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj));
-}
-
-static struct kobj_attribute damon_sysfs_ul_range_min_attr =
- __ATTR_RW_MODE(min, 0600);
-
-static struct kobj_attribute damon_sysfs_ul_range_max_attr =
- __ATTR_RW_MODE(max, 0600);
-
-static struct attribute *damon_sysfs_ul_range_attrs[] = {
- &damon_sysfs_ul_range_min_attr.attr,
- &damon_sysfs_ul_range_max_attr.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_ul_range);
-
-static struct kobj_type damon_sysfs_ul_range_ktype = {
- .release = damon_sysfs_ul_range_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_ul_range_groups,
-};
-
-/*
- * schemes/stats directory
- */
-
-struct damon_sysfs_stats {
- struct kobject kobj;
- unsigned long nr_tried;
- unsigned long sz_tried;
- unsigned long nr_applied;
- unsigned long sz_applied;
- unsigned long qt_exceeds;
-};
-
-static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void)
-{
- return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL);
-}
-
-static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr,
- char *buf)
-{
- struct damon_sysfs_stats *stats = container_of(kobj,
- struct damon_sysfs_stats, kobj);
-
- return sysfs_emit(buf, "%lu\n", stats->nr_tried);
-}
-
-static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr,
- char *buf)
-{
- struct damon_sysfs_stats *stats = container_of(kobj,
- struct damon_sysfs_stats, kobj);
-
- return sysfs_emit(buf, "%lu\n", stats->sz_tried);
-}
-
-static ssize_t nr_applied_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_stats *stats = container_of(kobj,
- struct damon_sysfs_stats, kobj);
-
- return sysfs_emit(buf, "%lu\n", stats->nr_applied);
-}
-
-static ssize_t sz_applied_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_stats *stats = container_of(kobj,
- struct damon_sysfs_stats, kobj);
-
- return sysfs_emit(buf, "%lu\n", stats->sz_applied);
-}
-
-static ssize_t qt_exceeds_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_stats *stats = container_of(kobj,
- struct damon_sysfs_stats, kobj);
-
- return sysfs_emit(buf, "%lu\n", stats->qt_exceeds);
-}
-
-static void damon_sysfs_stats_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_stats, kobj));
-}
-
-static struct kobj_attribute damon_sysfs_stats_nr_tried_attr =
- __ATTR_RO_MODE(nr_tried, 0400);
-
-static struct kobj_attribute damon_sysfs_stats_sz_tried_attr =
- __ATTR_RO_MODE(sz_tried, 0400);
-
-static struct kobj_attribute damon_sysfs_stats_nr_applied_attr =
- __ATTR_RO_MODE(nr_applied, 0400);
-
-static struct kobj_attribute damon_sysfs_stats_sz_applied_attr =
- __ATTR_RO_MODE(sz_applied, 0400);
-
-static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr =
- __ATTR_RO_MODE(qt_exceeds, 0400);
-
-static struct attribute *damon_sysfs_stats_attrs[] = {
- &damon_sysfs_stats_nr_tried_attr.attr,
- &damon_sysfs_stats_sz_tried_attr.attr,
- &damon_sysfs_stats_nr_applied_attr.attr,
- &damon_sysfs_stats_sz_applied_attr.attr,
- &damon_sysfs_stats_qt_exceeds_attr.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_stats);
-
-static struct kobj_type damon_sysfs_stats_ktype = {
- .release = damon_sysfs_stats_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_stats_groups,
-};
-
-/*
- * watermarks directory
- */
-
-struct damon_sysfs_watermarks {
- struct kobject kobj;
- enum damos_wmark_metric metric;
- unsigned long interval_us;
- unsigned long high;
- unsigned long mid;
- unsigned long low;
-};
-
-static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc(
- enum damos_wmark_metric metric, unsigned long interval_us,
- unsigned long high, unsigned long mid, unsigned long low)
-{
- struct damon_sysfs_watermarks *watermarks = kmalloc(
- sizeof(*watermarks), GFP_KERNEL);
-
- if (!watermarks)
- return NULL;
- watermarks->kobj = (struct kobject){};
- watermarks->metric = metric;
- watermarks->interval_us = interval_us;
- watermarks->high = high;
- watermarks->mid = mid;
- watermarks->low = low;
- return watermarks;
-}
-
-/* Should match with enum damos_wmark_metric */
-static const char * const damon_sysfs_wmark_metric_strs[] = {
- "none",
- "free_mem_rate",
-};
-
-static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr,
- char *buf)
-{
- struct damon_sysfs_watermarks *watermarks = container_of(kobj,
- struct damon_sysfs_watermarks, kobj);
-
- return sysfs_emit(buf, "%s\n",
- damon_sysfs_wmark_metric_strs[watermarks->metric]);
-}
-
-static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- struct damon_sysfs_watermarks *watermarks = container_of(kobj,
- struct damon_sysfs_watermarks, kobj);
- enum damos_wmark_metric metric;
-
- for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) {
- if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) {
- watermarks->metric = metric;
- return count;
- }
- }
- return -EINVAL;
-}
-
-static ssize_t interval_us_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_watermarks *watermarks = container_of(kobj,
- struct damon_sysfs_watermarks, kobj);
-
- return sysfs_emit(buf, "%lu\n", watermarks->interval_us);
-}
-
-static ssize_t interval_us_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_watermarks *watermarks = container_of(kobj,
- struct damon_sysfs_watermarks, kobj);
- int err = kstrtoul(buf, 0, &watermarks->interval_us);
-
- if (err)
- return -EINVAL;
- return count;
-}
-
-static ssize_t high_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_watermarks *watermarks = container_of(kobj,
- struct damon_sysfs_watermarks, kobj);
-
- return sysfs_emit(buf, "%lu\n", watermarks->high);
-}
-
-static ssize_t high_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_watermarks *watermarks = container_of(kobj,
- struct damon_sysfs_watermarks, kobj);
- int err = kstrtoul(buf, 0, &watermarks->high);
-
- if (err)
- return -EINVAL;
- return count;
-}
-
-static ssize_t mid_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_watermarks *watermarks = container_of(kobj,
- struct damon_sysfs_watermarks, kobj);
-
- return sysfs_emit(buf, "%lu\n", watermarks->mid);
-}
-
-static ssize_t mid_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_watermarks *watermarks = container_of(kobj,
- struct damon_sysfs_watermarks, kobj);
- int err = kstrtoul(buf, 0, &watermarks->mid);
-
- if (err)
- return -EINVAL;
- return count;
-}
-
-static ssize_t low_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_watermarks *watermarks = container_of(kobj,
- struct damon_sysfs_watermarks, kobj);
-
- return sysfs_emit(buf, "%lu\n", watermarks->low);
-}
-
-static ssize_t low_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_watermarks *watermarks = container_of(kobj,
- struct damon_sysfs_watermarks, kobj);
- int err = kstrtoul(buf, 0, &watermarks->low);
-
- if (err)
- return -EINVAL;
- return count;
-}
-
-static void damon_sysfs_watermarks_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj));
-}
-
-static struct kobj_attribute damon_sysfs_watermarks_metric_attr =
- __ATTR_RW_MODE(metric, 0600);
-
-static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr =
- __ATTR_RW_MODE(interval_us, 0600);
-
-static struct kobj_attribute damon_sysfs_watermarks_high_attr =
- __ATTR_RW_MODE(high, 0600);
-
-static struct kobj_attribute damon_sysfs_watermarks_mid_attr =
- __ATTR_RW_MODE(mid, 0600);
-
-static struct kobj_attribute damon_sysfs_watermarks_low_attr =
- __ATTR_RW_MODE(low, 0600);
-
-static struct attribute *damon_sysfs_watermarks_attrs[] = {
- &damon_sysfs_watermarks_metric_attr.attr,
- &damon_sysfs_watermarks_interval_us_attr.attr,
- &damon_sysfs_watermarks_high_attr.attr,
- &damon_sysfs_watermarks_mid_attr.attr,
- &damon_sysfs_watermarks_low_attr.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_watermarks);
-
-static struct kobj_type damon_sysfs_watermarks_ktype = {
- .release = damon_sysfs_watermarks_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_watermarks_groups,
-};
-
-/*
- * scheme/weights directory
- */
-
-struct damon_sysfs_weights {
- struct kobject kobj;
- unsigned int sz;
- unsigned int nr_accesses;
- unsigned int age;
-};
-
-static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz,
- unsigned int nr_accesses, unsigned int age)
-{
- struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights),
- GFP_KERNEL);
-
- if (!weights)
- return NULL;
- weights->kobj = (struct kobject){};
- weights->sz = sz;
- weights->nr_accesses = nr_accesses;
- weights->age = age;
- return weights;
-}
-
-static ssize_t sz_permil_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_weights *weights = container_of(kobj,
- struct damon_sysfs_weights, kobj);
-
- return sysfs_emit(buf, "%u\n", weights->sz);
-}
-
-static ssize_t sz_permil_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_weights *weights = container_of(kobj,
- struct damon_sysfs_weights, kobj);
- int err = kstrtouint(buf, 0, &weights->sz);
-
- if (err)
- return -EINVAL;
- return count;
-}
-
-static ssize_t nr_accesses_permil_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_weights *weights = container_of(kobj,
- struct damon_sysfs_weights, kobj);
-
- return sysfs_emit(buf, "%u\n", weights->nr_accesses);
-}
-
-static ssize_t nr_accesses_permil_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_weights *weights = container_of(kobj,
- struct damon_sysfs_weights, kobj);
- int err = kstrtouint(buf, 0, &weights->nr_accesses);
-
- if (err)
- return -EINVAL;
- return count;
-}
-
-static ssize_t age_permil_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_weights *weights = container_of(kobj,
- struct damon_sysfs_weights, kobj);
-
- return sysfs_emit(buf, "%u\n", weights->age);
-}
-
-static ssize_t age_permil_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_weights *weights = container_of(kobj,
- struct damon_sysfs_weights, kobj);
- int err = kstrtouint(buf, 0, &weights->age);
-
- if (err)
- return -EINVAL;
- return count;
-}
-
-static void damon_sysfs_weights_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_weights, kobj));
-}
-
-static struct kobj_attribute damon_sysfs_weights_sz_attr =
- __ATTR_RW_MODE(sz_permil, 0600);
-
-static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr =
- __ATTR_RW_MODE(nr_accesses_permil, 0600);
-
-static struct kobj_attribute damon_sysfs_weights_age_attr =
- __ATTR_RW_MODE(age_permil, 0600);
-
-static struct attribute *damon_sysfs_weights_attrs[] = {
- &damon_sysfs_weights_sz_attr.attr,
- &damon_sysfs_weights_nr_accesses_attr.attr,
- &damon_sysfs_weights_age_attr.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_weights);
-
-static struct kobj_type damon_sysfs_weights_ktype = {
- .release = damon_sysfs_weights_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_weights_groups,
-};
-
-/*
- * quotas directory
- */
-
-struct damon_sysfs_quotas {
- struct kobject kobj;
- struct damon_sysfs_weights *weights;
- unsigned long ms;
- unsigned long sz;
- unsigned long reset_interval_ms;
-};
-
-static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void)
-{
- return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL);
-}
-
-static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas)
-{
- struct damon_sysfs_weights *weights;
- int err;
-
- weights = damon_sysfs_weights_alloc(0, 0, 0);
- if (!weights)
- return -ENOMEM;
-
- err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype,
- &quotas->kobj, "weights");
- if (err)
- kobject_put(&weights->kobj);
- else
- quotas->weights = weights;
- return err;
-}
-
-static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas)
-{
- kobject_put(&quotas->weights->kobj);
-}
-
-static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr,
- char *buf)
-{
- struct damon_sysfs_quotas *quotas = container_of(kobj,
- struct damon_sysfs_quotas, kobj);
-
- return sysfs_emit(buf, "%lu\n", quotas->ms);
-}
-
-static ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- struct damon_sysfs_quotas *quotas = container_of(kobj,
- struct damon_sysfs_quotas, kobj);
- int err = kstrtoul(buf, 0, &quotas->ms);
-
- if (err)
- return -EINVAL;
- return count;
-}
-
-static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr,
- char *buf)
-{
- struct damon_sysfs_quotas *quotas = container_of(kobj,
- struct damon_sysfs_quotas, kobj);
-
- return sysfs_emit(buf, "%lu\n", quotas->sz);
-}
-
-static ssize_t bytes_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_quotas *quotas = container_of(kobj,
- struct damon_sysfs_quotas, kobj);
- int err = kstrtoul(buf, 0, &quotas->sz);
-
- if (err)
- return -EINVAL;
- return count;
-}
-
-static ssize_t reset_interval_ms_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_quotas *quotas = container_of(kobj,
- struct damon_sysfs_quotas, kobj);
-
- return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms);
-}
-
-static ssize_t reset_interval_ms_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_quotas *quotas = container_of(kobj,
- struct damon_sysfs_quotas, kobj);
- int err = kstrtoul(buf, 0, &quotas->reset_interval_ms);
-
- if (err)
- return -EINVAL;
- return count;
-}
-
-static void damon_sysfs_quotas_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_quotas, kobj));
-}
-
-static struct kobj_attribute damon_sysfs_quotas_ms_attr =
- __ATTR_RW_MODE(ms, 0600);
-
-static struct kobj_attribute damon_sysfs_quotas_sz_attr =
- __ATTR_RW_MODE(bytes, 0600);
-
-static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr =
- __ATTR_RW_MODE(reset_interval_ms, 0600);
-
-static struct attribute *damon_sysfs_quotas_attrs[] = {
- &damon_sysfs_quotas_ms_attr.attr,
- &damon_sysfs_quotas_sz_attr.attr,
- &damon_sysfs_quotas_reset_interval_ms_attr.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_quotas);
-
-static struct kobj_type damon_sysfs_quotas_ktype = {
- .release = damon_sysfs_quotas_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_quotas_groups,
-};
-
-/*
- * access_pattern directory
- */
-
-struct damon_sysfs_access_pattern {
- struct kobject kobj;
- struct damon_sysfs_ul_range *sz;
- struct damon_sysfs_ul_range *nr_accesses;
- struct damon_sysfs_ul_range *age;
-};
-
-static
-struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void)
-{
- struct damon_sysfs_access_pattern *access_pattern =
- kmalloc(sizeof(*access_pattern), GFP_KERNEL);
-
- if (!access_pattern)
- return NULL;
- access_pattern->kobj = (struct kobject){};
- return access_pattern;
-}
-
-static int damon_sysfs_access_pattern_add_range_dir(
- struct damon_sysfs_access_pattern *access_pattern,
- struct damon_sysfs_ul_range **range_dir_ptr,
- char *name)
-{
- struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0);
- int err;
-
- if (!range)
- return -ENOMEM;
- err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype,
- &access_pattern->kobj, name);
- if (err)
- kobject_put(&range->kobj);
- else
- *range_dir_ptr = range;
- return err;
-}
-
-static int damon_sysfs_access_pattern_add_dirs(
- struct damon_sysfs_access_pattern *access_pattern)
-{
- int err;
-
- err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
- &access_pattern->sz, "sz");
- if (err)
- goto put_sz_out;
-
- err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
- &access_pattern->nr_accesses, "nr_accesses");
- if (err)
- goto put_nr_accesses_sz_out;
-
- err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
- &access_pattern->age, "age");
- if (err)
- goto put_age_nr_accesses_sz_out;
- return 0;
-
-put_age_nr_accesses_sz_out:
- kobject_put(&access_pattern->age->kobj);
- access_pattern->age = NULL;
-put_nr_accesses_sz_out:
- kobject_put(&access_pattern->nr_accesses->kobj);
- access_pattern->nr_accesses = NULL;
-put_sz_out:
- kobject_put(&access_pattern->sz->kobj);
- access_pattern->sz = NULL;
- return err;
-}
-
-static void damon_sysfs_access_pattern_rm_dirs(
- struct damon_sysfs_access_pattern *access_pattern)
-{
- kobject_put(&access_pattern->sz->kobj);
- kobject_put(&access_pattern->nr_accesses->kobj);
- kobject_put(&access_pattern->age->kobj);
-}
-
-static void damon_sysfs_access_pattern_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj));
-}
-
-static struct attribute *damon_sysfs_access_pattern_attrs[] = {
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_access_pattern);
-
-static struct kobj_type damon_sysfs_access_pattern_ktype = {
- .release = damon_sysfs_access_pattern_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_access_pattern_groups,
-};
-
-/*
- * scheme directory
- */
-
-struct damon_sysfs_scheme {
- struct kobject kobj;
- enum damos_action action;
- struct damon_sysfs_access_pattern *access_pattern;
- struct damon_sysfs_quotas *quotas;
- struct damon_sysfs_watermarks *watermarks;
- struct damon_sysfs_stats *stats;
-};
-
-/* This should match with enum damos_action */
-static const char * const damon_sysfs_damos_action_strs[] = {
- "willneed",
- "cold",
- "pageout",
- "hugepage",
- "nohugepage",
- "lru_prio",
- "lru_deprio",
- "stat",
-};
-
-static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc(
- enum damos_action action)
-{
- struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme),
- GFP_KERNEL);
-
- if (!scheme)
- return NULL;
- scheme->kobj = (struct kobject){};
- scheme->action = action;
- return scheme;
-}
-
-static int damon_sysfs_scheme_set_access_pattern(
- struct damon_sysfs_scheme *scheme)
-{
- struct damon_sysfs_access_pattern *access_pattern;
- int err;
-
- access_pattern = damon_sysfs_access_pattern_alloc();
- if (!access_pattern)
- return -ENOMEM;
- err = kobject_init_and_add(&access_pattern->kobj,
- &damon_sysfs_access_pattern_ktype, &scheme->kobj,
- "access_pattern");
- if (err)
- goto out;
- err = damon_sysfs_access_pattern_add_dirs(access_pattern);
- if (err)
- goto out;
- scheme->access_pattern = access_pattern;
- return 0;
-
-out:
- kobject_put(&access_pattern->kobj);
- return err;
-}
-
-static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme)
-{
- struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc();
- int err;
-
- if (!quotas)
- return -ENOMEM;
- err = kobject_init_and_add(&quotas->kobj, &damon_sysfs_quotas_ktype,
- &scheme->kobj, "quotas");
- if (err)
- goto out;
- err = damon_sysfs_quotas_add_dirs(quotas);
- if (err)
- goto out;
- scheme->quotas = quotas;
- return 0;
-
-out:
- kobject_put(&quotas->kobj);
- return err;
-}
-
-static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme)
-{
- struct damon_sysfs_watermarks *watermarks =
- damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0);
- int err;
-
- if (!watermarks)
- return -ENOMEM;
- err = kobject_init_and_add(&watermarks->kobj,
- &damon_sysfs_watermarks_ktype, &scheme->kobj,
- "watermarks");
- if (err)
- kobject_put(&watermarks->kobj);
- else
- scheme->watermarks = watermarks;
- return err;
-}
-
-static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme)
-{
- struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc();
- int err;
-
- if (!stats)
- return -ENOMEM;
- err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype,
- &scheme->kobj, "stats");
- if (err)
- kobject_put(&stats->kobj);
- else
- scheme->stats = stats;
- return err;
-}
-
-static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
-{
- int err;
-
- err = damon_sysfs_scheme_set_access_pattern(scheme);
- if (err)
- return err;
- err = damon_sysfs_scheme_set_quotas(scheme);
- if (err)
- goto put_access_pattern_out;
- err = damon_sysfs_scheme_set_watermarks(scheme);
- if (err)
- goto put_quotas_access_pattern_out;
- err = damon_sysfs_scheme_set_stats(scheme);
- if (err)
- goto put_watermarks_quotas_access_pattern_out;
- return 0;
-
-put_watermarks_quotas_access_pattern_out:
- kobject_put(&scheme->watermarks->kobj);
- scheme->watermarks = NULL;
-put_quotas_access_pattern_out:
- kobject_put(&scheme->quotas->kobj);
- scheme->quotas = NULL;
-put_access_pattern_out:
- kobject_put(&scheme->access_pattern->kobj);
- scheme->access_pattern = NULL;
- return err;
-}
-
-static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme)
-{
- damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern);
- kobject_put(&scheme->access_pattern->kobj);
- damon_sysfs_quotas_rm_dirs(scheme->quotas);
- kobject_put(&scheme->quotas->kobj);
- kobject_put(&scheme->watermarks->kobj);
- kobject_put(&scheme->stats->kobj);
-}
-
-static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr,
- char *buf)
-{
- struct damon_sysfs_scheme *scheme = container_of(kobj,
- struct damon_sysfs_scheme, kobj);
-
- return sysfs_emit(buf, "%s\n",
- damon_sysfs_damos_action_strs[scheme->action]);
-}
-
-static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- struct damon_sysfs_scheme *scheme = container_of(kobj,
- struct damon_sysfs_scheme, kobj);
- enum damos_action action;
-
- for (action = 0; action < NR_DAMOS_ACTIONS; action++) {
- if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) {
- scheme->action = action;
- return count;
- }
- }
- return -EINVAL;
-}
-
-static void damon_sysfs_scheme_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_scheme, kobj));
-}
-
-static struct kobj_attribute damon_sysfs_scheme_action_attr =
- __ATTR_RW_MODE(action, 0600);
-
-static struct attribute *damon_sysfs_scheme_attrs[] = {
- &damon_sysfs_scheme_action_attr.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_scheme);
-
-static struct kobj_type damon_sysfs_scheme_ktype = {
- .release = damon_sysfs_scheme_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_scheme_groups,
-};
-
-/*
- * schemes directory
- */
-
-struct damon_sysfs_schemes {
- struct kobject kobj;
- struct damon_sysfs_scheme **schemes_arr;
- int nr;
-};
-
-static struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void)
-{
- return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL);
-}
-
-static void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes)
-{
- struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr;
- int i;
-
- for (i = 0; i < schemes->nr; i++) {
- damon_sysfs_scheme_rm_dirs(schemes_arr[i]);
- kobject_put(&schemes_arr[i]->kobj);
- }
- schemes->nr = 0;
- kfree(schemes_arr);
- schemes->schemes_arr = NULL;
-}
-
-static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes,
- int nr_schemes)
-{
- struct damon_sysfs_scheme **schemes_arr, *scheme;
- int err, i;
-
- damon_sysfs_schemes_rm_dirs(schemes);
- if (!nr_schemes)
- return 0;
-
- schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr),
- GFP_KERNEL | __GFP_NOWARN);
- if (!schemes_arr)
- return -ENOMEM;
- schemes->schemes_arr = schemes_arr;
-
- for (i = 0; i < nr_schemes; i++) {
- scheme = damon_sysfs_scheme_alloc(DAMOS_STAT);
- if (!scheme) {
- damon_sysfs_schemes_rm_dirs(schemes);
- return -ENOMEM;
- }
-
- err = kobject_init_and_add(&scheme->kobj,
- &damon_sysfs_scheme_ktype, &schemes->kobj,
- "%d", i);
- if (err)
- goto out;
- err = damon_sysfs_scheme_add_dirs(scheme);
- if (err)
- goto out;
-
- schemes_arr[i] = scheme;
- schemes->nr++;
- }
- return 0;
-
-out:
- damon_sysfs_schemes_rm_dirs(schemes);
- kobject_put(&scheme->kobj);
- return err;
-}
-
-static ssize_t nr_schemes_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_schemes *schemes = container_of(kobj,
- struct damon_sysfs_schemes, kobj);
-
- return sysfs_emit(buf, "%d\n", schemes->nr);
-}
-
-static ssize_t nr_schemes_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_schemes *schemes = container_of(kobj,
- struct damon_sysfs_schemes, kobj);
- int nr, err = kstrtoint(buf, 0, &nr);
-
- if (err)
- return err;
- if (nr < 0)
- return -EINVAL;
-
- if (!mutex_trylock(&damon_sysfs_lock))
- return -EBUSY;
- err = damon_sysfs_schemes_add_dirs(schemes, nr);
- mutex_unlock(&damon_sysfs_lock);
- if (err)
- return err;
- return count;
-}
-
-static void damon_sysfs_schemes_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_schemes, kobj));
-}
-
-static struct kobj_attribute damon_sysfs_schemes_nr_attr =
- __ATTR_RW_MODE(nr_schemes, 0600);
-
-static struct attribute *damon_sysfs_schemes_attrs[] = {
- &damon_sysfs_schemes_nr_attr.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_schemes);
-
-static struct kobj_type damon_sysfs_schemes_ktype = {
- .release = damon_sysfs_schemes_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_schemes_groups,
-};
+#include "sysfs-common.h"
/*
* init region directory
@@ -1075,23 +17,12 @@ static struct kobj_type damon_sysfs_schemes_ktype = {
struct damon_sysfs_region {
struct kobject kobj;
- unsigned long start;
- unsigned long end;
+ struct damon_addr_range ar;
};
-static struct damon_sysfs_region *damon_sysfs_region_alloc(
- unsigned long start,
- unsigned long end)
+static struct damon_sysfs_region *damon_sysfs_region_alloc(void)
{
- struct damon_sysfs_region *region = kmalloc(sizeof(*region),
- GFP_KERNEL);
-
- if (!region)
- return NULL;
- region->kobj = (struct kobject){};
- region->start = start;
- region->end = end;
- return region;
+ return kzalloc(sizeof(struct damon_sysfs_region), GFP_KERNEL);
}
static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1100,7 +31,7 @@ static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr,
struct damon_sysfs_region *region = container_of(kobj,
struct damon_sysfs_region, kobj);
- return sysfs_emit(buf, "%lu\n", region->start);
+ return sysfs_emit(buf, "%lu\n", region->ar.start);
}
static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1108,11 +39,9 @@ static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_region *region = container_of(kobj,
struct damon_sysfs_region, kobj);
- int err = kstrtoul(buf, 0, &region->start);
+ int err = kstrtoul(buf, 0, &region->ar.start);
- if (err)
- return -EINVAL;
- return count;
+ return err ? err : count;
}
static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1121,7 +50,7 @@ static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr,
struct damon_sysfs_region *region = container_of(kobj,
struct damon_sysfs_region, kobj);
- return sysfs_emit(buf, "%lu\n", region->end);
+ return sysfs_emit(buf, "%lu\n", region->ar.end);
}
static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1129,11 +58,9 @@ static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_region *region = container_of(kobj,
struct damon_sysfs_region, kobj);
- int err = kstrtoul(buf, 0, &region->end);
+ int err = kstrtoul(buf, 0, &region->ar.end);
- if (err)
- return -EINVAL;
- return count;
+ return err ? err : count;
}
static void damon_sysfs_region_release(struct kobject *kobj)
@@ -1204,7 +131,7 @@ static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions,
regions->regions_arr = regions_arr;
for (i = 0; i < nr_regions; i++) {
- region = damon_sysfs_region_alloc(0, 0);
+ region = damon_sysfs_region_alloc();
if (!region) {
damon_sysfs_regions_rm_dirs(regions);
return -ENOMEM;
@@ -1237,8 +164,7 @@ static ssize_t nr_regions_show(struct kobject *kobj,
static ssize_t nr_regions_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
- struct damon_sysfs_regions *regions = container_of(kobj,
- struct damon_sysfs_regions, kobj);
+ struct damon_sysfs_regions *regions;
int nr, err = kstrtoint(buf, 0, &nr);
if (err)
@@ -1246,6 +172,8 @@ static ssize_t nr_regions_store(struct kobject *kobj,
if (nr < 0)
return -EINVAL;
+ regions = container_of(kobj, struct damon_sysfs_regions, kobj);
+
if (!mutex_trylock(&damon_sysfs_lock))
return -EBUSY;
err = damon_sysfs_regions_add_dirs(regions, nr);
@@ -1440,8 +368,7 @@ static ssize_t nr_targets_show(struct kobject *kobj,
static ssize_t nr_targets_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
- struct damon_sysfs_targets *targets = container_of(kobj,
- struct damon_sysfs_targets, kobj);
+ struct damon_sysfs_targets *targets;
int nr, err = kstrtoint(buf, 0, &nr);
if (err)
@@ -1449,6 +376,8 @@ static ssize_t nr_targets_store(struct kobject *kobj,
if (nr < 0)
return -EINVAL;
+ targets = container_of(kobj, struct damon_sysfs_targets, kobj);
+
if (!mutex_trylock(&damon_sysfs_lock))
return -EBUSY;
err = damon_sysfs_targets_add_dirs(targets, nr);
@@ -1525,7 +454,7 @@ static ssize_t sample_us_store(struct kobject *kobj,
int err = kstrtoul(buf, 0, &us);
if (err)
- return -EINVAL;
+ return err;
intervals->sample_us = us;
return count;
@@ -1549,7 +478,7 @@ static ssize_t aggr_us_store(struct kobject *kobj, struct kobj_attribute *attr,
int err = kstrtoul(buf, 0, &us);
if (err)
- return -EINVAL;
+ return err;
intervals->aggr_us = us;
return count;
@@ -1573,7 +502,7 @@ static ssize_t update_us_store(struct kobject *kobj,
int err = kstrtoul(buf, 0, &us);
if (err)
- return -EINVAL;
+ return err;
intervals->update_us = us;
return count;
@@ -1962,8 +891,7 @@ static ssize_t nr_contexts_show(struct kobject *kobj,
static ssize_t nr_contexts_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
- struct damon_sysfs_contexts *contexts = container_of(kobj,
- struct damon_sysfs_contexts, kobj);
+ struct damon_sysfs_contexts *contexts;
int nr, err;
err = kstrtoint(buf, 0, &nr);
@@ -1973,6 +901,7 @@ static ssize_t nr_contexts_store(struct kobject *kobj,
if (nr < 0 || 1 < nr)
return -EINVAL;
+ contexts = container_of(kobj, struct damon_sysfs_contexts, kobj);
if (!mutex_trylock(&damon_sysfs_lock))
return -EBUSY;
err = damon_sysfs_contexts_add_dirs(contexts, nr);
@@ -2071,6 +1000,16 @@ enum damon_sysfs_cmd {
*/
DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS,
/*
+ * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: Update schemes tried
+ * regions
+ */
+ DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS,
+ /*
+ * @DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: Clear schemes tried
+ * regions
+ */
+ DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS,
+ /*
* @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands.
*/
NR_DAMON_SYSFS_CMDS,
@@ -2082,6 +1021,8 @@ static const char * const damon_sysfs_cmd_strs[] = {
"off",
"commit",
"update_schemes_stats",
+ "update_schemes_tried_regions",
+ "clear_schemes_tried_regions",
};
/*
@@ -2127,18 +1068,23 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals;
struct damon_sysfs_ul_range *sys_nr_regions =
sys_attrs->nr_regions_range;
-
- return damon_set_attrs(ctx, sys_intervals->sample_us,
- sys_intervals->aggr_us, sys_intervals->update_us,
- sys_nr_regions->min, sys_nr_regions->max);
+ struct damon_attrs attrs = {
+ .sample_interval = sys_intervals->sample_us,
+ .aggr_interval = sys_intervals->aggr_us,
+ .ops_update_interval = sys_intervals->update_us,
+ .min_nr_regions = sys_nr_regions->min,
+ .max_nr_regions = sys_nr_regions->max,
+ };
+ return damon_set_attrs(ctx, &attrs);
}
static void damon_sysfs_destroy_targets(struct damon_ctx *ctx)
{
struct damon_target *t, *next;
+ bool has_pid = damon_target_has_pid(ctx);
damon_for_each_target_safe(t, next, ctx) {
- if (damon_target_has_pid(ctx))
+ if (has_pid)
put_pid(t->pid);
damon_destroy_target(t);
}
@@ -2157,11 +1103,11 @@ static int damon_sysfs_set_regions(struct damon_target *t,
struct damon_sysfs_region *sys_region =
sysfs_regions->regions_arr[i];
- if (sys_region->start > sys_region->end)
+ if (sys_region->ar.start > sys_region->ar.end)
goto out;
- ranges[i].start = sys_region->start;
- ranges[i].end = sys_region->end;
+ ranges[i].start = sys_region->ar.start;
+ ranges[i].end = sys_region->ar.end;
if (i == 0)
continue;
if (ranges[i - 1].end > ranges[i].start)
@@ -2182,12 +1128,12 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
if (!t)
return -ENOMEM;
+ damon_add_target(ctx, t);
if (damon_target_has_pid(ctx)) {
t->pid = find_get_pid(sys_target->pid);
if (!t->pid)
goto destroy_targets_out;
}
- damon_add_target(ctx, t);
err = damon_sysfs_set_regions(t, sys_target->regions);
if (err)
goto destroy_targets_out;
@@ -2256,60 +1202,21 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx,
return 0;
}
-static struct damos *damon_sysfs_mk_scheme(
- struct damon_sysfs_scheme *sysfs_scheme)
-{
- struct damon_sysfs_access_pattern *pattern =
- sysfs_scheme->access_pattern;
- struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
- struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
- struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
- struct damos_quota quota = {
- .ms = sysfs_quotas->ms,
- .sz = sysfs_quotas->sz,
- .reset_interval = sysfs_quotas->reset_interval_ms,
- .weight_sz = sysfs_weights->sz,
- .weight_nr_accesses = sysfs_weights->nr_accesses,
- .weight_age = sysfs_weights->age,
- };
- struct damos_watermarks wmarks = {
- .metric = sysfs_wmarks->metric,
- .interval = sysfs_wmarks->interval_us,
- .high = sysfs_wmarks->high,
- .mid = sysfs_wmarks->mid,
- .low = sysfs_wmarks->low,
- };
-
- return damon_new_scheme(pattern->sz->min, pattern->sz->max,
- pattern->nr_accesses->min, pattern->nr_accesses->max,
- pattern->age->min, pattern->age->max,
- sysfs_scheme->action, &quota, &wmarks);
-}
-
-static int damon_sysfs_set_schemes(struct damon_ctx *ctx,
- struct damon_sysfs_schemes *sysfs_schemes)
-{
- int i;
-
- for (i = 0; i < sysfs_schemes->nr; i++) {
- struct damos *scheme, *next;
-
- scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]);
- if (!scheme) {
- damon_for_each_scheme_safe(scheme, next, ctx)
- damon_destroy_scheme(scheme);
- return -ENOMEM;
- }
- damon_add_scheme(ctx, scheme);
- }
- return 0;
-}
-
static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
{
struct damon_target *t, *next;
+ struct damon_sysfs_kdamond *kdamond;
- if (ctx->ops.id != DAMON_OPS_VADDR && ctx->ops.id != DAMON_OPS_FVADDR)
+ /* damon_sysfs_schemes_update_regions_stop() might not have been called */
+ kdamond = damon_sysfs_cmd_request.kdamond;
+ if (kdamond && damon_sysfs_cmd_request.cmd ==
+ DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS &&
+ ctx == kdamond->damon_ctx) {
+ damon_sysfs_schemes_update_regions_stop(ctx);
+ mutex_unlock(&damon_sysfs_lock);
+ }
+
+ if (!damon_target_has_pid(ctx))
return;
mutex_lock(&ctx->kdamond_lock);
@@ -2332,26 +1239,46 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond)
{
struct damon_ctx *ctx = kdamond->damon_ctx;
- struct damon_sysfs_schemes *sysfs_schemes;
- struct damos *scheme;
- int schemes_idx = 0;
if (!ctx)
return -EINVAL;
- sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes;
- damon_for_each_scheme(scheme, ctx) {
- struct damon_sysfs_stats *sysfs_stats;
-
- sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats;
- sysfs_stats->nr_tried = scheme->stat.nr_tried;
- sysfs_stats->sz_tried = scheme->stat.sz_tried;
- sysfs_stats->nr_applied = scheme->stat.nr_applied;
- sysfs_stats->sz_applied = scheme->stat.sz_applied;
- sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds;
- }
+ damon_sysfs_schemes_update_stats(
+ kdamond->contexts->contexts_arr[0]->schemes, ctx);
return 0;
}
+static int damon_sysfs_upd_schemes_regions_start(
+ struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_ctx *ctx = kdamond->damon_ctx;
+
+ if (!ctx)
+ return -EINVAL;
+ return damon_sysfs_schemes_update_regions_start(
+ kdamond->contexts->contexts_arr[0]->schemes, ctx);
+}
+
+static int damon_sysfs_upd_schemes_regions_stop(
+ struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_ctx *ctx = kdamond->damon_ctx;
+
+ if (!ctx)
+ return -EINVAL;
+ return damon_sysfs_schemes_update_regions_stop(ctx);
+}
+
+static int damon_sysfs_clear_schemes_regions(
+ struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_ctx *ctx = kdamond->damon_ctx;
+
+ if (!ctx)
+ return -EINVAL;
+ return damon_sysfs_schemes_clear_regions(
+ kdamond->contexts->contexts_arr[0]->schemes, ctx);
+}
+
static inline bool damon_sysfs_kdamond_running(
struct damon_sysfs_kdamond *kdamond)
{
@@ -2404,10 +1331,12 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond)
static int damon_sysfs_cmd_request_callback(struct damon_ctx *c)
{
struct damon_sysfs_kdamond *kdamond;
+ static bool damon_sysfs_schemes_regions_updating;
int err = 0;
/* avoid deadlock due to concurrent state_store('off') */
- if (!mutex_trylock(&damon_sysfs_lock))
+ if (!damon_sysfs_schemes_regions_updating &&
+ !mutex_trylock(&damon_sysfs_lock))
return 0;
kdamond = damon_sysfs_cmd_request.kdamond;
if (!kdamond || kdamond->damon_ctx != c)
@@ -2419,13 +1348,30 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c)
case DAMON_SYSFS_CMD_COMMIT:
err = damon_sysfs_commit_input(kdamond);
break;
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS:
+ if (!damon_sysfs_schemes_regions_updating) {
+ err = damon_sysfs_upd_schemes_regions_start(kdamond);
+ if (!err) {
+ damon_sysfs_schemes_regions_updating = true;
+ goto keep_lock_out;
+ }
+ } else {
+ err = damon_sysfs_upd_schemes_regions_stop(kdamond);
+ damon_sysfs_schemes_regions_updating = false;
+ }
+ break;
+ case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS:
+ err = damon_sysfs_clear_schemes_regions(kdamond);
+ break;
default:
break;
}
/* Mark the request as invalid now. */
damon_sysfs_cmd_request.kdamond = NULL;
out:
- mutex_unlock(&damon_sysfs_lock);
+ if (!damon_sysfs_schemes_regions_updating)
+ mutex_unlock(&damon_sysfs_lock);
+keep_lock_out:
return err;
}
@@ -2455,8 +1401,7 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
struct damon_ctx *ctx;
int err;
- if (kdamond->damon_ctx &&
- damon_sysfs_ctx_running(kdamond->damon_ctx))
+ if (damon_sysfs_kdamond_running(kdamond))
return -EBUSY;
if (damon_sysfs_cmd_request.kdamond == kdamond)
return -EBUSY;
@@ -2579,19 +1524,16 @@ static ssize_t pid_show(struct kobject *kobj,
struct damon_sysfs_kdamond *kdamond = container_of(kobj,
struct damon_sysfs_kdamond, kobj);
struct damon_ctx *ctx;
- int pid;
+ int pid = -1;
if (!mutex_trylock(&damon_sysfs_lock))
return -EBUSY;
ctx = kdamond->damon_ctx;
- if (!ctx) {
- pid = -1;
+ if (!ctx)
goto out;
- }
+
mutex_lock(&ctx->kdamond_lock);
- if (!ctx->kdamond)
- pid = -1;
- else
+ if (ctx->kdamond)
pid = ctx->kdamond->pid;
mutex_unlock(&ctx->kdamond_lock);
out:
@@ -2657,23 +1599,18 @@ static void damon_sysfs_kdamonds_rm_dirs(struct damon_sysfs_kdamonds *kdamonds)
kdamonds->kdamonds_arr = NULL;
}
-static int damon_sysfs_nr_running_ctxs(struct damon_sysfs_kdamond **kdamonds,
+static bool damon_sysfs_kdamonds_busy(struct damon_sysfs_kdamond **kdamonds,
int nr_kdamonds)
{
- int nr_running_ctxs = 0;
int i;
for (i = 0; i < nr_kdamonds; i++) {
- struct damon_ctx *ctx = kdamonds[i]->damon_ctx;
-
- if (!ctx)
- continue;
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond)
- nr_running_ctxs++;
- mutex_unlock(&ctx->kdamond_lock);
+ if (damon_sysfs_kdamond_running(kdamonds[i]) ||
+ damon_sysfs_cmd_request.kdamond == kdamonds[i])
+ return true;
}
- return nr_running_ctxs;
+
+ return false;
}
static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds,
@@ -2682,15 +1619,9 @@ static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds,
struct damon_sysfs_kdamond **kdamonds_arr, *kdamond;
int err, i;
- if (damon_sysfs_nr_running_ctxs(kdamonds->kdamonds_arr, kdamonds->nr))
+ if (damon_sysfs_kdamonds_busy(kdamonds->kdamonds_arr, kdamonds->nr))
return -EBUSY;
- for (i = 0; i < kdamonds->nr; i++) {
- if (damon_sysfs_cmd_request.kdamond ==
- kdamonds->kdamonds_arr[i])
- return -EBUSY;
- }
-
damon_sysfs_kdamonds_rm_dirs(kdamonds);
if (!nr_kdamonds)
return 0;
@@ -2741,8 +1672,7 @@ static ssize_t nr_kdamonds_show(struct kobject *kobj,
static ssize_t nr_kdamonds_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
- struct damon_sysfs_kdamonds *kdamonds = container_of(kobj,
- struct damon_sysfs_kdamonds, kobj);
+ struct damon_sysfs_kdamonds *kdamonds;
int nr, err;
err = kstrtoint(buf, 0, &nr);
@@ -2751,6 +1681,8 @@ static ssize_t nr_kdamonds_store(struct kobject *kobj,
if (nr < 0)
return -EINVAL;
+ kdamonds = container_of(kobj, struct damon_sysfs_kdamonds, kobj);
+
if (!mutex_trylock(&damon_sysfs_lock))
return -EBUSY;
err = damon_sysfs_kdamonds_add_dirs(kdamonds, nr);
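[Editor's note, not part of the patch: the sysfs.c hunks above replace the positional damon_set_attrs() arguments with a single struct damon_attrs. A minimal, hedged sketch of how an in-kernel caller would use the struct-based interface; field names follow damon_sysfs_set_attrs() above, the values are arbitrary, and the declarations come from <linux/damon.h>.]

static int example_set_monitoring_attrs(struct damon_ctx *ctx)
{
	/* Illustrative values only: intervals are in microseconds. */
	struct damon_attrs attrs = {
		.sample_interval = 5000,		/* 5 ms */
		.aggr_interval = 100000,		/* 100 ms */
		.ops_update_interval = 1000000,		/* 1 s */
		.min_nr_regions = 10,
		.max_nr_regions = 1000,
	};

	return damon_set_attrs(ctx, &attrs);
}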
diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
index d4f55f349100..bce37c487540 100644
--- a/mm/damon/vaddr-test.h
+++ b/mm/damon/vaddr-test.h
@@ -14,33 +14,19 @@
#include <kunit/test.h>
-static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas)
+static void __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas,
+ ssize_t nr_vmas)
{
- int i, j;
- unsigned long largest_gap, gap;
+ int i;
+ MA_STATE(mas, mt, 0, 0);
if (!nr_vmas)
return;
- for (i = 0; i < nr_vmas - 1; i++) {
- vmas[i].vm_next = &vmas[i + 1];
-
- vmas[i].vm_rb.rb_left = NULL;
- vmas[i].vm_rb.rb_right = &vmas[i + 1].vm_rb;
-
- largest_gap = 0;
- for (j = i; j < nr_vmas; j++) {
- if (j == 0)
- continue;
- gap = vmas[j].vm_start - vmas[j - 1].vm_end;
- if (gap > largest_gap)
- largest_gap = gap;
- }
- vmas[i].rb_subtree_gap = largest_gap;
- }
- vmas[i].vm_next = NULL;
- vmas[i].vm_rb.rb_right = NULL;
- vmas[i].rb_subtree_gap = 0;
+ mas_lock(&mas);
+ for (i = 0; i < nr_vmas; i++)
+ vma_mas_store(&vmas[i], &mas);
+ mas_unlock(&mas);
}
/*
@@ -72,6 +58,7 @@ static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas)
*/
static void damon_test_three_regions_in_vmas(struct kunit *test)
{
+ static struct mm_struct mm;
struct damon_addr_range regions[3] = {0,};
/* 10-20-25, 200-210-220, 300-305, 307-330 */
struct vm_area_struct vmas[] = {
@@ -83,9 +70,10 @@ static void damon_test_three_regions_in_vmas(struct kunit *test)
(struct vm_area_struct) {.vm_start = 307, .vm_end = 330},
};
- __link_vmas(vmas, 6);
+ mt_init_flags(&mm.mm_mt, MM_MT_FLAGS);
+ __link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas));
- __damon_va_three_regions(&vmas[0], regions);
+ __damon_va_three_regions(&mm, regions);
KUNIT_EXPECT_EQ(test, 10ul, regions[0].start);
KUNIT_EXPECT_EQ(test, 25ul, regions[0].end);
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 3c7b9d6dca95..15f03df66db6 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -72,7 +72,7 @@ static int damon_va_evenly_split_region(struct damon_target *t,
return -EINVAL;
orig_end = r->ar.end;
- sz_orig = r->ar.end - r->ar.start;
+ sz_orig = damon_sz_region(r);
sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);
if (!sz_piece)
@@ -113,37 +113,38 @@ static unsigned long sz_range(struct damon_addr_range *r)
*
* Returns 0 if success, or negative error code otherwise.
*/
-static int __damon_va_three_regions(struct vm_area_struct *vma,
+static int __damon_va_three_regions(struct mm_struct *mm,
struct damon_addr_range regions[3])
{
- struct damon_addr_range gap = {0}, first_gap = {0}, second_gap = {0};
- struct vm_area_struct *last_vma = NULL;
- unsigned long start = 0;
- struct rb_root rbroot;
-
- /* Find two biggest gaps so that first_gap > second_gap > others */
- for (; vma; vma = vma->vm_next) {
- if (!last_vma) {
- start = vma->vm_start;
- goto next;
- }
+ struct damon_addr_range first_gap = {0}, second_gap = {0};
+ VMA_ITERATOR(vmi, mm, 0);
+ struct vm_area_struct *vma, *prev = NULL;
+ unsigned long start;
- if (vma->rb_subtree_gap <= sz_range(&second_gap)) {
- rbroot.rb_node = &vma->vm_rb;
- vma = rb_entry(rb_last(&rbroot),
- struct vm_area_struct, vm_rb);
+ /*
+ * Find the two biggest gaps so that first_gap > second_gap > others.
+ * If this is too slow, it can be optimised to examine the maple
+ * tree gaps.
+ */
+ for_each_vma(vmi, vma) {
+ unsigned long gap;
+
+ if (!prev) {
+ start = vma->vm_start;
goto next;
}
-
- gap.start = last_vma->vm_end;
- gap.end = vma->vm_start;
- if (sz_range(&gap) > sz_range(&second_gap)) {
- swap(gap, second_gap);
- if (sz_range(&second_gap) > sz_range(&first_gap))
- swap(second_gap, first_gap);
+ gap = vma->vm_start - prev->vm_end;
+
+ if (gap > sz_range(&first_gap)) {
+ second_gap = first_gap;
+ first_gap.start = prev->vm_end;
+ first_gap.end = vma->vm_start;
+ } else if (gap > sz_range(&second_gap)) {
+ second_gap.start = prev->vm_end;
+ second_gap.end = vma->vm_start;
}
next:
- last_vma = vma;
+ prev = vma;
}
if (!sz_range(&second_gap) || !sz_range(&first_gap))
@@ -159,7 +160,7 @@ next:
regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION);
regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION);
regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION);
- regions[2].end = ALIGN(last_vma->vm_end, DAMON_MIN_REGION);
+ regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION);
return 0;
}
@@ -180,7 +181,7 @@ static int damon_va_three_regions(struct damon_target *t,
return -EINVAL;
mmap_read_lock(mm);
- rc = __damon_va_three_regions(mm->mmap, regions);
+ rc = __damon_va_three_regions(mm, regions);
mmap_read_unlock(mm);
mmput(mm);
@@ -250,8 +251,8 @@ static void __damon_va_init_regions(struct damon_ctx *ctx,
for (i = 0; i < 3; i++)
sz += regions[i].end - regions[i].start;
- if (ctx->min_nr_regions)
- sz /= ctx->min_nr_regions;
+ if (ctx->attrs.min_nr_regions)
+ sz /= ctx->attrs.min_nr_regions;
if (sz < DAMON_MIN_REGION)
sz = DAMON_MIN_REGION;
@@ -302,9 +303,14 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_huge(*pmd)) {
+ if (pmd_trans_huge(*pmd)) {
ptl = pmd_lock(walk->mm, pmd);
- if (pmd_huge(*pmd)) {
+ if (!pmd_present(*pmd)) {
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_huge(*pmd)) {
damon_pmdp_mkold(pmd, walk->mm, addr);
spin_unlock(ptl);
return 0;
@@ -391,8 +397,8 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
* Functions for the access checking of the regions
*/
-static void __damon_va_prepare_access_check(struct damon_ctx *ctx,
- struct mm_struct *mm, struct damon_region *r)
+static void __damon_va_prepare_access_check(struct mm_struct *mm,
+ struct damon_region *r)
{
r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
@@ -410,7 +416,7 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx)
if (!mm)
continue;
damon_for_each_region(r, t)
- __damon_va_prepare_access_check(ctx, mm, r);
+ __damon_va_prepare_access_check(mm, r);
mmput(mm);
}
}
@@ -429,9 +435,14 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
struct damon_young_walk_private *priv = walk->private;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pmd_huge(*pmd)) {
+ if (pmd_trans_huge(*pmd)) {
ptl = pmd_lock(walk->mm, pmd);
- if (!pmd_huge(*pmd)) {
+ if (!pmd_present(*pmd)) {
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (!pmd_trans_huge(*pmd)) {
spin_unlock(ptl);
goto regular_page;
}
@@ -532,16 +543,15 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
* mm 'mm_struct' for the given virtual address space
* r the region to be checked
*/
-static void __damon_va_check_access(struct damon_ctx *ctx,
- struct mm_struct *mm, struct damon_region *r)
+static void __damon_va_check_access(struct mm_struct *mm,
+ struct damon_region *r, bool same_target)
{
- static struct mm_struct *last_mm;
static unsigned long last_addr;
static unsigned long last_page_sz = PAGE_SIZE;
static bool last_accessed;
/* If the region is in the last checked page, reuse the result */
- if (mm == last_mm && (ALIGN_DOWN(last_addr, last_page_sz) ==
+ if (same_target && (ALIGN_DOWN(last_addr, last_page_sz) ==
ALIGN_DOWN(r->sampling_addr, last_page_sz))) {
if (last_accessed)
r->nr_accesses++;
@@ -552,7 +562,6 @@ static void __damon_va_check_access(struct damon_ctx *ctx,
if (last_accessed)
r->nr_accesses++;
- last_mm = mm;
last_addr = r->sampling_addr;
}
@@ -562,14 +571,17 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
struct mm_struct *mm;
struct damon_region *r;
unsigned int max_nr_accesses = 0;
+ bool same_target;
damon_for_each_target(t, ctx) {
mm = damon_get_mm(t);
if (!mm)
continue;
+ same_target = false;
damon_for_each_region(r, t) {
- __damon_va_check_access(ctx, mm, r);
+ __damon_va_check_access(mm, r, same_target);
max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
+ same_target = true;
}
mmput(mm);
}
@@ -581,9 +593,8 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
* Functions for the target validity check and cleanup
*/
-static bool damon_va_target_valid(void *target)
+static bool damon_va_target_valid(struct damon_target *t)
{
- struct damon_target *t = target;
struct task_struct *task;
task = damon_get_task_struct(t);
@@ -607,7 +618,7 @@ static unsigned long damos_madvise(struct damon_target *target,
{
struct mm_struct *mm;
unsigned long start = PAGE_ALIGN(r->ar.start);
- unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start);
+ unsigned long len = PAGE_ALIGN(damon_sz_region(r));
unsigned long applied;
mm = damon_get_mm(target);
@@ -646,6 +657,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
case DAMOS_STAT:
return 0;
default:
+ /*
+ * DAMOS actions that are not yet supported by 'vaddr'.
+ */
return 0;
}
@@ -659,7 +673,7 @@ static int damon_va_scheme_score(struct damon_ctx *context,
switch (scheme->action) {
case DAMOS_PAGEOUT:
- return damon_pageout_score(context, r, scheme);
+ return damon_cold_score(context, r, scheme);
default:
break;
}
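[Editor's note, not part of the patch: the vaddr.c rewrite above walks VMAs with the maple tree iterator instead of the removed vm_next list. A minimal sketch of that iteration pattern in isolation, assuming the caller already holds mmap_read_lock() as damon_va_three_regions() does; VMA_ITERATOR() and for_each_vma() are declared in <linux/mm.h>.]

static unsigned long example_largest_vma_gap(struct mm_struct *mm)
{
	VMA_ITERATOR(vmi, mm, 0);
	struct vm_area_struct *vma, *prev = NULL;
	unsigned long gap, largest = 0;

	/* Walk all mappings in address order and track the largest gap. */
	for_each_vma(vmi, vma) {
		if (prev) {
			gap = vma->vm_start - prev->vm_end;
			if (gap > largest)
				largest = gap;
		}
		prev = vma;
	}
	return largest;
}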
diff --git a/mm/debug.c b/mm/debug.c
index bef329bf28f0..7f8e5f744e42 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -94,9 +94,10 @@ static void __dump_page(struct page *page)
page, page_ref_count(head), mapcount, mapping,
page_to_pgoff(page), page_to_pfn(page));
if (compound) {
- pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n",
+ pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d compound_pincount:%d\n",
head, compound_order(head),
- folio_entire_mapcount(folio),
+ head_compound_mapcount(head),
+ head_subpages_mapcount(head),
head_compound_pincount(head));
}
@@ -139,13 +140,11 @@ EXPORT_SYMBOL(dump_page);
void dump_vma(const struct vm_area_struct *vma)
{
- pr_emerg("vma %px start %px end %px\n"
- "next %px prev %px mm %px\n"
+ pr_emerg("vma %px start %px end %px mm %px\n"
"prot %lx anon_vma %px vm_ops %px\n"
"pgoff %lx file %px private_data %px\n"
"flags: %#lx(%pGv)\n",
- vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
- vma->vm_prev, vma->vm_mm,
+ vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm,
(unsigned long)pgprot_val(vma->vm_page_prot),
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
vma->vm_file, vma->vm_private_data,
@@ -155,11 +154,11 @@ EXPORT_SYMBOL(dump_vma);
void dump_mm(const struct mm_struct *mm)
{
- pr_emerg("mm %px mmap %px seqnum %llu task_size %lu\n"
+ pr_emerg("mm %px task_size %lu\n"
#ifdef CONFIG_MMU
"get_unmapped_area %px\n"
#endif
- "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
+ "mmap_base %lu mmap_legacy_base %lu\n"
"pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
"pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n"
@@ -183,11 +182,11 @@ void dump_mm(const struct mm_struct *mm)
"tlb_flush_pending %d\n"
"def_flags: %#lx(%pGv)\n",
- mm, mm->mmap, (long long) mm->vmacache_seqnum, mm->task_size,
+ mm, mm->task_size,
#ifdef CONFIG_MMU
mm->get_unmapped_area,
#endif
- mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
+ mm->mmap_base, mm->mmap_legacy_base,
mm->pgd, atomic_read(&mm->mm_users),
atomic_read(&mm->mm_count),
mm_pgtables_bytes(mm),
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index dc7df1254f0a..c631ade3f1d2 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -38,11 +38,7 @@
* Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics
* expectations that are being validated here. All future changes in here
* or the documentation need to be in sync.
- */
-
-#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC)
-
-/*
+ *
* On s390 platform, the lower 4 bits are used to identify given page table
* entry type. But these bits might affect the ability to clear entries with
* pxx_clear() because of how dynamic page table folding works on s390. So
@@ -175,18 +171,6 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args)
ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1);
}
-static void __init pte_savedwrite_tests(struct pgtable_debug_args *args)
-{
- pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none);
-
- if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
- return;
-
- pr_debug("Validating PTE saved write\n");
- WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
- WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
-}
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx)
{
@@ -306,22 +290,6 @@ static void __init pmd_leaf_tests(struct pgtable_debug_args *args)
WARN_ON(!pmd_leaf(pmd));
}
-static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args)
-{
- pmd_t pmd;
-
- if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
- return;
-
- if (!has_transparent_hugepage())
- return;
-
- pr_debug("Validating PMD saved write\n");
- pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none);
- WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
- WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
-}
-
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx)
{
@@ -455,7 +423,6 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { }
static void __init pud_advanced_tests(struct pgtable_debug_args *args) { }
static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { }
static void __init pud_leaf_tests(struct pgtable_debug_args *args) { }
-static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
@@ -1125,7 +1092,7 @@ static int __init init_args(struct pgtable_debug_args *args)
*/
memset(args, 0, sizeof(*args));
args->vaddr = get_random_vaddr();
- args->page_prot = vm_get_page_prot(VMFLAGS);
+ args->page_prot = vm_get_page_prot(VM_ACCESS_FLAGS);
args->page_prot_none = vm_get_page_prot(VM_NONE);
args->is_contiguous_page = false;
args->pud_pfn = ULONG_MAX;
@@ -1292,9 +1259,6 @@ static int __init debug_vm_pgtable(void)
pmd_leaf_tests(&args);
pud_leaf_tests(&args);
- pte_savedwrite_tests(&args);
- pmd_savedwrite_tests(&args);
-
pte_special_tests(&args);
pte_protnone_tests(&args);
pmd_protnone_tests(&args);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index c76ee665355a..bf04fec87f35 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -72,7 +72,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
*/
endbyte = (u64)offset + (u64)len;
if (!len || endbyte < len)
- endbyte = -1;
+ endbyte = LLONG_MAX;
else
endbyte--; /* inclusive */
diff --git a/mm/failslab.c b/mm/failslab.c
index 58df9789f1d2..ffc420c0e767 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -16,6 +16,8 @@ static struct {
bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags)
{
+ int flags = 0;
+
/* No fault-injection for bootstrap cache */
if (unlikely(s == kmem_cache))
return false;
@@ -30,10 +32,16 @@ bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags)
if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))
return false;
+ /*
+ * In some cases, callers specify __GFP_NOWARN to suppress any
+ * output (not just warnings), thus avoiding deadlocks. See
+ * commit 6b9dbedbe349 for details.
+ */
if (gfpflags & __GFP_NOWARN)
- failslab.attr.no_warn = true;
+ flags |= FAULT_NOWARN;
- return should_fail(&failslab.attr, s->object_size);
+ return should_fail_ex(&failslab.attr, s->object_size, flags);
}
static int __init setup_failslab(char *str)
diff --git a/mm/filemap.c b/mm/filemap.c
index 15800334147b..c4d4ace9cc70 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -506,9 +506,6 @@ static void __filemap_fdatawait_range(struct address_space *mapping,
struct pagevec pvec;
int nr_pages;
- if (end_byte < start_byte)
- return;
-
pagevec_init(&pvec);
while (index <= end) {
unsigned i;
@@ -632,22 +629,23 @@ bool filemap_range_has_writeback(struct address_space *mapping,
{
XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
pgoff_t max = end_byte >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
if (end_byte < start_byte)
return false;
rcu_read_lock();
- xas_for_each(&xas, page, max) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, max) {
+ if (xas_retry(&xas, folio))
continue;
- if (xa_is_value(page))
+ if (xa_is_value(folio))
continue;
- if (PageDirty(page) || PageLocked(page) || PageWriteback(page))
+ if (folio_test_dirty(folio) || folio_test_locked(folio) ||
+ folio_test_writeback(folio))
break;
}
rcu_read_unlock();
- return page != NULL;
+ return folio != NULL;
}
EXPORT_SYMBOL_GPL(filemap_range_has_writeback);
@@ -669,6 +667,9 @@ int filemap_write_and_wait_range(struct address_space *mapping,
{
int err = 0, err2;
+ if (lend < lstart)
+ return 0;
+
if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
@@ -769,6 +770,9 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
int err = 0, err2;
struct address_space *mapping = file->f_mapping;
+ if (lend < lstart)
+ return 0;
+
if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
@@ -784,56 +788,54 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
EXPORT_SYMBOL(file_write_and_wait_range);
/**
- * replace_page_cache_page - replace a pagecache page with a new one
- * @old: page to be replaced
- * @new: page to replace with
- *
- * This function replaces a page in the pagecache with a new one. On
- * success it acquires the pagecache reference for the new page and
- * drops it for the old page. Both the old and new pages must be
- * locked. This function does not add the new page to the LRU, the
+ * replace_page_cache_folio - replace a pagecache folio with a new one
+ * @old: folio to be replaced
+ * @new: folio to replace with
+ *
+ * This function replaces a folio in the pagecache with a new one. On
+ * success it acquires the pagecache reference for the new folio and
+ * drops it for the old folio. Both the old and new folios must be
+ * locked. This function does not add the new folio to the LRU, the
* caller must do that.
*
* The remove + add is atomic. This function cannot fail.
*/
-void replace_page_cache_page(struct page *old, struct page *new)
+void replace_page_cache_folio(struct folio *old, struct folio *new)
{
- struct folio *fold = page_folio(old);
- struct folio *fnew = page_folio(new);
struct address_space *mapping = old->mapping;
void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
pgoff_t offset = old->index;
XA_STATE(xas, &mapping->i_pages, offset);
- VM_BUG_ON_PAGE(!PageLocked(old), old);
- VM_BUG_ON_PAGE(!PageLocked(new), new);
- VM_BUG_ON_PAGE(new->mapping, new);
+ VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
+ VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
+ VM_BUG_ON_FOLIO(new->mapping, new);
- get_page(new);
+ folio_get(new);
new->mapping = mapping;
new->index = offset;
- mem_cgroup_migrate(fold, fnew);
+ mem_cgroup_migrate(old, new);
xas_lock_irq(&xas);
xas_store(&xas, new);
old->mapping = NULL;
/* hugetlb pages do not participate in page cache accounting. */
- if (!PageHuge(old))
- __dec_lruvec_page_state(old, NR_FILE_PAGES);
- if (!PageHuge(new))
- __inc_lruvec_page_state(new, NR_FILE_PAGES);
- if (PageSwapBacked(old))
- __dec_lruvec_page_state(old, NR_SHMEM);
- if (PageSwapBacked(new))
- __inc_lruvec_page_state(new, NR_SHMEM);
+ if (!folio_test_hugetlb(old))
+ __lruvec_stat_sub_folio(old, NR_FILE_PAGES);
+ if (!folio_test_hugetlb(new))
+ __lruvec_stat_add_folio(new, NR_FILE_PAGES);
+ if (folio_test_swapbacked(old))
+ __lruvec_stat_sub_folio(old, NR_SHMEM);
+ if (folio_test_swapbacked(new))
+ __lruvec_stat_add_folio(new, NR_SHMEM);
xas_unlock_irq(&xas);
if (free_folio)
- free_folio(fold);
- folio_put(fold);
+ free_folio(old);
+ folio_put(old);
}
-EXPORT_SYMBOL_GPL(replace_page_cache_page);
+EXPORT_SYMBOL_GPL(replace_page_cache_folio);
noinline int __filemap_add_folio(struct address_space *mapping,
struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
@@ -1221,15 +1223,12 @@ static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
bool thrashing = false;
- bool delayacct = false;
unsigned long pflags;
+ bool in_thrashing;
if (bit_nr == PG_locked &&
!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
- if (!folio_test_swapbacked(folio)) {
- delayacct_thrashing_start();
- delayacct = true;
- }
+ delayacct_thrashing_start(&in_thrashing);
psi_memstall_enter(&pflags);
thrashing = true;
}
@@ -1329,8 +1328,7 @@ repeat:
finish_wait(q, wait);
if (thrashing) {
- if (delayacct)
- delayacct_thrashing_end();
+ delayacct_thrashing_end(&in_thrashing);
psi_memstall_leave(&pflags);
}
@@ -1378,17 +1376,14 @@ void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
bool thrashing = false;
- bool delayacct = false;
unsigned long pflags;
+ bool in_thrashing;
wait_queue_head_t *q;
struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
q = folio_waitqueue(folio);
if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
- if (!folio_test_swapbacked(folio)) {
- delayacct_thrashing_start();
- delayacct = true;
- }
+ delayacct_thrashing_start(&in_thrashing);
psi_memstall_enter(&pflags);
thrashing = true;
}
@@ -1435,8 +1430,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
finish_wait(q, wait);
if (thrashing) {
- if (delayacct)
- delayacct_thrashing_end();
+ delayacct_thrashing_end(&in_thrashing);
psi_memstall_leave(&pflags);
}
}
@@ -1467,7 +1461,7 @@ EXPORT_SYMBOL(folio_wait_bit_killable);
*
* Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
*/
-int folio_put_wait_locked(struct folio *folio, int state)
+static int folio_put_wait_locked(struct folio *folio, int state)
{
return folio_wait_bit_common(folio, PG_locked, state, DROP);
}
@@ -1633,24 +1627,26 @@ EXPORT_SYMBOL(folio_end_writeback);
*/
void page_endio(struct page *page, bool is_write, int err)
{
+ struct folio *folio = page_folio(page);
+
if (!is_write) {
if (!err) {
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
} else {
- ClearPageUptodate(page);
- SetPageError(page);
+ folio_clear_uptodate(folio);
+ folio_set_error(folio);
}
- unlock_page(page);
+ folio_unlock(folio);
} else {
if (err) {
struct address_space *mapping;
- SetPageError(page);
- mapping = page_mapping(page);
+ folio_set_error(folio);
+ mapping = folio_mapping(folio);
if (mapping)
mapping_set_error(mapping, err);
}
- end_page_writeback(page);
+ folio_end_writeback(folio);
}
}
EXPORT_SYMBOL_GPL(page_endio);
@@ -2053,10 +2049,10 @@ reset:
*
* Return: The number of entries which were found.
*/
-unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
+unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
- XA_STATE(xas, &mapping->i_pages, start);
+ XA_STATE(xas, &mapping->i_pages, *start);
struct folio *folio;
rcu_read_lock();
@@ -2067,6 +2063,15 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
}
rcu_read_unlock();
+ if (folio_batch_count(fbatch)) {
+ unsigned long nr = 1;
+ int idx = folio_batch_count(fbatch) - 1;
+
+ folio = fbatch->folios[idx];
+ if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
+ nr = folio_nr_pages(folio);
+ *start = indices[idx] + nr;
+ }
return folio_batch_count(fbatch);
}
@@ -2090,16 +2095,16 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
*
* Return: The number of entries which were found.
*/
-unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
+unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
- XA_STATE(xas, &mapping->i_pages, start);
+ XA_STATE(xas, &mapping->i_pages, *start);
struct folio *folio;
rcu_read_lock();
while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
if (!xa_is_value(folio)) {
- if (folio->index < start)
+ if (folio->index < *start)
goto put;
if (folio->index + folio_nr_pages(folio) - 1 > end)
goto put;
@@ -2122,6 +2127,15 @@ put:
}
rcu_read_unlock();
+ if (folio_batch_count(fbatch)) {
+ unsigned long nr = 1;
+ int idx = folio_batch_count(fbatch) - 1;
+
+ folio = fbatch->folios[idx];
+ if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
+ nr = folio_nr_pages(folio);
+ *start = indices[idx] + nr;
+ }
return folio_batch_count(fbatch);
}
@@ -2195,30 +2209,31 @@ bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max)
}
/**
- * find_get_pages_contig - gang contiguous pagecache lookup
+ * filemap_get_folios_contig - Get a batch of contiguous folios
* @mapping: The address_space to search
- * @index: The starting page index
- * @nr_pages: The maximum number of pages
- * @pages: Where the resulting pages are placed
+ * @start: The starting page index
+ * @end: The final page index (inclusive)
+ * @fbatch: The batch to fill
*
- * find_get_pages_contig() works exactly like find_get_pages_range(),
- * except that the returned number of pages are guaranteed to be
- * contiguous.
+ * filemap_get_folios_contig() works exactly like filemap_get_folios(),
+ * except the returned folios are guaranteed to be contiguous. This may
+ * not return all contiguous folios if the batch gets filled up.
*
- * Return: the number of pages which were found.
+ * Return: The number of folios found.
+ * Also update @start to be positioned for traversal of the next folio.
*/
-unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
- unsigned int nr_pages, struct page **pages)
+
+unsigned filemap_get_folios_contig(struct address_space *mapping,
+ pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)
{
- XA_STATE(xas, &mapping->i_pages, index);
+ XA_STATE(xas, &mapping->i_pages, *start);
+ unsigned long nr;
struct folio *folio;
- unsigned int ret = 0;
-
- if (unlikely(!nr_pages))
- return 0;
rcu_read_lock();
- for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
+
+ for (folio = xas_load(&xas); folio && xas.xa_index <= end;
+ folio = xas_next(&xas)) {
if (xas_retry(&xas, folio))
continue;
/*
@@ -2226,33 +2241,45 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
* No current caller is looking for DAX entries.
*/
if (xa_is_value(folio))
- break;
+ goto update_start;
if (!folio_try_get_rcu(folio))
goto retry;
if (unlikely(folio != xas_reload(&xas)))
- goto put_page;
+ goto put_folio;
-again:
- pages[ret] = folio_file_page(folio, xas.xa_index);
- if (++ret == nr_pages)
- break;
- if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) {
- xas.xa_index++;
- folio_ref_inc(folio);
- goto again;
+ if (!folio_batch_add(fbatch, folio)) {
+ nr = folio_nr_pages(folio);
+
+ if (folio_test_hugetlb(folio))
+ nr = 1;
+ *start = folio->index + nr;
+ goto out;
}
continue;
-put_page:
+put_folio:
folio_put(folio);
+
retry:
xas_reset(&xas);
}
+
+update_start:
+ nr = folio_batch_count(fbatch);
+
+ if (nr) {
+ folio = fbatch->folios[nr - 1];
+ if (folio_test_hugetlb(folio))
+ *start = folio->index + 1;
+ else
+ *start = folio->index + folio_nr_pages(folio);
+ }
+out:
rcu_read_unlock();
- return ret;
+ return folio_batch_count(fbatch);
}
-EXPORT_SYMBOL(find_get_pages_contig);
+EXPORT_SYMBOL(filemap_get_folios_contig);
/**
* find_get_pages_range_tag - Find and return head pages matching @tag.
@@ -2382,6 +2409,8 @@ retry:
static int filemap_read_folio(struct file *file, filler_t filler,
struct folio *folio)
{
+ bool workingset = folio_test_workingset(folio);
+ unsigned long pflags;
int error;
/*
@@ -2390,8 +2419,13 @@ static int filemap_read_folio(struct file *file, filler_t filler,
* fails.
*/
folio_clear_error(folio);
+
/* Start the actual read. The read will unlock the page. */
+ if (unlikely(workingset))
+ psi_memstall_enter(&pflags);
error = filler(file, folio);
+ if (unlikely(workingset))
+ psi_memstall_leave(&pflags);
if (error)
return error;
@@ -3712,7 +3746,7 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
- void *fsdata;
+ void *fsdata = NULL;
offset = (pos & (PAGE_SIZE - 1));
bytes = min_t(unsigned long, PAGE_SIZE - offset,
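[Editor's note, not part of the patch: the filemap.c changes above replace find_get_pages_contig() with filemap_get_folios_contig(), which fills a folio_batch and advances *start itself. A hedged usage sketch, illustrative only; folio_batch helpers live in <linux/pagevec.h> and the lookup in <linux/pagemap.h>.]

static void example_scan_contig(struct address_space *mapping,
				pgoff_t start, pgoff_t end)
{
	struct folio_batch fbatch;
	unsigned int i, nr;

	folio_batch_init(&fbatch);
	while ((nr = filemap_get_folios_contig(mapping, &start, end,
					       &fbatch)) != 0) {
		for (i = 0; i < nr; i++) {
			struct folio *folio = fbatch.folios[i];

			/* operate on each contiguous, referenced folio */
			folio_mark_accessed(folio);
		}
		/* drop the references taken by the lookup */
		folio_batch_release(&fbatch);
	}
}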
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 458618c7302c..69ed25790c68 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -39,12 +39,6 @@ void wait_for_stable_page(struct page *page)
}
EXPORT_SYMBOL_GPL(wait_for_stable_page);
-bool page_mapped(struct page *page)
-{
- return folio_mapped(page_folio(page));
-}
-EXPORT_SYMBOL(page_mapped);
-
void mark_page_accessed(struct page *page)
{
folio_mark_accessed(page_folio(page));
@@ -82,11 +76,11 @@ bool redirty_page_for_writepage(struct writeback_control *wbc,
}
EXPORT_SYMBOL(redirty_page_for_writepage);
-void lru_cache_add(struct page *page)
+void lru_cache_add_inactive_or_unevictable(struct page *page,
+ struct vm_area_struct *vma)
{
- folio_add_lru(page_folio(page));
+ folio_add_lru_vma(page_folio(page), vma);
}
-EXPORT_SYMBOL(lru_cache_add);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t index, gfp_t gfp)
@@ -102,7 +96,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
struct folio *folio;
folio = __filemap_get_folio(mapping, index, fgp_flags, gfp);
- if ((fgp_flags & FGP_HEAD) || !folio || xa_is_value(folio))
+ if (!folio || xa_is_value(folio))
return &folio->page;
return folio_file_page(folio, index);
}
@@ -118,17 +112,6 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
-void delete_from_page_cache(struct page *page)
-{
- return filemap_remove_folio(page_folio(page));
-}
-
-int try_to_release_page(struct page *page, gfp_t gfp)
-{
- return filemap_release_folio(page_folio(page), gfp);
-}
-EXPORT_SYMBOL(try_to_release_page);
-
int isolate_lru_page(struct page *page)
{
if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"))
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 1a97610308cb..279e55b4ed87 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -125,6 +125,9 @@ void frontswap_init(unsigned type, unsigned long *map)
* p->frontswap set to something valid to work properly.
*/
frontswap_map_set(sis, map);
+
+ if (!frontswap_enabled())
+ return;
frontswap_ops->init(type);
}
diff --git a/mm/gup.c b/mm/gup.c
index 5abdaf487460..f45a3a5be53a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -123,6 +123,9 @@ retry:
*/
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
{
+ if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
+ return NULL;
+
if (flags & FOLL_GET)
return try_get_folio(page, refs);
else if (flags & FOLL_PIN) {
@@ -158,6 +161,13 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
else
folio_ref_add(folio,
refs * (GUP_PIN_COUNTING_BIAS - 1));
+ /*
+ * Adjust the pincount before re-checking the PTE for changes.
+ * This is essentially a smp_mb() and is paired with a memory
+ * barrier in page_try_share_anon_rmap().
+ */
+ smp_mb__after_atomic();
+
node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
return folio;
@@ -195,17 +205,22 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
* time. Cases: please see the try_grab_folio() documentation, with
* "refs=1".
*
- * Return: true for success, or if no action was required (if neither FOLL_PIN
- * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
- * FOLL_PIN was set, but the page could not be grabbed.
+ * Return: 0 for success, or if no action was required (if neither FOLL_PIN
+ * nor FOLL_GET was set, nothing is done). A negative error code for failure:
+ *
+ * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not
+ * be grabbed.
*/
-bool __must_check try_grab_page(struct page *page, unsigned int flags)
+int __must_check try_grab_page(struct page *page, unsigned int flags)
{
struct folio *folio = page_folio(page);
WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
- return false;
+ return -ENOMEM;
+
+ if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
+ return -EREMOTEIO;
if (flags & FOLL_GET)
folio_ref_inc(folio);
@@ -225,7 +240,7 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags)
node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
}
- return true;
+ return 0;
}
/**
@@ -530,31 +545,14 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
return ERR_PTR(-EINVAL);
-retry:
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
pte = *ptep;
- if (!pte_present(pte)) {
- swp_entry_t entry;
- /*
- * KSM's break_ksm() relies upon recognizing a ksm page
- * even while it is being migrated, so for that case we
- * need migration_entry_wait().
- */
- if (likely(!(flags & FOLL_MIGRATION)))
- goto no_page;
- if (pte_none(pte))
- goto no_page;
- entry = pte_to_swp_entry(pte);
- if (!is_migration_entry(entry))
- goto no_page;
- pte_unmap_unlock(ptep, ptl);
- migration_entry_wait(mm, pmd, address);
- goto retry;
- }
- if ((flags & FOLL_NUMA) && pte_protnone(pte))
+ if (!pte_present(pte))
+ goto no_page;
+ if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
goto no_page;
page = vm_normal_page(vma, address, pte);
@@ -596,7 +594,7 @@ retry:
}
}
- if (!pte_write(pte) && gup_must_unshare(flags, page)) {
+ if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) {
page = ERR_PTR(-EMLINK);
goto out;
}
@@ -605,10 +603,12 @@ retry:
!PageAnonExclusive(page), page);
/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
- if (unlikely(!try_grab_page(page, flags))) {
- page = ERR_PTR(-ENOMEM);
+ ret = try_grab_page(page, flags);
+ if (unlikely(ret)) {
+ page = ERR_PTR(ret);
goto out;
}
+
/*
* We need to make the page accessible if and only if we are going
* to access its content (the FOLL_PIN case). Please see
@@ -661,42 +661,8 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
pmdval = READ_ONCE(*pmd);
if (pmd_none(pmdval))
return no_page_table(vma, flags);
- if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
- page = follow_huge_pmd(mm, address, pmd, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
- if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(pmd_val(pmdval)), flags,
- PMD_SHIFT);
- if (page)
- return page;
+ if (!pmd_present(pmdval))
return no_page_table(vma, flags);
- }
-retry:
- if (!pmd_present(pmdval)) {
- /*
- * Should never reach here, if thp migration is not supported;
- * Otherwise, it must be a thp migration entry.
- */
- VM_BUG_ON(!thp_migration_supported() ||
- !is_pmd_migration_entry(pmdval));
-
- if (likely(!(flags & FOLL_MIGRATION)))
- return no_page_table(vma, flags);
-
- pmd_migration_entry_wait(mm, pmd);
- pmdval = READ_ONCE(*pmd);
- /*
- * MADV_DONTNEED may convert the pmd to null because
- * mmap_lock is held in read mode
- */
- if (pmd_none(pmdval))
- return no_page_table(vma, flags);
- goto retry;
- }
if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
@@ -707,21 +673,13 @@ retry:
if (likely(!pmd_trans_huge(pmdval)))
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
- if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
+ if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags))
return no_page_table(vma, flags);
-retry_locked:
ptl = pmd_lock(mm, pmd);
- if (unlikely(pmd_none(*pmd))) {
- spin_unlock(ptl);
- return no_page_table(vma, flags);
- }
if (unlikely(!pmd_present(*pmd))) {
spin_unlock(ptl);
- if (likely(!(flags & FOLL_MIGRATION)))
- return no_page_table(vma, flags);
- pmd_migration_entry_wait(mm, pmd);
- goto retry_locked;
+ return no_page_table(vma, flags);
}
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(ptl);
@@ -764,20 +722,6 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
pud = pud_offset(p4dp, address);
if (pud_none(*pud))
return no_page_table(vma, flags);
- if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
- page = follow_huge_pud(mm, address, pud, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
- if (is_hugepd(__hugepd(pud_val(*pud)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(pud_val(*pud)), flags,
- PUD_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
if (pud_devmap(*pud)) {
ptl = pud_lock(mm, pud);
page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
@@ -797,7 +741,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
struct follow_page_context *ctx)
{
p4d_t *p4d;
- struct page *page;
p4d = p4d_offset(pgdp, address);
if (p4d_none(*p4d))
@@ -806,14 +749,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
if (unlikely(p4d_bad(*p4d)))
return no_page_table(vma, flags);
- if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(p4d_val(*p4d)), flags,
- P4D_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
return follow_pud_mask(vma, address, p4d, flags, ctx);
}
@@ -851,10 +786,18 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
ctx->page_mask = 0;
- /* make this handle hugepd */
- page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
- if (!IS_ERR(page)) {
- WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
+ /*
+ * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
+ * special hugetlb page table walking code. This eliminates the
+ * need to check for hugetlb entries in the general walking code.
+ *
+ * hugetlb_follow_page_mask is only for follow_page() handling here.
+ * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
+ */
+ if (is_vm_hugetlb_page(vma)) {
+ page = hugetlb_follow_page_mask(vma, address, flags);
+ if (!page)
+ page = no_page_table(vma, flags);
return page;
}
@@ -863,21 +806,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
return no_page_table(vma, flags);
- if (pgd_huge(*pgd)) {
- page = follow_huge_pgd(mm, address, pgd, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
- if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(pgd_val(*pgd)), flags,
- PGDIR_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
-
return follow_p4d_mask(vma, address, pgd, flags, ctx);
}
@@ -941,10 +869,9 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
goto unmap;
*page = pte_page(*pte);
}
- if (unlikely(!try_grab_page(*page, gup_flags))) {
- ret = -ENOMEM;
+ ret = try_grab_page(*page, gup_flags);
+ if (unlikely(ret))
goto unmap;
- }
out:
ret = 0;
unmap:
@@ -970,8 +897,17 @@ static int faultin_page(struct vm_area_struct *vma,
fault_flags |= FAULT_FLAG_WRITE;
if (*flags & FOLL_REMOTE)
fault_flags |= FAULT_FLAG_REMOTE;
- if (locked)
+ if (locked) {
fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ /*
+ * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
+ * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE.
+ * That's because some callers may not be prepared to
+ * handle early exits caused by non-fatal signals.
+ */
+ if (*flags & FOLL_INTERRUPTIBLE)
+ fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
+ }
if (*flags & FOLL_NOWAIT)
fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
if (*flags & FOLL_TRIED) {
@@ -1039,6 +975,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
return -EOPNOTSUPP;
+ if ((gup_flags & FOLL_LONGTERM) && (gup_flags & FOLL_PCI_P2PDMA))
+ return -EOPNOTSUPP;
+
if (vma_is_secretmem(vma))
return -EFAULT;
@@ -1046,6 +985,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
+ /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
+ if (is_vm_hugetlb_page(vma))
+ return -EFAULT;
/*
* We used to let the write,force case do COW in a
* VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
@@ -1153,14 +1095,6 @@ static long __get_user_pages(struct mm_struct *mm,
VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
- /*
- * If FOLL_FORCE is set then do not force a full fault as the hinting
- * fault information is unrelated to the reference behaviour of a task
- * using the address space
- */
- if (!(gup_flags & FOLL_FORCE))
- gup_flags |= FOLL_NUMA;
-
do {
struct page *page;
unsigned int foll_flags = gup_flags;
@@ -1381,6 +1315,22 @@ retry:
EXPORT_SYMBOL_GPL(fixup_user_fault);
/*
+ * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is
+ * specified, it'll also respond to generic signals. The caller of GUP
+ * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption.
+ */
+static bool gup_signal_pending(unsigned int flags)
+{
+ if (fatal_signal_pending(current))
+ return true;
+
+ if (!(flags & FOLL_INTERRUPTIBLE))
+ return false;
+
+ return signal_pending(current);
+}
+
+/*
* Please note that this function, unlike __get_user_pages will not
* return 0 for nr_pages > 0 without FOLL_NOWAIT
*/
@@ -1461,11 +1411,11 @@ retry:
* Repeat on the address that fired VM_FAULT_RETRY
* with both FAULT_FLAG_ALLOW_RETRY and
* FAULT_FLAG_TRIED. Note that GUP can be interrupted
- * by fatal signals, so we need to check it before we
+ * by fatal signals or even common signals, depending on
+ * the caller's request. So we need to check it before we
* start trying again otherwise it can loop forever.
*/
-
- if (fatal_signal_pending(current)) {
+ if (gup_signal_pending(flags)) {
if (!pages_done)
pages_done = -EINTR;
break;
@@ -1667,10 +1617,11 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
if (!locked) {
locked = 1;
mmap_read_lock(mm);
- vma = find_vma(mm, nstart);
+ vma = find_vma_intersection(mm, nstart, end);
} else if (nstart >= vma->vm_end)
- vma = vma->vm_next;
- if (!vma || vma->vm_start >= end)
+ vma = find_vma_intersection(mm, vma->vm_end, end);
+
+ if (!vma)
break;
/*
* Set [nstart; nend) to intersection of desired address
@@ -1927,20 +1878,16 @@ struct page *get_dump_page(unsigned long addr)
#ifdef CONFIG_MIGRATION
/*
- * Check whether all pages are pinnable, if so return number of pages. If some
- * pages are not pinnable, migrate them, and unpin all pages. Return zero if
- * pages were migrated, or if some pages were not successfully isolated.
- * Return negative error if migration fails.
+ * Returns the number of collected pages. Return value is always >= 0.
*/
-static long check_and_migrate_movable_pages(unsigned long nr_pages,
- struct page **pages,
- unsigned int gup_flags)
+static unsigned long collect_longterm_unpinnable_pages(
+ struct list_head *movable_page_list,
+ unsigned long nr_pages,
+ struct page **pages)
{
- unsigned long isolation_error_count = 0, i;
+ unsigned long i, collected = 0;
struct folio *prev_folio = NULL;
- LIST_HEAD(movable_page_list);
- bool drain_allow = true, coherent_pages = false;
- int ret = 0;
+ bool drain_allow = true;
for (i = 0; i < nr_pages; i++) {
struct folio *folio = page_folio(pages[i]);
@@ -1949,45 +1896,16 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
continue;
prev_folio = folio;
- /*
- * Device coherent pages are managed by a driver and should not
- * be pinned indefinitely as it prevents the driver moving the
- * page. So when trying to pin with FOLL_LONGTERM instead try
- * to migrate the page out of device memory.
- */
- if (folio_is_device_coherent(folio)) {
- /*
- * We always want a new GUP lookup with device coherent
- * pages.
- */
- pages[i] = 0;
- coherent_pages = true;
-
- /*
- * Migration will fail if the page is pinned, so convert
- * the pin on the source page to a normal reference.
- */
- if (gup_flags & FOLL_PIN) {
- get_page(&folio->page);
- unpin_user_page(&folio->page);
- }
+ if (folio_is_longterm_pinnable(folio))
+ continue;
- ret = migrate_device_coherent_page(&folio->page);
- if (ret)
- goto unpin_pages;
+ collected++;
+ if (folio_is_device_coherent(folio))
continue;
- }
- if (folio_is_longterm_pinnable(folio))
- continue;
- /*
- * Try to move out any movable page before pinning the range.
- */
if (folio_test_hugetlb(folio)) {
- if (isolate_hugetlb(&folio->page,
- &movable_page_list))
- isolation_error_count++;
+ isolate_hugetlb(&folio->page, movable_page_list);
continue;
}
@@ -1996,63 +1914,124 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
drain_allow = false;
}
- if (folio_isolate_lru(folio)) {
- isolation_error_count++;
+ if (!folio_isolate_lru(folio))
continue;
- }
- list_add_tail(&folio->lru, &movable_page_list);
+
+ list_add_tail(&folio->lru, movable_page_list);
node_stat_mod_folio(folio,
NR_ISOLATED_ANON + folio_is_file_lru(folio),
folio_nr_pages(folio));
}
- if (!list_empty(&movable_page_list) || isolation_error_count ||
- coherent_pages)
- goto unpin_pages;
+ return collected;
+}
- /*
- * If list is empty, and no isolation errors, means that all pages are
- * in the correct zone.
- */
- return nr_pages;
+/*
+ * Unpins all pages and migrates device coherent pages and movable_page_list.
+ * Returns -EAGAIN if all pages were successfully migrated or -errno for failure
+ * (or partial success).
+ */
+static int migrate_longterm_unpinnable_pages(
+ struct list_head *movable_page_list,
+ unsigned long nr_pages,
+ struct page **pages)
+{
+ int ret;
+ unsigned long i;
-unpin_pages:
- /*
- * pages[i] might be NULL if any device coherent pages were found.
- */
for (i = 0; i < nr_pages; i++) {
- if (!pages[i])
+ struct folio *folio = page_folio(pages[i]);
+
+ if (folio_is_device_coherent(folio)) {
+ /*
+ * Migration will fail if the page is pinned, so convert
+ * the pin on the source page to a normal reference.
+ */
+ pages[i] = NULL;
+ folio_get(folio);
+ gup_put_folio(folio, 1, FOLL_PIN);
+
+ if (migrate_device_coherent_page(&folio->page)) {
+ ret = -EBUSY;
+ goto err;
+ }
+
continue;
+ }
- if (gup_flags & FOLL_PIN)
- unpin_user_page(pages[i]);
- else
- put_page(pages[i]);
+ /*
+ * We can't migrate pages with unexpected references, so drop
+ * the reference obtained by __get_user_pages_locked().
+ * Migrating pages have been added to movable_page_list after
+ * calling folio_isolate_lru() which takes a reference so the
+ * page won't be freed if it's migrating.
+ */
+ unpin_user_page(pages[i]);
+ pages[i] = NULL;
}
- if (!list_empty(&movable_page_list)) {
+ if (!list_empty(movable_page_list)) {
struct migration_target_control mtc = {
.nid = NUMA_NO_NODE,
.gfp_mask = GFP_USER | __GFP_NOWARN,
};
- ret = migrate_pages(&movable_page_list, alloc_migration_target,
- NULL, (unsigned long)&mtc, MIGRATE_SYNC,
- MR_LONGTERM_PIN, NULL);
- if (ret > 0) /* number of pages not migrated */
+ if (migrate_pages(movable_page_list, alloc_migration_target,
+ NULL, (unsigned long)&mtc, MIGRATE_SYNC,
+ MR_LONGTERM_PIN, NULL)) {
ret = -ENOMEM;
+ goto err;
+ }
}
- if (ret && !list_empty(&movable_page_list))
- putback_movable_pages(&movable_page_list);
+ putback_movable_pages(movable_page_list);
+
+ return -EAGAIN;
+
+err:
+ for (i = 0; i < nr_pages; i++)
+ if (pages[i])
+ unpin_user_page(pages[i]);
+ putback_movable_pages(movable_page_list);
+
return ret;
}
+
+/*
+ * Check whether all pages are *allowed* to be pinned. Rather confusingly, all
+ * pages in the range are required to be pinned via FOLL_PIN, before calling
+ * this routine.
+ *
+ * If any pages in the range are not allowed to be pinned, then this routine
+ * will migrate those pages away, unpin all the pages in the range and return
+ * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then
+ * call this routine again.
+ *
+ * If an error other than -EAGAIN occurs, this indicates a migration failure.
+ * The caller should give up, and propagate the error back up the call stack.
+ *
+ * If everything is OK and all pages in the range are allowed to be pinned, then
+ * this routine leaves all pages pinned and returns zero for success.
+ */
+static long check_and_migrate_movable_pages(unsigned long nr_pages,
+ struct page **pages)
+{
+ unsigned long collected;
+ LIST_HEAD(movable_page_list);
+
+ collected = collect_longterm_unpinnable_pages(&movable_page_list,
+ nr_pages, pages);
+ if (!collected)
+ return 0;
+
+ return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages,
+ pages);
+}
#else
static long check_and_migrate_movable_pages(unsigned long nr_pages,
- struct page **pages,
- unsigned int gup_flags)
+ struct page **pages)
{
- return nr_pages;
+ return 0;
}
#endif /* CONFIG_MIGRATION */
@@ -2065,25 +2044,53 @@ static long __gup_longterm_locked(struct mm_struct *mm,
unsigned long nr_pages,
struct page **pages,
struct vm_area_struct **vmas,
+ int *locked,
unsigned int gup_flags)
{
+ bool must_unlock = false;
unsigned int flags;
- long rc;
+ long rc, nr_pinned_pages;
+
+ if (locked && WARN_ON_ONCE(!*locked))
+ return -EINVAL;
if (!(gup_flags & FOLL_LONGTERM))
return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
- NULL, gup_flags);
+ locked, gup_flags);
+
+ /*
+ * If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM
+ * implies FOLL_PIN (although the reverse is not true). Therefore it is
+ * correct to unconditionally call check_and_migrate_movable_pages()
+ * which assumes pages have been pinned via FOLL_PIN.
+ *
+ * Enforce the above reasoning by asserting that FOLL_PIN is set.
+ */
+ if (WARN_ON(!(gup_flags & FOLL_PIN)))
+ return -EINVAL;
flags = memalloc_pin_save();
do {
- rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
- NULL, gup_flags);
- if (rc <= 0)
+ if (locked && !*locked) {
+ mmap_read_lock(mm);
+ must_unlock = true;
+ *locked = 1;
+ }
+ nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
+ pages, vmas, locked,
+ gup_flags);
+ if (nr_pinned_pages <= 0) {
+ rc = nr_pinned_pages;
break;
- rc = check_and_migrate_movable_pages(rc, pages, gup_flags);
- } while (!rc);
+ }
+ rc = check_and_migrate_movable_pages(nr_pinned_pages, pages);
+ } while (rc == -EAGAIN);
memalloc_pin_restore(flags);
- return rc;
+ if (locked && *locked && must_unlock) {
+ mmap_read_unlock(mm);
+ *locked = 0;
+ }
+ return rc ? rc : nr_pinned_pages;
}
static bool is_valid_gup_flags(unsigned int gup_flags)
@@ -2106,35 +2113,6 @@ static bool is_valid_gup_flags(unsigned int gup_flags)
}
#ifdef CONFIG_MMU
-static long __get_user_pages_remote(struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
-{
- /*
- * Parts of FOLL_LONGTERM behavior are incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. However, this only comes up if locked is set, and there are
- * callers that do request FOLL_LONGTERM, but do not set locked. So,
- * allow what we can.
- */
- if (gup_flags & FOLL_LONGTERM) {
- if (WARN_ON_ONCE(locked))
- return -EINVAL;
- /*
- * This will check the vmas (even if our vmas arg is NULL)
- * and return -ENOTSUPP if DAX isn't allowed in this case:
- */
- return __gup_longterm_locked(mm, start, nr_pages, pages,
- vmas, gup_flags | FOLL_TOUCH |
- FOLL_REMOTE);
- }
-
- return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
- locked,
- gup_flags | FOLL_TOUCH | FOLL_REMOTE);
-}
-
/**
* get_user_pages_remote() - pin user pages in memory
* @mm: mm_struct of target mm
@@ -2203,8 +2181,8 @@ long get_user_pages_remote(struct mm_struct *mm,
if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
- return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
- pages, vmas, locked);
+ return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked,
+ gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);
@@ -2216,14 +2194,6 @@ long get_user_pages_remote(struct mm_struct *mm,
{
return 0;
}
-
-static long __get_user_pages_remote(struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
-{
- return 0;
-}
#endif /* !CONFIG_MMU */
/**
@@ -2250,7 +2220,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
return -EINVAL;
return __gup_longterm_locked(current->mm, start, nr_pages,
- pages, vmas, gup_flags | FOLL_TOUCH);
+ pages, vmas, NULL, gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);
@@ -2276,18 +2246,9 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
int locked = 1;
long ret;
- /*
- * FIXME: Current FOLL_LONGTERM behavior is incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. As there are no users of this flag in this call we simply
- * disallow this option for now.
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
- return -EINVAL;
-
mmap_read_lock(mm);
- ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
- &locked, gup_flags | FOLL_TOUCH);
+ ret = __gup_longterm_locked(mm, start, nr_pages, pages, NULL, &locked,
+ gup_flags | FOLL_TOUCH);
if (locked)
mmap_read_unlock(mm);
return ret;
@@ -2345,8 +2306,28 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
}
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do the following, in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already). To avoid a race in this situation, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -2358,11 +2339,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
struct page *page;
struct folio *folio;
- /*
- * Similar to the PMD case below, NUMA hinting must take slow
- * path using the pte_protnone check.
- */
- if (pte_protnone(pte))
+ if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
goto pte_unmap;
if (!pte_access_permitted(pte, flags & FOLL_WRITE))
@@ -2392,12 +2369,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
goto pte_unmap;
}
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+ unlikely(pte_val(pte) != pte_val(*ptep))) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
- if (!pte_write(pte) && gup_must_unshare(flags, page)) {
+ if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2439,8 +2417,9 @@ pte_unmap:
* get_user_pages_fast_only implementation that can pin pages. Thus it's still
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
return 0;
}
@@ -2462,9 +2441,15 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
undo_dev_pagemap(nr, nr_start, flags, pages);
break;
}
+
+ if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) {
+ undo_dev_pagemap(nr, nr_start, flags, pages);
+ break;
+ }
+
SetPageReferenced(page);
pages[*nr] = page;
- if (unlikely(!try_grab_page(page, flags))) {
+ if (unlikely(try_grab_page(page, flags))) {
undo_dev_pagemap(nr, nr_start, flags, pages);
break;
}
@@ -2582,7 +2567,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
return 0;
}
- if (!pte_write(pte) && gup_must_unshare(flags, &folio->page)) {
+ if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
gup_put_folio(folio, refs, flags);
return 0;
}
@@ -2648,7 +2633,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
- if (!pmd_write(orig) && gup_must_unshare(flags, &folio->page)) {
+ if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
gup_put_folio(folio, refs, flags);
return 0;
}
@@ -2688,7 +2673,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
- if (!pud_write(orig) && gup_must_unshare(flags, &folio->page)) {
+ if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
gup_put_folio(folio, refs, flags);
return 0;
}
@@ -2736,7 +2721,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
pmdp = pmd_offset_lockless(pudp, pud, addr);
do {
- pmd_t pmd = READ_ONCE(*pmdp);
+ pmd_t pmd = pmdp_get_lockless(pmdp);
next = pmd_addr_end(addr, end);
if (!pmd_present(pmd))
@@ -2744,12 +2729,8 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
pmd_devmap(pmd))) {
- /*
- * NUMA hinting faults need to be handled in the GUP
- * slowpath for accounting purposes and so that they
- * can be serialised against THP migration.
- */
- if (pmd_protnone(pmd))
+ if (pmd_protnone(pmd) &&
+ !gup_can_follow_protnone(flags))
return 0;
if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
@@ -2764,7 +2745,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
+ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
@@ -2784,7 +2765,7 @@ static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned lo
next = pud_addr_end(addr, end);
if (unlikely(!pud_present(pud)))
return 0;
- if (unlikely(pud_huge(pud))) {
+ if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
if (!gup_huge_pud(pud, pudp, addr, next, flags,
pages, nr))
return 0;
@@ -2867,29 +2848,6 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end)
}
#endif
-static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
- unsigned int gup_flags, struct page **pages)
-{
- int ret;
-
- /*
- * FIXME: FOLL_LONGTERM does not work with
- * get_user_pages_unlocked() (see comments in that function)
- */
- if (gup_flags & FOLL_LONGTERM) {
- mmap_read_lock(current->mm);
- ret = __gup_longterm_locked(current->mm,
- start, nr_pages,
- pages, NULL, gup_flags);
- mmap_read_unlock(current->mm);
- } else {
- ret = get_user_pages_unlocked(start, nr_pages,
- pages, gup_flags);
- }
-
- return ret;
-}
-
static unsigned long lockless_pages_from_mm(unsigned long start,
unsigned long end,
unsigned int gup_flags,
@@ -2950,7 +2908,8 @@ static int internal_get_user_pages_fast(unsigned long start,
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
FOLL_FORCE | FOLL_PIN | FOLL_GET |
- FOLL_FAST_ONLY | FOLL_NOFAULT)))
+ FOLL_FAST_ONLY | FOLL_NOFAULT |
+ FOLL_PCI_P2PDMA)))
return -EINVAL;
if (gup_flags & FOLL_PIN)
@@ -2973,8 +2932,8 @@ static int internal_get_user_pages_fast(unsigned long start,
/* Slow path: try to get the remaining pages with get_user_pages */
start += nr_pinned << PAGE_SHIFT;
pages += nr_pinned;
- ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
- pages);
+ ret = get_user_pages_unlocked(start, nr_pages - nr_pinned, pages,
+ gup_flags);
if (ret < 0) {
/*
* The caller has to unpin the pages we already pinned so
@@ -3173,9 +3132,9 @@ long pin_user_pages_remote(struct mm_struct *mm,
if (WARN_ON_ONCE(!pages))
return -EINVAL;
- gup_flags |= FOLL_PIN;
- return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
- pages, vmas, locked);
+ return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked,
+ gup_flags | FOLL_PIN | FOLL_TOUCH |
+ FOLL_REMOTE);
}
EXPORT_SYMBOL(pin_user_pages_remote);
@@ -3209,7 +3168,7 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
gup_flags |= FOLL_PIN;
return __gup_longterm_locked(current->mm, start, nr_pages,
- pages, vmas, gup_flags);
+ pages, vmas, NULL, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
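The new comment above gup_pte_range() spells out the ordering fast-gup now relies on: pin first, then re-check that neither the PTE nor the PMD changed, so a racing THP collapse (which clears the PMD before freeing the page-table page) is caught. A minimal sketch of that shape follows; it is illustrative only, simplified from the patch, and the actual pinning step (try_grab_folio() in gup.c) is elided.

#include <linux/mm.h>
#include <linux/pgtable.h>

/*
 * Illustrative sketch only, not part of the patch: the fast-gup ordering
 * from the gup_pte_range() comment above.  (1) read the PTE and pin the
 * page, then (2) re-check that neither the PTE nor the PMD changed, so a
 * concurrent THP collapse (which clears the PMD first) is detected.
 */
static int fast_gup_one_sketch(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
			       unsigned int flags, struct page **pagep)
{
	pte_t *ptep = pte_offset_map(&pmd, addr);
	pte_t pte = ptep_get_lockless(ptep);
	struct page *page;
	int ret = 0;

	if (!pte_present(pte))
		goto out;

	page = pte_page(pte);
	/* (1) pin the page here, e.g. via try_grab_folio(page, 1, flags) */

	/* (2) re-check: bail out (and drop the pin) if anything moved */
	if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
	    unlikely(pte_val(pte) != pte_val(*ptep)))
		goto out;

	*pagep = page;
	ret = 1;
out:
	pte_unmap(ptep);
	return ret;
}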
diff --git a/mm/gup_test.c b/mm/gup_test.c
index 12b0a91767d3..8ae7307a1bb6 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -4,6 +4,7 @@
#include <linux/uaccess.h>
#include <linux/ktime.h>
#include <linux/debugfs.h>
+#include <linux/highmem.h>
#include "gup_test.h"
static void put_back_pages(unsigned int cmd, struct page **pages,
@@ -203,6 +204,138 @@ free_pages:
return ret;
}
+static DEFINE_MUTEX(pin_longterm_test_mutex);
+static struct page **pin_longterm_test_pages;
+static unsigned long pin_longterm_test_nr_pages;
+
+static inline void pin_longterm_test_stop(void)
+{
+ if (pin_longterm_test_pages) {
+ if (pin_longterm_test_nr_pages)
+ unpin_user_pages(pin_longterm_test_pages,
+ pin_longterm_test_nr_pages);
+ kvfree(pin_longterm_test_pages);
+ pin_longterm_test_pages = NULL;
+ pin_longterm_test_nr_pages = 0;
+ }
+}
+
+static inline int pin_longterm_test_start(unsigned long arg)
+{
+ long nr_pages, cur_pages, addr, remaining_pages;
+ int gup_flags = FOLL_LONGTERM;
+ struct pin_longterm_test args;
+ struct page **pages;
+ int ret = 0;
+ bool fast;
+
+ if (pin_longterm_test_pages)
+ return -EINVAL;
+
+ if (copy_from_user(&args, (void __user *)arg, sizeof(args)))
+ return -EFAULT;
+
+ if (args.flags &
+ ~(PIN_LONGTERM_TEST_FLAG_USE_WRITE|PIN_LONGTERM_TEST_FLAG_USE_FAST))
+ return -EINVAL;
+ if (!IS_ALIGNED(args.addr | args.size, PAGE_SIZE))
+ return -EINVAL;
+ if (args.size > LONG_MAX)
+ return -EINVAL;
+ nr_pages = args.size / PAGE_SIZE;
+ if (!nr_pages)
+ return -EINVAL;
+
+ pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ if (args.flags & PIN_LONGTERM_TEST_FLAG_USE_WRITE)
+ gup_flags |= FOLL_WRITE;
+ fast = !!(args.flags & PIN_LONGTERM_TEST_FLAG_USE_FAST);
+
+ if (!fast && mmap_read_lock_killable(current->mm)) {
+ kvfree(pages);
+ return -EINTR;
+ }
+
+ pin_longterm_test_pages = pages;
+ pin_longterm_test_nr_pages = 0;
+
+ while (nr_pages - pin_longterm_test_nr_pages) {
+ remaining_pages = nr_pages - pin_longterm_test_nr_pages;
+ addr = args.addr + pin_longterm_test_nr_pages * PAGE_SIZE;
+
+ if (fast)
+ cur_pages = pin_user_pages_fast(addr, remaining_pages,
+ gup_flags, pages);
+ else
+ cur_pages = pin_user_pages(addr, remaining_pages,
+ gup_flags, pages, NULL);
+ if (cur_pages < 0) {
+ pin_longterm_test_stop();
+ ret = cur_pages;
+ break;
+ }
+ pin_longterm_test_nr_pages += cur_pages;
+ pages += cur_pages;
+ }
+
+ if (!fast)
+ mmap_read_unlock(current->mm);
+ return ret;
+}
+
+static inline int pin_longterm_test_read(unsigned long arg)
+{
+ __u64 user_addr;
+ unsigned long i;
+
+ if (!pin_longterm_test_pages)
+ return -EINVAL;
+
+ if (copy_from_user(&user_addr, (void __user *)arg, sizeof(user_addr)))
+ return -EFAULT;
+
+ for (i = 0; i < pin_longterm_test_nr_pages; i++) {
+ void *addr = kmap_local_page(pin_longterm_test_pages[i]);
+ unsigned long ret;
+
+ ret = copy_to_user((void __user *)(unsigned long)user_addr, addr,
+ PAGE_SIZE);
+ kunmap_local(addr);
+ if (ret)
+ return -EFAULT;
+ user_addr += PAGE_SIZE;
+ }
+ return 0;
+}
+
+static long pin_longterm_test_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ int ret = -EINVAL;
+
+ if (mutex_lock_killable(&pin_longterm_test_mutex))
+ return -EINTR;
+
+ switch (cmd) {
+ case PIN_LONGTERM_TEST_START:
+ ret = pin_longterm_test_start(arg);
+ break;
+ case PIN_LONGTERM_TEST_STOP:
+ pin_longterm_test_stop();
+ ret = 0;
+ break;
+ case PIN_LONGTERM_TEST_READ:
+ ret = pin_longterm_test_read(arg);
+ break;
+ }
+
+ mutex_unlock(&pin_longterm_test_mutex);
+ return ret;
+}
+
static long gup_test_ioctl(struct file *filep, unsigned int cmd,
unsigned long arg)
{
@@ -217,6 +350,10 @@ static long gup_test_ioctl(struct file *filep, unsigned int cmd,
case PIN_BASIC_TEST:
case DUMP_USER_PAGES_TEST:
break;
+ case PIN_LONGTERM_TEST_START:
+ case PIN_LONGTERM_TEST_STOP:
+ case PIN_LONGTERM_TEST_READ:
+ return pin_longterm_test_ioctl(filep, cmd, arg);
default:
return -EINVAL;
}
@@ -234,9 +371,17 @@ static long gup_test_ioctl(struct file *filep, unsigned int cmd,
return 0;
}
+static int gup_test_release(struct inode *inode, struct file *file)
+{
+ pin_longterm_test_stop();
+
+ return 0;
+}
+
static const struct file_operations gup_test_fops = {
.open = nonseekable_open,
.unlocked_ioctl = gup_test_ioctl,
+ .release = gup_test_release,
};
static int __init gup_test_init(void)
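pin_longterm_test_start() above is, in effect, a worked example of long-term pinning from kernel context via the slow path. For completeness, a driver-style sketch of the fast variant (which does not take mmap_lock) is shown below; the function and variable names are hypothetical and error handling is minimal.

#include <linux/mm.h>
#include <linux/slab.h>

/* Hypothetical driver-style sketch, not part of the patch. */
static struct page **example_pages;
static int example_nr_pinned;

static int example_pin_user_buffer(unsigned long uaddr, int nr_pages)
{
	example_pages = kvcalloc(nr_pages, sizeof(*example_pages), GFP_KERNEL);
	if (!example_pages)
		return -ENOMEM;

	/* FOLL_LONGTERM makes GUP migrate unpinnable pages first */
	example_nr_pinned = pin_user_pages_fast(uaddr, nr_pages,
						FOLL_LONGTERM | FOLL_WRITE,
						example_pages);
	if (example_nr_pinned < 0) {
		kvfree(example_pages);
		return example_nr_pinned;
	}
	/* Note: may be a partial pin; example_nr_pinned <= nr_pages. */
	return 0;
}

static void example_unpin_user_buffer(void)
{
	unpin_user_pages(example_pages, example_nr_pinned);
	kvfree(example_pages);
}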
diff --git a/mm/gup_test.h b/mm/gup_test.h
index 887ac1d5f5bc..5b37b54e8bea 100644
--- a/mm/gup_test.h
+++ b/mm/gup_test.h
@@ -10,6 +10,9 @@
#define GUP_BASIC_TEST _IOWR('g', 4, struct gup_test)
#define PIN_BASIC_TEST _IOWR('g', 5, struct gup_test)
#define DUMP_USER_PAGES_TEST _IOWR('g', 6, struct gup_test)
+#define PIN_LONGTERM_TEST_START _IOW('g', 7, struct pin_longterm_test)
+#define PIN_LONGTERM_TEST_STOP _IO('g', 8)
+#define PIN_LONGTERM_TEST_READ _IOW('g', 9, __u64)
#define GUP_TEST_MAX_PAGES_TO_DUMP 8
@@ -30,4 +33,13 @@ struct gup_test {
__u32 which_pages[GUP_TEST_MAX_PAGES_TO_DUMP];
};
+#define PIN_LONGTERM_TEST_FLAG_USE_WRITE 1
+#define PIN_LONGTERM_TEST_FLAG_USE_FAST 2
+
+struct pin_longterm_test {
+ __u64 addr;
+ __u64 size;
+ __u32 flags;
+};
+
#endif /* __GUP_TEST_H */
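The new ioctls above are intended to be driven from userspace, as the kernel's GUP selftests do. A minimal sketch follows; it assumes debugfs is mounted at /sys/kernel/debug, that this header is reachable by the program (as the selftests arrange), and it hard-codes a 4 KiB page size.

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/types.h>
#include "gup_test.h"		/* the definitions above */

int main(void)
{
	__u64 size = 4 * 4096;	/* assumes 4 KiB pages */
	void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct pin_longterm_test args = {
		.addr = (__u64)(unsigned long)mem,
		.size = size,
		.flags = PIN_LONGTERM_TEST_FLAG_USE_WRITE,
	};
	int fd = open("/sys/kernel/debug/gup_test", O_RDWR);

	if (mem == MAP_FAILED || fd < 0)
		return 1;
	memset(mem, 0xa5, size);

	if (ioctl(fd, PIN_LONGTERM_TEST_START, &args))
		return 1;
	/* Pages stay long-term pinned until STOP (or until the fd is
	 * closed, thanks to the new gup_test_release() hook). */
	ioctl(fd, PIN_LONGTERM_TEST_STOP);
	close(fd);
	munmap(mem, size);
	return 0;
}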
diff --git a/mm/highmem.c b/mm/highmem.c
index c707d7202d5f..db251e77f98f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -30,6 +30,17 @@
#include <asm/tlbflush.h>
#include <linux/vmalloc.h>
+#ifdef CONFIG_KMAP_LOCAL
+static inline int kmap_local_calc_idx(int idx)
+{
+ return idx + KM_MAX_IDX * smp_processor_id();
+}
+
+#ifndef arch_kmap_local_map_idx
+#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx)
+#endif
+#endif /* CONFIG_KMAP_LOCAL */
+
/*
* Virtual_count is not a pure "count".
* 0 means that it is not mapped, and has not been mapped
@@ -142,12 +153,29 @@ pte_t *pkmap_page_table;
struct page *__kmap_to_page(void *vaddr)
{
+ unsigned long base = (unsigned long) vaddr & PAGE_MASK;
+ struct kmap_ctrl *kctrl = &current->kmap_ctrl;
unsigned long addr = (unsigned long)vaddr;
+ int i;
+
+ /* kmap() mappings */
+ if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) &&
+ addr < PKMAP_ADDR(LAST_PKMAP)))
+ return pte_page(pkmap_page_table[PKMAP_NR(addr)]);
- if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
- int i = PKMAP_NR(addr);
+ /* kmap_local_page() mappings */
+ if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) &&
+ base < __fix_to_virt(FIX_KMAP_BEGIN))) {
+ for (i = 0; i < kctrl->idx; i++) {
+ unsigned long base_addr;
+ int idx;
- return pte_page(pkmap_page_table[i]);
+ idx = arch_kmap_local_map_idx(i, pte_pfn(pteval));
+ base_addr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+
+ if (base_addr == base)
+ return pte_page(kctrl->pteval[i]);
+ }
}
return virt_to_page(vaddr);
@@ -462,10 +490,6 @@ static inline void kmap_local_idx_pop(void)
# define arch_kmap_local_post_unmap(vaddr) do { } while (0)
#endif
-#ifndef arch_kmap_local_map_idx
-#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx)
-#endif
-
#ifndef arch_kmap_local_unmap_idx
#define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx)
#endif
@@ -494,11 +518,6 @@ static inline bool kmap_high_unmap_local(unsigned long vaddr)
return false;
}
-static inline int kmap_local_calc_idx(int idx)
-{
- return idx + KM_MAX_IDX * smp_processor_id();
-}
-
static pte_t *__kmap_pte;
static pte_t *kmap_get_pte(unsigned long vaddr, int idx)
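The __kmap_to_page() rework above means kmap_to_page() now also resolves addresses returned by kmap_local_page(), not just classic kmap() addresses. A tiny round-trip sketch, illustrative only and assuming the local mapping is still live:

#include <linux/highmem.h>

/* Illustrative only: translate a live kmap_local_page() address back. */
static bool kmap_local_roundtrip(struct page *page)
{
	void *vaddr = kmap_local_page(page);
	bool ok = (kmap_to_page(vaddr) == page);

	kunmap_local(vaddr);
	return ok;
}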
diff --git a/mm/hmm.c b/mm/hmm.c
index f2aa63b94d9b..601a99ce3c84 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -253,7 +253,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
cpu_flags = HMM_PFN_VALID;
if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
- *hmm_pfn = swp_offset(entry) | cpu_flags;
+ *hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
return 0;
}
@@ -361,8 +361,7 @@ again:
* huge or device mapping one and compute corresponding pfn
* values.
*/
- pmd = pmd_read_atomic(pmdp);
- barrier();
+ pmd = pmdp_get_lockless(pmdp);
if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
goto again;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e9414ee57c5b..abe6cfd92ffa 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -36,6 +36,7 @@
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
+#include <linux/memory-tiers.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -70,9 +71,8 @@ static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
-bool hugepage_vma_check(struct vm_area_struct *vma,
- unsigned long vm_flags,
- bool smaps, bool in_pf)
+bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
+ bool smaps, bool in_pf, bool enforce_sysfs)
{
if (!vma->vm_mm) /* vdso */
return false;
@@ -119,13 +119,12 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
* own flags.
*/
if (!in_pf && shmem_file(vma->vm_file))
- return shmem_huge_enabled(vma);
+ return shmem_huge_enabled(vma, !enforce_sysfs);
- if (!hugepage_flags_enabled())
- return false;
-
- /* THP settings require madvise. */
- if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always())
+ /* Enforce sysfs THP requirements as necessary */
+ if (enforce_sysfs &&
+ (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
+ !hugepage_flags_always())))
return false;
/* Only regular file is valid */
@@ -164,7 +163,6 @@ retry:
count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
return false;
}
- count_vm_event(THP_ZERO_PAGE_ALLOC);
preempt_disable();
if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
preempt_enable();
@@ -176,6 +174,7 @@ retry:
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
preempt_enable();
+ count_vm_event(THP_ZERO_PAGE_ALLOC);
return true;
}
@@ -772,8 +771,7 @@ static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return;
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
- if (pgtable)
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
}
@@ -1037,6 +1035,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn = pmd_pfn(*pmd);
struct mm_struct *mm = vma->vm_mm;
struct page *page;
+ int ret;
assert_spin_locked(pmd_lockptr(mm, pmd));
@@ -1068,8 +1067,9 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- if (!try_grab_page(page, flags))
- page = ERR_PTR(-ENOMEM);
+ ret = try_grab_page(page, flags);
+ if (ret)
+ page = ERR_PTR(ret);
return page;
}
@@ -1195,6 +1195,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn = pud_pfn(*pud);
struct mm_struct *mm = vma->vm_mm;
struct page *page;
+ int ret;
assert_spin_locked(pud_lockptr(mm, pud));
@@ -1228,8 +1229,10 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- if (!try_grab_page(page, flags))
- page = ERR_PTR(-ENOMEM);
+
+ ret = try_grab_page(page, flags);
+ if (ret)
+ page = ERR_PTR(ret);
return page;
}
@@ -1307,6 +1310,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
+ struct folio *folio;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t orig_pmd = vmf->orig_pmd;
@@ -1314,9 +1318,6 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
- VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
- VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
-
if (is_huge_zero_pmd(orig_pmd))
goto fallback;
@@ -1328,46 +1329,48 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
}
page = pmd_page(orig_pmd);
+ folio = page_folio(page);
VM_BUG_ON_PAGE(!PageHead(page), page);
/* Early check when only holding the PT lock. */
if (PageAnonExclusive(page))
goto reuse;
- if (!trylock_page(page)) {
- get_page(page);
+ if (!folio_trylock(folio)) {
+ folio_get(folio);
spin_unlock(vmf->ptl);
- lock_page(page);
+ folio_lock(folio);
spin_lock(vmf->ptl);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
spin_unlock(vmf->ptl);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return 0;
}
- put_page(page);
+ folio_put(folio);
}
/* Recheck after temporarily dropping the PT lock. */
if (PageAnonExclusive(page)) {
- unlock_page(page);
+ folio_unlock(folio);
goto reuse;
}
/*
- * See do_wp_page(): we can only reuse the page exclusively if there are
- * no additional references. Note that we always drain the LRU
- * pagevecs immediately after adding a THP.
+ * See do_wp_page(): we can only reuse the folio exclusively if
+ * there are no additional references. Note that we always drain
+ * the LRU pagevecs immediately after adding a THP.
*/
- if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page))
+ if (folio_ref_count(folio) >
+ 1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
goto unlock_fallback;
- if (PageSwapCache(page))
- try_to_free_swap(page);
- if (page_count(page) == 1) {
+ if (folio_test_swapcache(folio))
+ folio_free_swap(folio);
+ if (folio_ref_count(folio) == 1) {
pmd_t entry;
page_move_anon_rmap(page, vma);
- unlock_page(page);
+ folio_unlock(folio);
reuse:
if (unlikely(unshare)) {
spin_unlock(vmf->ptl);
@@ -1378,17 +1381,47 @@ reuse:
if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
spin_unlock(vmf->ptl);
- return VM_FAULT_WRITE;
+ return 0;
}
unlock_fallback:
- unlock_page(page);
+ folio_unlock(folio);
spin_unlock(vmf->ptl);
fallback:
__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
return VM_FAULT_FALLBACK;
}
+static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd)
+{
+ struct page *page;
+
+ if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
+ return false;
+
+ /* Don't touch entries that are not even readable (NUMA hinting). */
+ if (pmd_protnone(pmd))
+ return false;
+
+ /* Do we need write faults for softdirty tracking? */
+ if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
+ return false;
+
+ /* Do we need write faults for uffd-wp tracking? */
+ if (userfaultfd_huge_pmd_wp(vma, pmd))
+ return false;
+
+ if (!(vma->vm_flags & VM_SHARED)) {
+ /* See can_change_pte_writable(). */
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ return page && PageAnon(page) && PageAnonExclusive(page);
+ }
+
+ /* See can_change_pte_writable(). */
+ return pmd_dirty(pmd);
+}
+
/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
struct vm_area_struct *vma,
@@ -1434,6 +1467,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
struct page *page;
+ int ret;
assert_spin_locked(pmd_lockptr(mm, pmd));
@@ -1449,17 +1483,18 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
return ERR_PTR(-EFAULT);
/* Full NUMA hinting faults to serialise migration in fault paths */
- if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
+ if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags))
return NULL;
- if (!pmd_write(*pmd) && gup_must_unshare(flags, page))
+ if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page))
return ERR_PTR(-EMLINK);
VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
!PageAnonExclusive(page), page);
- if (!try_grab_page(page, flags))
- return ERR_PTR(-ENOMEM);
+ ret = try_grab_page(page, flags);
+ if (ret)
+ return ERR_PTR(ret);
if (flags & FOLL_TOUCH)
touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
@@ -1479,9 +1514,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
int page_nid = NUMA_NO_NODE;
- int target_nid, last_cpupid = -1;
- bool migrated = false;
- bool was_writable = pmd_savedwrite(oldpmd);
+ int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
+ bool migrated = false, writable = false;
int flags = 0;
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@@ -1491,16 +1525,31 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
}
pmd = pmd_modify(oldpmd, vma->vm_page_prot);
+
+ /*
+ * Detect now whether the PMD could be writable; this information
+ * is only valid while holding the PT lock.
+ */
+ writable = pmd_write(pmd);
+ if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
+ can_change_pmd_writable(vma, vmf->address, pmd))
+ writable = true;
+
page = vm_normal_page_pmd(vma, haddr, pmd);
if (!page)
goto out_map;
/* See similar comment in do_numa_page for explanation */
- if (!was_writable)
+ if (!writable)
flags |= TNF_NO_GROUP;
page_nid = page_to_nid(page);
- last_cpupid = page_cpupid_last(page);
+ /*
+ * For memory tiering mode, cpupid of slow memory page is used
+ * to record page access time. So use default value.
+ */
+ if (node_is_toptier(page_nid))
+ last_cpupid = page_cpupid_last(page);
target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
&flags);
@@ -1510,6 +1559,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
}
spin_unlock(vmf->ptl);
+ writable = false;
migrated = migrate_misplaced_page(page, vma, target_nid);
if (migrated) {
@@ -1536,7 +1586,7 @@ out_map:
/* Restore the PMD */
pmd = pmd_modify(oldpmd, vma->vm_page_prot);
pmd = pmd_mkyoung(pmd);
- if (was_writable)
+ if (writable)
pmd = pmd_mkwrite(pmd);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
@@ -1777,11 +1827,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
pmd_t oldpmd, entry;
- bool preserve_write;
- int ret;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ int ret = 1;
tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
@@ -1792,9 +1841,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (!ptl)
return 0;
- preserve_write = prot_numa && pmd_write(*pmd);
- ret = 1;
-
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (is_swap_pmd(*pmd)) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
@@ -1824,6 +1870,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (prot_numa) {
struct page *page;
+ bool toptier;
/*
* Avoid trapping faults against the zero page. The read-only
* data is likely to be read-cached on the local CPU and
@@ -1836,13 +1883,18 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
goto unlock;
page = pmd_page(*pmd);
+ toptier = node_is_toptier(page_to_nid(page));
/*
* Skip scanning top tier node if normal numa
* balancing is disabled
*/
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
- node_is_toptier(page_to_nid(page)))
+ toptier)
goto unlock;
+
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ !toptier)
+ xchg_page_access_time(page, jiffies_to_msecs(jiffies));
}
/*
* In case prot_numa, we are under mmap_read_lock(mm). It's critical
@@ -1868,8 +1920,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
entry = pmd_modify(oldpmd, newprot);
- if (preserve_write)
- entry = pmd_mk_savedwrite(entry);
if (uffd_wp) {
entry = pmd_wrprotect(entry);
entry = pmd_mkuffd_wp(entry);
@@ -1881,13 +1931,17 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
*/
entry = pmd_clear_uffd_wp(entry);
}
+
+ /* See change_pte_range(). */
+ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
+ can_change_pmd_writable(vma, addr, entry))
+ entry = pmd_mkwrite(entry);
+
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
if (huge_pmd_needs_flush(oldpmd, entry))
tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
-
- BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
unlock:
spin_unlock(ptl);
return ret;
@@ -2029,7 +2083,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
- bool anon_exclusive = false;
+ bool anon_exclusive = false, dirty = false;
unsigned long addr;
int i;
@@ -2113,20 +2167,22 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
write = is_writable_migration_entry(entry);
if (PageAnon(page))
anon_exclusive = is_readable_exclusive_migration_entry(entry);
- young = false;
+ young = is_migration_entry_young(entry);
+ dirty = is_migration_entry_dirty(entry);
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
page = pmd_page(old_pmd);
- if (pmd_dirty(old_pmd))
+ if (pmd_dirty(old_pmd)) {
+ dirty = true;
SetPageDirty(page);
+ }
write = pmd_write(old_pmd);
young = pmd_young(old_pmd);
soft_dirty = pmd_soft_dirty(old_pmd);
uffd_wp = pmd_uffd_wp(old_pmd);
VM_BUG_ON_PAGE(!page_count(page), page);
- page_ref_add(page, HPAGE_PMD_NR - 1);
/*
* Without "freeze", we'll simply split the PMD, propagating the
@@ -2140,10 +2196,14 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*
* In case we cannot clear PageAnonExclusive(), split the PMD
* only and let try_to_migrate_one() fail later.
+ *
+ * See page_try_share_anon_rmap(): invalidate PMD first.
*/
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
freeze = false;
+ if (!freeze)
+ page_ref_add(page, HPAGE_PMD_NR - 1);
}
/*
@@ -2171,6 +2231,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
else
swp_entry = make_readable_migration_entry(
page_to_pfn(page + i));
+ if (young)
+ swp_entry = make_migration_entry_young(swp_entry);
+ if (dirty)
+ swp_entry = make_migration_entry_dirty(swp_entry);
entry = swp_entry_to_pte(swp_entry);
if (soft_dirty)
entry = pte_swp_mksoft_dirty(entry);
@@ -2181,60 +2245,37 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = maybe_mkwrite(entry, vma);
if (anon_exclusive)
SetPageAnonExclusive(page + i);
- if (!write)
- entry = pte_wrprotect(entry);
if (!young)
entry = pte_mkold(entry);
+ /* NOTE: this may set soft-dirty too on some archs */
+ if (dirty)
+ entry = pte_mkdirty(entry);
+ /*
+ * NOTE: this needs to happen after pte_mkdirty,
+ * because some archs (sparc64, loongarch) could
+ * set hw write bit when mkdirty.
+ */
+ if (!write)
+ entry = pte_wrprotect(entry);
if (soft_dirty)
entry = pte_mksoft_dirty(entry);
if (uffd_wp)
entry = pte_mkuffd_wp(entry);
+ page_add_anon_rmap(page + i, vma, addr, false);
}
pte = pte_offset_map(&_pmd, addr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, entry);
- if (!pmd_migration)
- atomic_inc(&page[i]._mapcount);
pte_unmap(pte);
}
- if (!pmd_migration) {
- /*
- * Set PG_double_map before dropping compound_mapcount to avoid
- * false-negative page_mapped().
- */
- if (compound_mapcount(page) > 1 &&
- !TestSetPageDoubleMap(page)) {
- for (i = 0; i < HPAGE_PMD_NR; i++)
- atomic_inc(&page[i]._mapcount);
- }
-
- lock_page_memcg(page);
- if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
- /* Last compound_mapcount is gone. */
- __mod_lruvec_page_state(page, NR_ANON_THPS,
- -HPAGE_PMD_NR);
- if (TestClearPageDoubleMap(page)) {
- /* No need in mapcount reference anymore */
- for (i = 0; i < HPAGE_PMD_NR; i++)
- atomic_dec(&page[i]._mapcount);
- }
- }
- unlock_page_memcg(page);
-
- /* Above is effectively page_remove_rmap(page, vma, true) */
- munlock_vma_page(page, vma, true);
- }
+ if (!pmd_migration)
+ page_remove_rmap(page, vma, true);
+ if (freeze)
+ put_page(page);
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
-
- if (freeze) {
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- page_remove_rmap(page + i, vma, false);
- put_page(page + i);
- }
- }
}
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
@@ -2288,25 +2329,11 @@ out:
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
bool freeze, struct folio *folio)
{
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
-
- pgd = pgd_offset(vma->vm_mm, address);
- if (!pgd_present(*pgd))
- return;
+ pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
- p4d = p4d_offset(pgd, address);
- if (!p4d_present(*p4d))
+ if (!pmd)
return;
- pud = pud_offset(p4d, address);
- if (!pud_present(*pud))
- return;
-
- pmd = pmd_offset(pud, address);
-
__split_huge_pmd(vma, pmd, address, freeze, folio);
}
@@ -2334,24 +2361,23 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
split_huge_pmd_if_needed(vma, end);
/*
- * If we're also updating the vma->vm_next->vm_start,
+ * If we're also updating the next vma vm_start,
* check if we need to split it.
*/
if (adjust_next > 0) {
- struct vm_area_struct *next = vma->vm_next;
+ struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
unsigned long nstart = next->vm_start;
nstart += adjust_next;
split_huge_pmd_if_needed(next, nstart);
}
}
-static void unmap_page(struct page *page)
+static void unmap_folio(struct folio *folio)
{
- struct folio *folio = page_folio(page);
enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
TTU_SYNC;
- VM_BUG_ON_PAGE(!PageHead(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
/*
* Anon pages need migration entries to preserve them, but file
@@ -2368,7 +2394,7 @@ static void remap_page(struct folio *folio, unsigned long nr)
{
int i = 0;
- /* If unmap_page() uses try_to_migrate() on file, remove this check */
+ /* If unmap_folio() uses try_to_migrate() on file, remove this check */
if (!folio_test_anon(folio))
return;
for (;;) {
@@ -2418,7 +2444,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
* for example lock_page() which set PG_waiters.
*
* Note that for mapped sub-pages of an anonymous THP,
- * PG_anon_exclusive has been cleared in unmap_page() and is stored in
+ * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
* the migration entry instead from where remap_page() will restore it.
* We can still have PG_anon_exclusive set on effectively unmapped and
* unreferenced sub-pages of an anonymous THP: we can simply drop
@@ -2435,17 +2461,32 @@ static void __split_huge_page_tail(struct page *head, int tail,
(1L << PG_workingset) |
(1L << PG_locked) |
(1L << PG_unevictable) |
-#ifdef CONFIG_64BIT
+#ifdef CONFIG_ARCH_USES_PG_ARCH_X
(1L << PG_arch_2) |
+ (1L << PG_arch_3) |
#endif
- (1L << PG_dirty)));
+ (1L << PG_dirty) |
+ LRU_GEN_MASK | LRU_REFS_MASK));
- /* ->mapping in first tail page is compound_mapcount */
+ /* ->mapping in first and second tail page is replaced by other uses */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
page_tail);
page_tail->mapping = head->mapping;
page_tail->index = head->index + tail;
- page_tail->private = 0;
+
+ /*
+ * page->private should not be set in tail pages with the exception
+ * of swap cache pages that store the swp_entry_t in tail pages.
+ * Fix up and warn once if private is unexpectedly set.
+ *
+ * What of 32-bit systems, on which head[1].compound_pincount overlays
+ * head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and
+ * compound_pincount must be 0 for folio_ref_freeze() to have succeeded.
+ */
+ if (!folio_test_swapcache(page_folio(head))) {
+ VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail);
+ page_tail->private = 0;
+ }
/* Page flags must be visible before we make the page non-compound. */
smp_wmb();
@@ -2611,27 +2652,26 @@ bool can_split_folio(struct folio *folio, int *pextra_pins)
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct folio *folio = page_folio(page);
- struct page *head = &folio->page;
- struct deferred_split *ds_queue = get_deferred_split_queue(head);
- XA_STATE(xas, &head->mapping->i_pages, head->index);
+ struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page);
+ XA_STATE(xas, &folio->mapping->i_pages, folio->index);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int extra_pins, ret;
pgoff_t end;
bool is_hzp;
- VM_BUG_ON_PAGE(!PageLocked(head), head);
- VM_BUG_ON_PAGE(!PageCompound(head), head);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
- is_hzp = is_huge_zero_page(head);
- VM_WARN_ON_ONCE_PAGE(is_hzp, head);
+ is_hzp = is_huge_zero_page(&folio->page);
+ VM_WARN_ON_ONCE_FOLIO(is_hzp, folio);
if (is_hzp)
return -EBUSY;
- if (PageWriteback(head))
+ if (folio_test_writeback(folio))
return -EBUSY;
- if (PageAnon(head)) {
+ if (folio_test_anon(folio)) {
/*
* The caller does not necessarily hold an mmap_lock that would
* prevent the anon_vma disappearing so we first take a
@@ -2640,7 +2680,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
* is taken to serialise against parallel split or collapse
* operations.
*/
- anon_vma = page_get_anon_vma(head);
+ anon_vma = folio_get_anon_vma(folio);
if (!anon_vma) {
ret = -EBUSY;
goto out;
@@ -2649,7 +2689,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
mapping = NULL;
anon_vma_lock_write(anon_vma);
} else {
- mapping = head->mapping;
+ gfp_t gfp;
+
+ mapping = folio->mapping;
/* Truncated ? */
if (!mapping) {
@@ -2657,8 +2699,16 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out;
}
- xas_split_alloc(&xas, head, compound_order(head),
- mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK);
+ gfp = current_gfp_context(mapping_gfp_mask(mapping) &
+ GFP_RECLAIM_MASK);
+
+ if (folio_test_private(folio) &&
+ !filemap_release_folio(folio, gfp)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ xas_split_alloc(&xas, folio, folio_order(folio), gfp);
if (xas_error(&xas)) {
ret = xas_error(&xas);
goto out;
@@ -2672,7 +2722,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
* but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
* which cannot be nested inside the page tree lock. So note
* end now: i_size itself may be changed at any moment, but
- * head page lock is good enough to serialize the trimming.
+ * folio lock is good enough to serialize the trimming.
*/
end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
if (shmem_mapping(mapping))
@@ -2680,46 +2730,46 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
/*
- * Racy check if we can split the page, before unmap_page() will
+ * Racy check if we can split the page, before unmap_folio() will
* split PMDs
*/
if (!can_split_folio(folio, &extra_pins)) {
- ret = -EBUSY;
+ ret = -EAGAIN;
goto out_unlock;
}
- unmap_page(head);
+ unmap_folio(folio);
/* block interrupt reentry in xa_lock and spinlock */
local_irq_disable();
if (mapping) {
/*
- * Check if the head page is present in page cache.
- * We assume all tail are present too, if head is there.
+ * Check if the folio is present in page cache.
+ * We assume all tail pages are present too, if the folio is there.
*/
xas_lock(&xas);
xas_reset(&xas);
- if (xas_load(&xas) != head)
+ if (xas_load(&xas) != folio)
goto fail;
}
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
- if (page_ref_freeze(head, 1 + extra_pins)) {
- if (!list_empty(page_deferred_list(head))) {
+ if (folio_ref_freeze(folio, 1 + extra_pins)) {
+ if (!list_empty(page_deferred_list(&folio->page))) {
ds_queue->split_queue_len--;
- list_del(page_deferred_list(head));
+ list_del(page_deferred_list(&folio->page));
}
spin_unlock(&ds_queue->split_queue_lock);
if (mapping) {
- int nr = thp_nr_pages(head);
+ int nr = folio_nr_pages(folio);
- xas_split(&xas, head, thp_order(head));
- if (PageSwapBacked(head)) {
- __mod_lruvec_page_state(head, NR_SHMEM_THPS,
+ xas_split(&xas, folio, folio_order(folio));
+ if (folio_test_swapbacked(folio)) {
+ __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS,
-nr);
} else {
- __mod_lruvec_page_state(head, NR_FILE_THPS,
+ __lruvec_stat_mod_folio(folio, NR_FILE_THPS,
-nr);
filemap_nr_thps_dec(mapping);
}
@@ -2734,7 +2784,7 @@ fail:
xas_unlock(&xas);
local_irq_enable();
remap_page(folio, folio_nr_pages(folio));
- ret = -EBUSY;
+ ret = -EAGAIN;
}
out_unlock:
@@ -2894,11 +2944,9 @@ static void split_huge_pages_all(void)
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
int nr_pages;
- if (!pfn_valid(pfn))
- continue;
- page = pfn_to_page(pfn);
- if (!get_page_unless_zero(page))
+ page = pfn_to_online_page(pfn);
+ if (!page || !get_page_unless_zero(page))
continue;
if (zone != page_zone(page))
@@ -2985,7 +3033,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
/* FOLL_DUMP to ignore special (like zero) pages */
page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
- if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
+ if (IS_ERR_OR_NULL(page))
continue;
if (!is_transparent_hugepage(page))
@@ -3040,28 +3088,28 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
mapping = candidate->f_mapping;
for (index = off_start; index < off_end; index += nr_pages) {
- struct page *fpage = pagecache_get_page(mapping, index,
- FGP_ENTRY | FGP_HEAD, 0);
+ struct folio *folio = __filemap_get_folio(mapping, index,
+ FGP_ENTRY, 0);
nr_pages = 1;
- if (xa_is_value(fpage) || !fpage)
+ if (xa_is_value(folio) || !folio)
continue;
- if (!is_transparent_hugepage(fpage))
+ if (!folio_test_large(folio))
goto next;
total++;
- nr_pages = thp_nr_pages(fpage);
+ nr_pages = folio_nr_pages(folio);
- if (!trylock_page(fpage))
+ if (!folio_trylock(folio))
goto next;
- if (!split_huge_page(fpage))
+ if (!split_folio(folio))
split++;
- unlock_page(fpage);
+ folio_unlock(folio);
next:
- put_page(fpage);
+ folio_put(folio);
cond_resched();
}
@@ -3177,6 +3225,7 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+ /* See page_try_share_anon_rmap(): invalidate PMD first. */
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive && page_try_share_anon_rmap(page)) {
set_pmd_at(mm, address, pvmw->pmd, pmdval);
@@ -3191,6 +3240,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
else
entry = make_readable_migration_entry(page_to_pfn(page));
+ if (pmd_young(pmdval))
+ entry = make_migration_entry_young(entry);
+ if (pmd_dirty(pmdval))
+ entry = make_migration_entry_dirty(entry);
pmdswp = swp_entry_to_pmd(entry);
if (pmd_soft_dirty(pmdval))
pmdswp = pmd_swp_mksoft_dirty(pmdswp);
@@ -3216,13 +3269,18 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
entry = pmd_to_swp_entry(*pvmw->pmd);
get_page(new);
- pmde = pmd_mkold(mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)));
+ pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
if (is_writable_migration_entry(entry))
pmde = maybe_pmd_mkwrite(pmde, vma);
if (pmd_swp_uffd_wp(*pvmw->pmd))
pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
+ if (!is_migration_entry_young(entry))
+ pmde = pmd_mkold(pmde);
+ /* NOTE: this may contain setting soft-dirty on some archs */
+ if (PageDirty(new) && is_migration_entry_dirty(entry))
+ pmde = pmd_mkdirty(pmde);
if (PageAnon(new)) {
rmap_t rmap_flags = RMAP_COMPOUND;
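Both __split_huge_pmd_locked() and remove_migration_pmd() above carry the same NOTE: on some architectures (sparc64, loongarch) pte_mkdirty()/pmd_mkdirty() can also set the hardware write bit, so any write-protection must be applied afterwards. Distilled into a sketch (the helper name is hypothetical):

#include <linux/mm.h>

/* Hypothetical helper illustrating the ordering rule from the patch. */
static pte_t make_split_pte(struct page *page, struct vm_area_struct *vma,
			    bool dirty, bool write)
{
	pte_t entry = mk_pte(page, vma->vm_page_prot);

	if (dirty)
		entry = pte_mkdirty(entry);	/* may also set the HW write bit */
	if (!write)
		entry = pte_wrprotect(entry);	/* so wrprotect must come after */
	return entry;
}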
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e070b8593b37..db895230ee7e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -33,6 +33,7 @@
#include <linux/migrate.h>
#include <linux/nospec.h>
#include <linux/delayacct.h>
+#include <linux/memory.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -53,13 +54,13 @@ struct hstate hstates[HUGE_MAX_HSTATE];
#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
-static bool hugetlb_cma_page(struct page *page, unsigned int order)
+static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
{
- return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page,
+ return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page,
1 << order);
}
#else
-static bool hugetlb_cma_page(struct page *page, unsigned int order)
+static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
{
return false;
}
@@ -90,6 +91,9 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -251,13 +255,159 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
return subpool_inode(file_inode(vma->vm_file));
}
+/*
+ * hugetlb vma_lock helper routines
+ */
+static bool __vma_shareable_lock(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
+ vma->vm_private_data;
+}
+
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_lock(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ down_read(&vma_lock->rw_sema);
+ }
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_lock(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ up_read(&vma_lock->rw_sema);
+ }
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_lock(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ down_write(&vma_lock->rw_sema);
+ }
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_lock(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ up_write(&vma_lock->rw_sema);
+ }
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ if (!__vma_shareable_lock(vma))
+ return 1;
+
+ return down_write_trylock(&vma_lock->rw_sema);
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_lock(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ lockdep_assert_held(&vma_lock->rw_sema);
+ }
+}
+
+void hugetlb_vma_lock_release(struct kref *kref)
+{
+ struct hugetlb_vma_lock *vma_lock = container_of(kref,
+ struct hugetlb_vma_lock, refs);
+
+ kfree(vma_lock);
+}
+
+static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
+{
+ struct vm_area_struct *vma = vma_lock->vma;
+
+ /*
+ * vma_lock structure may or may not be released as a result of put,
+ * it certainly will no longer be attached to vma so clear pointer.
+ * Semaphore synchronizes access to vma_lock->vma field.
+ */
+ vma_lock->vma = NULL;
+ vma->vm_private_data = NULL;
+ up_write(&vma_lock->rw_sema);
+ kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+}
+
+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_lock(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ __hugetlb_vma_unlock_write_put(vma_lock);
+ }
+}
+
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+ /*
+ * Only present in sharable vmas.
+ */
+ if (!vma || !__vma_shareable_lock(vma))
+ return;
+
+ if (vma->vm_private_data) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ down_write(&vma_lock->rw_sema);
+ __hugetlb_vma_unlock_write_put(vma_lock);
+ }
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+ struct hugetlb_vma_lock *vma_lock;
+
+ /* Only establish in (flags) sharable vmas */
+ if (!vma || !(vma->vm_flags & VM_MAYSHARE))
+ return;
+
+ /* Should never get here with non-NULL vm_private_data */
+ if (vma->vm_private_data)
+ return;
+
+ vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
+ if (!vma_lock) {
+ /*
+ * If we cannot allocate the structure, then the vma cannot
+ * participate in pmd sharing.  This costs only a possible
+ * performance enhancement and some memory savings.
+ * However, the lock is also used to synchronize page
+ * faults with truncation. If the lock is not present,
+ * unlikely races could leave pages in a file past i_size
+ * until the file is removed. Warn in the unlikely case of
+ * allocation failure.
+ */
+ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
+ return;
+ }
+
+ kref_init(&vma_lock->refs);
+ init_rwsem(&vma_lock->rw_sema);
+ vma_lock->vma = vma;
+ vma->vm_private_data = vma_lock;
+}
+
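Taken together, the helpers above give hugetlb shared mappings a per-VMA reader/writer lock: fault-style paths take it for read, while truncation and PMD-unsharing paths take it for write. A sketch of the intended calling pattern, using only the helpers added here (the surrounding functions are hypothetical):

#include <linux/hugetlb.h>

/* Hypothetical callers illustrating the intended lock usage. */
static void example_fault_path(struct vm_area_struct *vma)
{
	hugetlb_vma_lock_read(vma);
	/* ... page fault work that may rely on shared PMDs ... */
	hugetlb_vma_unlock_read(vma);
}

static void example_truncate_path(struct vm_area_struct *vma)
{
	hugetlb_vma_lock_write(vma);
	/* ... unshare PMDs / remove pages beyond i_size ... */
	hugetlb_vma_unlock_write(vma);
}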
/* Helper that removes a struct file_region from the resv_map cache and returns
* it for use.
*/
static struct file_region *
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
- struct file_region *nrg = NULL;
+ struct file_region *nrg;
VM_BUG_ON(resv->region_cache_count <= 0);
@@ -339,7 +489,7 @@ static bool has_same_uncharge_info(struct file_region *rg,
static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
{
- struct file_region *nrg = NULL, *prg = NULL;
+ struct file_region *nrg, *prg;
prg = list_prev_entry(rg, link);
if (&prg->link != &resv->regions && prg->to == rg->from &&
@@ -456,14 +606,12 @@ static int allocate_file_region_entries(struct resv_map *resv,
int regions_needed)
__must_hold(&resv->lock)
{
- struct list_head allocated_regions;
+ LIST_HEAD(allocated_regions);
int to_allocate = 0, i = 0;
struct file_region *trg = NULL, *rg = NULL;
VM_BUG_ON(regions_needed < 0);
- INIT_LIST_HEAD(&allocated_regions);
-
/*
* Check for sufficient descriptors in the cache to accommodate
* the number of in progress add operations plus regions_needed.
@@ -860,7 +1008,7 @@ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
* faults in a MAP_PRIVATE mapping. Only the process that called mmap()
* is guaranteed to have their future faults succeed.
*
- * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * With the exception of hugetlb_dup_vma_private() which is called at fork(),
* the reserve counters are updated with the hugetlb_lock held. It is safe
* to reset the VMA at fork() time as it is not in use yet and there is no
* chance of the global counters getting corrupted as a result of the values.
@@ -1007,12 +1155,28 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
return (get_vma_private_data(vma) & flag) != 0;
}
-/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
- if (!(vma->vm_flags & VM_MAYSHARE))
- vma->vm_private_data = (void *)0;
+ /*
+ * Clear vm_private_data
+ * - For shared mappings this is a per-vma semaphore that may be
+ * allocated in a subsequent call to hugetlb_vm_op_open.
+ * Before clearing, make sure the pointer is not owned by this vma,
+ * as clearing an owned lock would leak the structure. That is the
+ * case when called via clear_vma_resv_huge_pages() and hugetlb_vm_op_open
+ * has already been called to allocate a new structure.
+ * - For MAP_PRIVATE mappings, this is the reserve map which does
+ * not apply to children. Faults generated by the children are
+ * not guaranteed to succeed, even if read-only.
+ */
+ if (vma->vm_flags & VM_MAYSHARE) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ if (vma_lock && vma_lock->vma != vma)
+ vma->vm_private_data = NULL;
+ } else
+ vma->vm_private_data = NULL;
}
/*
@@ -1043,7 +1207,7 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
kref_put(&reservations->refs, resv_map_release);
}
- reset_vma_resv_huge_pages(vma);
+ hugetlb_dup_vma_private(vma);
}
/* Returns true if the VMA has associated reserve pages */
@@ -1109,17 +1273,17 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
return false;
}
-static void enqueue_huge_page(struct hstate *h, struct page *page)
+static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
{
- int nid = page_to_nid(page);
+ int nid = folio_nid(folio);
lockdep_assert_held(&hugetlb_lock);
- VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
- list_move(&page->lru, &h->hugepage_freelists[nid]);
+ list_move(&folio->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
- SetHPageFreed(page);
+ folio_set_hugetlb_freed(folio);
}
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
@@ -1182,6 +1346,11 @@ retry_cpuset:
return NULL;
}
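+
+/* Free huge pages in the pool that are not already claimed by reservations. */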
+static unsigned long available_huge_pages(struct hstate *h)
+{
+ return h->free_huge_pages - h->resv_huge_pages;
+}
+
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve,
@@ -1198,12 +1367,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
* have no page reserves. This check ensures that reservations are
* not "stolen". The child may still get SIGKILLed
*/
- if (!vma_has_reserves(vma, chg) &&
- h->free_huge_pages - h->resv_huge_pages == 0)
+ if (!vma_has_reserves(vma, chg) && !available_huge_pages(h))
goto err;
/* If reserves cannot be used, ensure enough pages are in the pool */
- if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
+ if (avoid_reserve && !available_huge_pages(h))
goto err;
gfp_mask = htlb_alloc_mask(h);
@@ -1303,75 +1471,76 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
nr_nodes--)
/* used to demote non-gigantic huge pages as well */
-static void __destroy_compound_gigantic_page(struct page *page,
+static void __destroy_compound_gigantic_folio(struct folio *folio,
unsigned int order, bool demote)
{
int i;
int nr_pages = 1 << order;
- struct page *p = page + 1;
+ struct page *p;
- atomic_set(compound_mapcount_ptr(page), 0);
- atomic_set(compound_pincount_ptr(page), 0);
+ atomic_set(folio_mapcount_ptr(folio), 0);
+ atomic_set(folio_subpages_mapcount_ptr(folio), 0);
+ atomic_set(folio_pincount_ptr(folio), 0);
- for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ for (i = 1; i < nr_pages; i++) {
+ p = folio_page(folio, i);
p->mapping = NULL;
clear_compound_head(p);
if (!demote)
set_page_refcounted(p);
}
- set_compound_order(page, 0);
-#ifdef CONFIG_64BIT
- page[1].compound_nr = 0;
-#endif
- __ClearPageHead(page);
+ folio_set_compound_order(folio, 0);
+ __folio_clear_head(folio);
}
-static void destroy_compound_hugetlb_page_for_demote(struct page *page,
+static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio,
unsigned int order)
{
- __destroy_compound_gigantic_page(page, order, true);
+ __destroy_compound_gigantic_folio(folio, order, true);
}
#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static void destroy_compound_gigantic_page(struct page *page,
+static void destroy_compound_gigantic_folio(struct folio *folio,
unsigned int order)
{
- __destroy_compound_gigantic_page(page, order, false);
+ __destroy_compound_gigantic_folio(folio, order, false);
}
-static void free_gigantic_page(struct page *page, unsigned int order)
+static void free_gigantic_folio(struct folio *folio, unsigned int order)
{
/*
* If the page isn't allocated using the cma allocator,
* cma_release() returns false.
*/
#ifdef CONFIG_CMA
- if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
+ int nid = folio_nid(folio);
+
+ if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order))
return;
#endif
- free_contig_range(page_to_pfn(page), 1 << order);
+ free_contig_range(folio_pfn(folio), 1 << order);
}
#ifdef CONFIG_CONTIG_ALLOC
-static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
+ struct page *page;
unsigned long nr_pages = pages_per_huge_page(h);
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
#ifdef CONFIG_CMA
{
- struct page *page;
int node;
if (hugetlb_cma[nid]) {
page = cma_alloc(hugetlb_cma[nid], nr_pages,
huge_page_order(h), true);
if (page)
- return page;
+ return page_folio(page);
}
if (!(gfp_mask & __GFP_THISNODE)) {
@@ -1382,17 +1551,18 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
page = cma_alloc(hugetlb_cma[node], nr_pages,
huge_page_order(h), true);
if (page)
- return page;
+ return page_folio(page);
}
}
}
#endif
- return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
+ page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
+ return page ? page_folio(page) : NULL;
}
#else /* !CONFIG_CONTIG_ALLOC */
-static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
return NULL;
@@ -1400,40 +1570,41 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
#endif /* CONFIG_CONTIG_ALLOC */
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
-static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
return NULL;
}
-static inline void free_gigantic_page(struct page *page, unsigned int order) { }
-static inline void destroy_compound_gigantic_page(struct page *page,
+static inline void free_gigantic_folio(struct folio *folio,
+ unsigned int order) { }
+static inline void destroy_compound_gigantic_folio(struct folio *folio,
unsigned int order) { }
#endif
/*
- * Remove hugetlb page from lists, and update dtor so that page appears
+ * Remove hugetlb folio from lists, and update dtor so that the folio appears
* as just a compound page.
*
- * A reference is held on the page, except in the case of demote.
+ * A reference is held on the folio, except in the case of demote.
*
* Must be called with hugetlb lock held.
*/
-static void __remove_hugetlb_page(struct hstate *h, struct page *page,
+static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
bool adjust_surplus,
bool demote)
{
- int nid = page_to_nid(page);
+ int nid = folio_nid(folio);
- VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
- VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
+ VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio);
+ VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio);
lockdep_assert_held(&hugetlb_lock);
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
- list_del(&page->lru);
+ list_del(&folio->lru);
- if (HPageFreed(page)) {
+ if (folio_test_hugetlb_freed(folio)) {
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
}
@@ -1452,50 +1623,50 @@ static void __remove_hugetlb_page(struct hstate *h, struct page *page,
*
* For gigantic pages set the destructor to the null dtor. This
* destructor will never be called. Before freeing the gigantic
- * page destroy_compound_gigantic_page will turn the compound page
- * into a simple group of pages. After this the destructor does not
+ * page destroy_compound_gigantic_folio will turn the folio into a
+ * simple group of pages. After this the destructor does not
* apply.
*
* This handles the case where more than one ref is held when and
- * after update_and_free_page is called.
+ * after update_and_free_hugetlb_folio is called.
*
* In the case of demote we do not ref count the page as it will soon
* be turned into a page of smaller size.
*/
if (!demote)
- set_page_refcounted(page);
+ folio_ref_unfreeze(folio, 1);
if (hstate_is_gigantic(h))
- set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
else
- set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
+ folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
h->nr_huge_pages--;
h->nr_huge_pages_node[nid]--;
}
-static void remove_hugetlb_page(struct hstate *h, struct page *page,
+static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
bool adjust_surplus)
{
- __remove_hugetlb_page(h, page, adjust_surplus, false);
+ __remove_hugetlb_folio(h, folio, adjust_surplus, false);
}
-static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page,
+static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio,
bool adjust_surplus)
{
- __remove_hugetlb_page(h, page, adjust_surplus, true);
+ __remove_hugetlb_folio(h, folio, adjust_surplus, true);
}
-static void add_hugetlb_page(struct hstate *h, struct page *page,
+static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
bool adjust_surplus)
{
int zeroed;
- int nid = page_to_nid(page);
+ int nid = folio_nid(folio);
- VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio);
lockdep_assert_held(&hugetlb_lock);
- INIT_LIST_HEAD(&page->lru);
+ INIT_LIST_HEAD(&folio->lru);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
@@ -1504,17 +1675,21 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
h->surplus_huge_pages_node[nid]++;
}
- set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
- set_page_private(page, 0);
- SetHPageVmemmapOptimized(page);
+ folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
+ folio_change_private(folio, NULL);
+ /*
+ * We have to set hugetlb_vmemmap_optimized again as above
+ * folio_change_private(folio, NULL) cleared it.
+ */
+ folio_set_hugetlb_vmemmap_optimized(folio);
/*
- * This page is about to be managed by the hugetlb allocator and
+ * This folio is about to be managed by the hugetlb allocator and
* should have no users. Drop our reference, and check for others
* just in case.
*/
- zeroed = put_page_testzero(page);
- if (!zeroed)
+ zeroed = folio_put_testzero(folio);
+ if (unlikely(!zeroed))
/*
* It is VERY unlikely someone else has taken a ref on
* the page. In this case, we simply return as the
@@ -1523,14 +1698,15 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
*/
return;
- arch_clear_hugepage_flags(page);
- enqueue_huge_page(h, page);
+ arch_clear_hugepage_flags(&folio->page);
+ enqueue_hugetlb_folio(h, folio);
}
static void __update_and_free_page(struct hstate *h, struct page *page)
{
int i;
- struct page *subpage = page;
+ struct folio *folio = page_folio(page);
+ struct page *subpage;
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
@@ -1539,7 +1715,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
* If we don't know which subpages are hwpoisoned, we can't free
* the hugepage, so it's leaked intentionally.
*/
- if (HPageRawHwpUnreliable(page))
+ if (folio_test_hugetlb_raw_hwp_unreliable(folio))
return;
if (hugetlb_vmemmap_restore(h, page)) {
@@ -1549,7 +1725,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
* page and put the page back on the hugetlb free list and treat
* as a surplus page.
*/
- add_hugetlb_page(h, page, true);
+ add_hugetlb_folio(h, folio, true);
spin_unlock_irq(&hugetlb_lock);
return;
}
@@ -1558,11 +1734,11 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
* Move PageHWPoison flag from head page to the raw error pages,
* which makes any healthy subpages reusable.
*/
- if (unlikely(PageHWPoison(page)))
- hugetlb_clear_page_hwpoison(page);
+ if (unlikely(folio_test_hwpoison(folio)))
+ hugetlb_clear_page_hwpoison(&folio->page);
- for (i = 0; i < pages_per_huge_page(h);
- i++, subpage = mem_map_next(subpage, page, i)) {
+ for (i = 0; i < pages_per_huge_page(h); i++) {
+ subpage = folio_page(folio, i);
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
1 << PG_active | 1 << PG_private |
@@ -1571,19 +1747,19 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
/*
* Non-gigantic pages demoted from CMA allocated gigantic pages
- * need to be given back to CMA in free_gigantic_page.
+ * need to be given back to CMA in free_gigantic_folio.
*/
if (hstate_is_gigantic(h) ||
- hugetlb_cma_page(page, huge_page_order(h))) {
- destroy_compound_gigantic_page(page, huge_page_order(h));
- free_gigantic_page(page, huge_page_order(h));
+ hugetlb_cma_folio(folio, huge_page_order(h))) {
+ destroy_compound_gigantic_folio(folio, huge_page_order(h));
+ free_gigantic_folio(folio, huge_page_order(h));
} else {
__free_pages(page, huge_page_order(h));
}
}
/*
- * As update_and_free_page() can be called under any context, so we cannot
+ * As update_and_free_hugetlb_folio() can be called under any context, we cannot
* use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
* actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate
* the vmemmap pages.
@@ -1612,8 +1788,9 @@ static void free_hpage_workfn(struct work_struct *work)
/*
* The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
* is going to trigger because a previous call to
- * remove_hugetlb_page() will set_compound_page_dtor(page,
- * NULL_COMPOUND_DTOR), so do not use page_hstate() directly.
+ * remove_hugetlb_folio() will call folio_set_compound_dtor
+ * (folio, NULL_COMPOUND_DTOR), so do not use page_hstate()
+ * directly.
*/
h = size_to_hstate(page_size(page));
@@ -1630,11 +1807,11 @@ static inline void flush_free_hpage_work(struct hstate *h)
flush_work(&free_hpage_work);
}
-static void update_and_free_page(struct hstate *h, struct page *page,
+static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
bool atomic)
{
- if (!HPageVmemmapOptimized(page) || !atomic) {
- __update_and_free_page(h, page);
+ if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) {
+ __update_and_free_page(h, &folio->page);
return;
}
@@ -1645,16 +1822,18 @@ static void update_and_free_page(struct hstate *h, struct page *page,
* empty. Otherwise, schedule_work() had been called but the workfn
* hasn't retrieved the list yet.
*/
- if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
+ if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist))
schedule_work(&free_hpage_work);
}
static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
{
struct page *page, *t_page;
+ struct folio *folio;
list_for_each_entry_safe(page, t_page, list, lru) {
- update_and_free_page(h, page, false);
+ folio = page_folio(page);
+ update_and_free_hugetlb_folio(h, folio, false);
cond_resched();
}
}
@@ -1676,21 +1855,22 @@ void free_huge_page(struct page *page)
* Can't pass hstate in here because it is called from the
* compound page destructor.
*/
- struct hstate *h = page_hstate(page);
- int nid = page_to_nid(page);
- struct hugepage_subpool *spool = hugetlb_page_subpool(page);
+ struct folio *folio = page_folio(page);
+ struct hstate *h = folio_hstate(folio);
+ int nid = folio_nid(folio);
+ struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
bool restore_reserve;
unsigned long flags;
- VM_BUG_ON_PAGE(page_count(page), page);
- VM_BUG_ON_PAGE(page_mapcount(page), page);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
+ VM_BUG_ON_FOLIO(folio_mapcount(folio), folio);
- hugetlb_set_page_subpool(page, NULL);
- if (PageAnon(page))
- __ClearPageAnonExclusive(page);
- page->mapping = NULL;
- restore_reserve = HPageRestoreReserve(page);
- ClearHPageRestoreReserve(page);
+ hugetlb_set_folio_subpool(folio, NULL);
+ if (folio_test_anon(folio))
+ __ClearPageAnonExclusive(&folio->page);
+ folio->mapping = NULL;
+ restore_reserve = folio_test_hugetlb_restore_reserve(folio);
+ folio_clear_hugetlb_restore_reserve(folio);
/*
* If HPageRestoreReserve was set on page, page allocation consumed a
@@ -1712,26 +1892,26 @@ void free_huge_page(struct page *page)
}
spin_lock_irqsave(&hugetlb_lock, flags);
- ClearHPageMigratable(page);
- hugetlb_cgroup_uncharge_page(hstate_index(h),
- pages_per_huge_page(h), page);
- hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
- pages_per_huge_page(h), page);
+ folio_clear_hugetlb_migratable(folio);
+ hugetlb_cgroup_uncharge_folio(hstate_index(h),
+ pages_per_huge_page(h), folio);
+ hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
+ pages_per_huge_page(h), folio);
if (restore_reserve)
h->resv_huge_pages++;
- if (HPageTemporary(page)) {
- remove_hugetlb_page(h, page, false);
+ if (folio_test_hugetlb_temporary(folio)) {
+ remove_hugetlb_folio(h, folio, false);
spin_unlock_irqrestore(&hugetlb_lock, flags);
- update_and_free_page(h, page, true);
+ update_and_free_hugetlb_folio(h, folio, true);
} else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
- remove_hugetlb_page(h, page, true);
+ remove_hugetlb_folio(h, folio, true);
spin_unlock_irqrestore(&hugetlb_lock, flags);
- update_and_free_page(h, page, true);
+ update_and_free_hugetlb_folio(h, folio, true);
} else {
arch_clear_hugepage_flags(page);
- enqueue_huge_page(h, page);
+ enqueue_hugetlb_folio(h, folio);
spin_unlock_irqrestore(&hugetlb_lock, flags);
}
}
@@ -1746,36 +1926,38 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
h->nr_huge_pages_node[nid]++;
}
-static void __prep_new_huge_page(struct hstate *h, struct page *page)
+static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
{
- hugetlb_vmemmap_optimize(h, page);
- INIT_LIST_HEAD(&page->lru);
- set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
- hugetlb_set_page_subpool(page, NULL);
- set_hugetlb_cgroup(page, NULL);
- set_hugetlb_cgroup_rsvd(page, NULL);
+ hugetlb_vmemmap_optimize(h, &folio->page);
+ INIT_LIST_HEAD(&folio->lru);
+ folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
+ hugetlb_set_folio_subpool(folio, NULL);
+ set_hugetlb_cgroup(folio, NULL);
+ set_hugetlb_cgroup_rsvd(folio, NULL);
}
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid)
{
- __prep_new_huge_page(h, page);
+ __prep_new_hugetlb_folio(h, folio);
spin_lock_irq(&hugetlb_lock);
__prep_account_new_huge_page(h, nid);
spin_unlock_irq(&hugetlb_lock);
}
-static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
- bool demote)
+static bool __prep_compound_gigantic_folio(struct folio *folio,
+ unsigned int order, bool demote)
{
int i, j;
int nr_pages = 1 << order;
- struct page *p = page + 1;
+ struct page *p;
+
+ __folio_clear_reserved(folio);
+ __folio_set_head(folio);
+ /* we rely on prep_new_hugetlb_folio to set the destructor */
+ folio_set_compound_order(folio, order);
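+ /* Walk every page, including the head; head-only setup was done above. */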
+ for (i = 0; i < nr_pages; i++) {
+ p = folio_page(folio, i);
- /* we rely on prep_new_huge_page to set the destructor */
- set_compound_order(page, order);
- __ClearPageReserved(page);
- __SetPageHead(page);
- for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
/*
* For gigantic hugepages allocated through bootmem at
* boot, it's safer to be consistent with the not-gigantic
@@ -1788,7 +1970,8 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
* on the head page when they need know if put_page() is needed
* after get_user_pages().
*/
- __ClearPageReserved(p);
+ if (i != 0) /* head page cleared above */
+ __ClearPageReserved(p);
/*
* Subtle and very unlikely
*
@@ -1814,39 +1997,42 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
} else {
VM_BUG_ON_PAGE(page_count(p), p);
}
- set_compound_head(p, page);
+ if (i != 0)
+ set_compound_head(p, &folio->page);
}
- atomic_set(compound_mapcount_ptr(page), -1);
- atomic_set(compound_pincount_ptr(page), 0);
+ atomic_set(folio_mapcount_ptr(folio), -1);
+ atomic_set(folio_subpages_mapcount_ptr(folio), 0);
+ atomic_set(folio_pincount_ptr(folio), 0);
return true;
out_error:
- /* undo tail page modifications made above */
- p = page + 1;
- for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
- clear_compound_head(p);
+ /* undo page modifications made above */
+ for (j = 0; j < i; j++) {
+ p = folio_page(folio, j);
+ if (j != 0)
+ clear_compound_head(p);
set_page_refcounted(p);
}
/* need to clear PG_reserved on remaining tail pages */
- for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
+ for (; j < nr_pages; j++) {
+ p = folio_page(folio, j);
__ClearPageReserved(p);
- set_compound_order(page, 0);
-#ifdef CONFIG_64BIT
- page[1].compound_nr = 0;
-#endif
- __ClearPageHead(page);
+ }
+ folio_set_compound_order(folio, 0);
+ __folio_clear_head(folio);
return false;
}
-static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
+static bool prep_compound_gigantic_folio(struct folio *folio,
+ unsigned int order)
{
- return __prep_compound_gigantic_page(page, order, false);
+ return __prep_compound_gigantic_folio(folio, order, false);
}
-static bool prep_compound_gigantic_page_for_demote(struct page *page,
+static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
unsigned int order)
{
- return __prep_compound_gigantic_page(page, order, true);
+ return __prep_compound_gigantic_folio(folio, order, true);
}
/*
@@ -1911,13 +2097,14 @@ pgoff_t hugetlb_basepage_index(struct page *page)
return (index << compound_order(page_head)) + compound_idx;
}
-static struct page *alloc_buddy_huge_page(struct hstate *h,
+static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
nodemask_t *node_alloc_noretry)
{
int order = huge_page_order(h);
struct page *page;
bool alloc_try_hard = true;
+ bool retry = true;
/*
* By default we always try hard to allocate the page with
@@ -1933,11 +2120,20 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
+retry:
page = __alloc_pages(gfp_mask, order, nid, nmask);
- if (page)
- __count_vm_event(HTLB_BUDDY_PGALLOC);
- else
- __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
+ /* Freeze head page */
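+ /* A freeze failure means an extra (likely speculative) reference exists. */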
+ if (page && !page_ref_freeze(page, 1)) {
+ __free_pages(page, order);
+ if (retry) { /* retry once */
+ retry = false;
+ goto retry;
+ }
+ /* WOW! twice in a row. */
+ pr_warn("HugeTLB head page unexpected inflated ref count\n");
+ page = NULL;
+ }
/*
* If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
@@ -1955,36 +2151,44 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
if (node_alloc_noretry && !page && alloc_try_hard)
node_set(nid, *node_alloc_noretry);
- return page;
+ if (!page) {
+ __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+ return NULL;
+ }
+
+ __count_vm_event(HTLB_BUDDY_PGALLOC);
+ return page_folio(page);
}
/*
* Common helper to allocate a fresh hugetlb page. All specific allocators
* should use this function to get new hugetlb pages
+ *
+ * Note that the returned folio is 'frozen': ref count of head page and all tail
+ * pages is zero.
*/
-static struct page *alloc_fresh_huge_page(struct hstate *h,
+static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
nodemask_t *node_alloc_noretry)
{
- struct page *page;
+ struct folio *folio;
bool retry = false;
retry:
if (hstate_is_gigantic(h))
- page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
+ folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
else
- page = alloc_buddy_huge_page(h, gfp_mask,
+ folio = alloc_buddy_hugetlb_folio(h, gfp_mask,
nid, nmask, node_alloc_noretry);
- if (!page)
+ if (!folio)
return NULL;
-
if (hstate_is_gigantic(h)) {
- if (!prep_compound_gigantic_page(page, huge_page_order(h))) {
+ if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) {
/*
* Rare failure to convert pages to compound page.
* Free pages and try again - ONCE!
*/
- free_gigantic_page(page, huge_page_order(h));
+ free_gigantic_folio(folio, huge_page_order(h));
if (!retry) {
retry = true;
goto retry;
@@ -1992,9 +2196,9 @@ retry:
return NULL;
}
}
- prep_new_huge_page(h, page, page_to_nid(page));
+ prep_new_hugetlb_folio(h, folio, folio_nid(folio));
- return page;
+ return folio;
}
/*
@@ -2004,23 +2208,20 @@ retry:
static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
nodemask_t *node_alloc_noretry)
{
- struct page *page;
+ struct folio *folio;
int nr_nodes, node;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
- node_alloc_noretry);
- if (page)
- break;
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node,
+ nodes_allowed, node_alloc_noretry);
+ if (folio) {
+ free_huge_page(&folio->page); /* free it into the hugepage allocator */
+ return 1;
+ }
}
- if (!page)
- return 0;
-
- put_page(page); /* free it into the hugepage allocator */
-
- return 1;
+ return 0;
}
/*
@@ -2036,6 +2237,7 @@ static struct page *remove_pool_huge_page(struct hstate *h,
{
int nr_nodes, node;
struct page *page = NULL;
+ struct folio *folio;
lockdep_assert_held(&hugetlb_lock);
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
@@ -2047,7 +2249,8 @@ static struct page *remove_pool_huge_page(struct hstate *h,
!list_empty(&h->hugepage_freelists[node])) {
page = list_entry(h->hugepage_freelists[node].next,
struct page, lru);
- remove_hugetlb_page(h, page, acct_surplus);
+ folio = page_folio(page);
+ remove_hugetlb_folio(h, folio, acct_surplus);
break;
}
}
@@ -2072,29 +2275,29 @@ static struct page *remove_pool_huge_page(struct hstate *h,
int dissolve_free_huge_page(struct page *page)
{
int rc = -EBUSY;
+ struct folio *folio = page_folio(page);
retry:
/* Not to disrupt normal path by vainly holding hugetlb_lock */
- if (!PageHuge(page))
+ if (!folio_test_hugetlb(folio))
return 0;
spin_lock_irq(&hugetlb_lock);
- if (!PageHuge(page)) {
+ if (!folio_test_hugetlb(folio)) {
rc = 0;
goto out;
}
- if (!page_count(page)) {
- struct page *head = compound_head(page);
- struct hstate *h = page_hstate(head);
- if (h->free_huge_pages - h->resv_huge_pages == 0)
+ if (!folio_ref_count(folio)) {
+ struct hstate *h = folio_hstate(folio);
+ if (!available_huge_pages(h))
goto out;
/*
* We should make sure that the page is already on the free list
* when it is dissolved.
*/
- if (unlikely(!HPageFreed(head))) {
+ if (unlikely(!folio_test_hugetlb_freed(folio))) {
spin_unlock_irq(&hugetlb_lock);
cond_resched();
@@ -2109,24 +2312,24 @@ retry:
goto retry;
}
- remove_hugetlb_page(h, head, false);
+ remove_hugetlb_folio(h, folio, false);
h->max_huge_pages--;
spin_unlock_irq(&hugetlb_lock);
/*
- * Normally update_and_free_page will allocate required vmemmmap
- * before freeing the page. update_and_free_page will fail to
+ * Normally update_and_free_hugetlb_folio will allocate required vmemmap
+ * before freeing the page. update_and_free_hugetlb_folio will fail to
* free the page if it cannot allocate required vmemmap. We
* need to adjust max_huge_pages if the page is not freed.
* Attempt to allocate vmemmap here so that we can take
* appropriate action on failure.
*/
- rc = hugetlb_vmemmap_restore(h, head);
+ rc = hugetlb_vmemmap_restore(h, &folio->page);
if (!rc) {
- update_and_free_page(h, head, false);
+ update_and_free_hugetlb_folio(h, folio, false);
} else {
spin_lock_irq(&hugetlb_lock);
- add_hugetlb_page(h, head, false);
+ add_hugetlb_folio(h, folio, false);
h->max_huge_pages++;
spin_unlock_irq(&hugetlb_lock);
}
@@ -2175,10 +2378,9 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
* Allocates a fresh surplus page from the page allocator.
*/
static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nmask, bool zero_ref)
+ int nid, nodemask_t *nmask)
{
- struct page *page = NULL;
- bool retry = false;
+ struct folio *folio = NULL;
if (hstate_is_gigantic(h))
return NULL;
@@ -2188,9 +2390,8 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
goto out_unlock;
spin_unlock_irq(&hugetlb_lock);
-retry:
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
- if (!page)
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
+ if (!folio)
return NULL;
spin_lock_irq(&hugetlb_lock);
@@ -2202,64 +2403,42 @@ retry:
* codeflow
*/
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
- SetHPageTemporary(page);
+ folio_set_hugetlb_temporary(folio);
spin_unlock_irq(&hugetlb_lock);
- put_page(page);
+ free_huge_page(&folio->page);
return NULL;
}
- if (zero_ref) {
- /*
- * Caller requires a page with zero ref count.
- * We will drop ref count here. If someone else is holding
- * a ref, the page will be freed when they drop it. Abuse
- * temporary page flag to accomplish this.
- */
- SetHPageTemporary(page);
- if (!put_page_testzero(page)) {
- /*
- * Unexpected inflated ref count on freshly allocated
- * huge. Retry once.
- */
- pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
- spin_unlock_irq(&hugetlb_lock);
- if (retry)
- return NULL;
-
- retry = true;
- goto retry;
- }
- ClearHPageTemporary(page);
- }
-
h->surplus_huge_pages++;
- h->surplus_huge_pages_node[page_to_nid(page)]++;
+ h->surplus_huge_pages_node[folio_nid(folio)]++;
out_unlock:
spin_unlock_irq(&hugetlb_lock);
- return page;
+ return &folio->page;
}
static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
- struct page *page;
+ struct folio *folio;
if (hstate_is_gigantic(h))
return NULL;
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
- if (!page)
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
+ if (!folio)
return NULL;
+ /* fresh huge pages are frozen */
+ folio_ref_unfreeze(folio, 1);
/*
* We do not account these pages as surplus because they are only
* temporary and will be released properly on the last reference
*/
- SetHPageTemporary(page);
+ folio_set_hugetlb_temporary(folio);
- return page;
+ return &folio->page;
}
/*
@@ -2280,14 +2459,14 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
gfp_t gfp = gfp_mask | __GFP_NOWARN;
gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
- page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false);
+ page = alloc_surplus_huge_page(h, gfp, nid, nodemask);
/* Fallback to all nodes if page==NULL */
nodemask = NULL;
}
if (!page)
- page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false);
+ page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
mpol_cond_put(mpol);
return page;
}
@@ -2297,7 +2476,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask)
{
spin_lock_irq(&hugetlb_lock);
- if (h->free_huge_pages - h->resv_huge_pages > 0) {
+ if (available_huge_pages(h)) {
struct page *page;
page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
@@ -2336,7 +2515,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
static int gather_surplus_pages(struct hstate *h, long delta)
__must_hold(&hugetlb_lock)
{
- struct list_head surplus_list;
+ LIST_HEAD(surplus_list);
struct page *page, *tmp;
int ret;
long i;
@@ -2351,14 +2530,13 @@ static int gather_surplus_pages(struct hstate *h, long delta)
}
allocated = 0;
- INIT_LIST_HEAD(&surplus_list);
ret = -ENOMEM;
retry:
spin_unlock_irq(&hugetlb_lock);
for (i = 0; i < needed; i++) {
page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
- NUMA_NO_NODE, NULL, true);
+ NUMA_NO_NODE, NULL);
if (!page) {
alloc_ok = false;
break;
@@ -2402,7 +2580,7 @@ retry:
if ((--needed) < 0)
break;
/* Add the page to the hugetlb allocator */
- enqueue_huge_page(h, page);
+ enqueue_hugetlb_folio(h, page_folio(page));
}
free:
spin_unlock_irq(&hugetlb_lock);
@@ -2709,73 +2887,52 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
}
/*
- * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
+ * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
+ * the old one
* @h: struct hstate old page belongs to
- * @old_page: Old page to dissolve
+ * @old_folio: Old folio to dissolve
* @list: List to isolate the page in case we need to
* Returns 0 on success, otherwise negated error.
*/
-static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
- struct list_head *list)
+static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
+ struct folio *old_folio, struct list_head *list)
{
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
- int nid = page_to_nid(old_page);
- bool alloc_retry = false;
- struct page *new_page;
+ int nid = folio_nid(old_folio);
+ struct folio *new_folio;
int ret = 0;
/*
- * Before dissolving the page, we need to allocate a new one for the
- * pool to remain stable. Here, we allocate the page and 'prep' it
+ * Before dissolving the folio, we need to allocate a new one for the
+ * pool to remain stable. Here, we allocate the folio and 'prep' it
* by doing everything but actually updating counters and adding to
* the pool. This simplifies things and lets us do most of the processing
* under the lock.
*/
-alloc_retry:
- new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
- if (!new_page)
+ new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL);
+ if (!new_folio)
return -ENOMEM;
- /*
- * If all goes well, this page will be directly added to the free
- * list in the pool. For this the ref count needs to be zero.
- * Attempt to drop now, and retry once if needed. It is VERY
- * unlikely there is another ref on the page.
- *
- * If someone else has a reference to the page, it will be freed
- * when they drop their ref. Abuse temporary page flag to accomplish
- * this. Retry once if there is an inflated ref count.
- */
- SetHPageTemporary(new_page);
- if (!put_page_testzero(new_page)) {
- if (alloc_retry)
- return -EBUSY;
-
- alloc_retry = true;
- goto alloc_retry;
- }
- ClearHPageTemporary(new_page);
-
- __prep_new_huge_page(h, new_page);
+ __prep_new_hugetlb_folio(h, new_folio);
retry:
spin_lock_irq(&hugetlb_lock);
- if (!PageHuge(old_page)) {
+ if (!folio_test_hugetlb(old_folio)) {
/*
- * Freed from under us. Drop new_page too.
+ * Freed from under us. Drop new_folio too.
*/
goto free_new;
- } else if (page_count(old_page)) {
+ } else if (folio_ref_count(old_folio)) {
/*
- * Someone has grabbed the page, try to isolate it here.
+ * Someone has grabbed the folio, try to isolate it here.
* Fail with -EBUSY if not possible.
*/
spin_unlock_irq(&hugetlb_lock);
- ret = isolate_hugetlb(old_page, list);
+ ret = isolate_hugetlb(&old_folio->page, list);
spin_lock_irq(&hugetlb_lock);
goto free_new;
- } else if (!HPageFreed(old_page)) {
+ } else if (!folio_test_hugetlb_freed(old_folio)) {
/*
- * Page's refcount is 0 but it has not been enqueued in the
+ * Folio's refcount is 0 but it has not been enqueued in the
* freelist yet. Race window is small, so we can succeed here if
* we retry.
*/
@@ -2784,35 +2941,35 @@ retry:
goto retry;
} else {
/*
- * Ok, old_page is still a genuine free hugepage. Remove it from
+ * Ok, old_folio is still a genuine free hugepage. Remove it from
* the freelist and decrease the counters. These will be
* incremented again when calling __prep_account_new_huge_page()
- * and enqueue_huge_page() for new_page. The counters will remain
- * stable since this happens under the lock.
+ * and enqueue_hugetlb_folio() for new_folio. The counters will
+ * remain stable since this happens under the lock.
*/
- remove_hugetlb_page(h, old_page, false);
+ remove_hugetlb_folio(h, old_folio, false);
/*
- * Ref count on new page is already zero as it was dropped
+ * Ref count on new_folio is already zero as it was dropped
* earlier. It can be directly added to the pool free list.
*/
__prep_account_new_huge_page(h, nid);
- enqueue_huge_page(h, new_page);
+ enqueue_hugetlb_folio(h, new_folio);
/*
- * Pages have been replaced, we can safely free the old one.
+ * Folio has been replaced, we can safely free the old one.
*/
spin_unlock_irq(&hugetlb_lock);
- update_and_free_page(h, old_page, false);
+ update_and_free_hugetlb_folio(h, old_folio, false);
}
return ret;
free_new:
spin_unlock_irq(&hugetlb_lock);
- /* Page has a zero ref count, but needs a ref to be freed */
- set_page_refcounted(new_page);
- update_and_free_page(h, new_page, false);
+ /* Folio has a zero ref count, but needs a ref to be freed */
+ folio_ref_unfreeze(new_folio, 1);
+ update_and_free_hugetlb_folio(h, new_folio, false);
return ret;
}
@@ -2820,7 +2977,7 @@ free_new:
int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
{
struct hstate *h;
- struct page *head;
+ struct folio *folio = page_folio(page);
int ret = -EBUSY;
/*
@@ -2829,9 +2986,8 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
* Return success when racing as if we dissolved the page ourselves.
*/
spin_lock_irq(&hugetlb_lock);
- if (PageHuge(page)) {
- head = compound_head(page);
- h = page_hstate(head);
+ if (folio_test_hugetlb(folio)) {
+ h = folio_hstate(folio);
} else {
spin_unlock_irq(&hugetlb_lock);
return 0;
@@ -2846,10 +3002,10 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
if (hstate_is_gigantic(h))
return -ENOMEM;
- if (page_count(head) && !isolate_hugetlb(head, list))
+ if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list))
ret = 0;
- else if (!page_count(head))
- ret = alloc_and_dissolve_huge_page(h, head, list);
+ else if (!folio_ref_count(folio))
+ ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
return ret;
}
@@ -2860,6 +3016,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
+ struct folio *folio;
long map_chg, map_commit;
long gbl_chg;
int ret, idx;
@@ -2928,14 +3085,16 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
+ spin_lock_irq(&hugetlb_lock);
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
SetHPageRestoreReserve(page);
h->resv_huge_pages--;
}
- spin_lock_irq(&hugetlb_lock);
list_add(&page->lru, &h->hugepage_activelist);
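+ /* A freshly allocated surplus page is frozen; give its new owner a reference. */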
+ set_page_refcounted(page);
/* Fall through */
}
+ folio = page_folio(page);
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
/* If allocation is not consuming a reservation, also store the
* hugetlb_cgroup pointer on the page.
@@ -2965,8 +3124,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
rsv_adjust = hugepage_subpool_put_pages(spool, 1);
hugetlb_acct_memory(h, -rsv_adjust);
if (deferred_reserve)
- hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
- pages_per_huge_page(h), page);
+ hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
+ pages_per_huge_page(h), folio);
}
return page;
@@ -3031,17 +3190,18 @@ static void __init gather_bootmem_prealloc(void)
list_for_each_entry(m, &huge_boot_pages, list) {
struct page *page = virt_to_page(m);
+ struct folio *folio = page_folio(page);
struct hstate *h = m->hstate;
VM_BUG_ON(!hstate_is_gigantic(h));
- WARN_ON(page_count(page) != 1);
- if (prep_compound_gigantic_page(page, huge_page_order(h))) {
- WARN_ON(PageReserved(page));
- prep_new_huge_page(h, page, page_to_nid(page));
- put_page(page); /* add to the hugepage allocator */
+ WARN_ON(folio_ref_count(folio) != 1);
+ if (prep_compound_gigantic_folio(folio, huge_page_order(h))) {
+ WARN_ON(folio_test_reserved(folio));
+ prep_new_hugetlb_folio(h, folio, folio_nid(folio));
+ free_huge_page(page); /* add to the hugepage allocator */
} else {
/* VERY unlikely inflated ref count on a tail page */
- free_gigantic_page(page, huge_page_order(h));
+ free_gigantic_folio(folio, huge_page_order(h));
}
/*
@@ -3063,14 +3223,14 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
if (!alloc_bootmem_huge_page(h, nid))
break;
} else {
- struct page *page;
+ struct folio *folio;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
- page = alloc_fresh_huge_page(h, gfp_mask, nid,
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
&node_states[N_MEMORY], NULL);
- if (!page)
+ if (!folio)
break;
- put_page(page); /* free it into the hugepage allocator */
+ free_huge_page(&folio->page); /* free it into the hugepage allocator */
}
cond_resched();
}
@@ -3215,7 +3375,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
goto out;
if (PageHighMem(page))
continue;
- remove_hugetlb_page(h, page, false);
+ remove_hugetlb_folio(h, page_folio(page), false);
list_add(&page->lru, &page_list);
}
}
@@ -3420,11 +3580,13 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
{
int i, nid = page_to_nid(page);
struct hstate *target_hstate;
+ struct folio *folio = page_folio(page);
+ struct page *subpage;
int rc = 0;
target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
- remove_hugetlb_page_for_demote(h, page, false);
+ remove_hugetlb_folio_for_demote(h, folio, false);
spin_unlock_irq(&hugetlb_lock);
rc = hugetlb_vmemmap_restore(h, page);
@@ -3432,15 +3594,15 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
/* Allocation of vmemmmap failed, we can not demote page */
spin_lock_irq(&hugetlb_lock);
set_page_refcounted(page);
- add_hugetlb_page(h, page, false);
+ add_hugetlb_folio(h, page_folio(page), false);
return rc;
}
/*
- * Use destroy_compound_hugetlb_page_for_demote for all huge page
+ * Use destroy_compound_hugetlb_folio_for_demote for all huge page
* sizes as it will not ref count pages.
*/
- destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h));
+ destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h));
/*
* Taking target hstate mutex synchronizes with set_max_huge_pages.
@@ -3453,15 +3615,16 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
mutex_lock(&target_hstate->resize_lock);
for (i = 0; i < pages_per_huge_page(h);
i += pages_per_huge_page(target_hstate)) {
+ subpage = nth_page(page, i);
+ folio = page_folio(subpage);
if (hstate_is_gigantic(target_hstate))
- prep_compound_gigantic_page_for_demote(page + i,
+ prep_compound_gigantic_folio_for_demote(folio,
target_hstate->order);
else
- prep_compound_page(page + i, target_hstate->order);
- set_page_private(page + i, 0);
- set_page_refcounted(page + i);
- prep_new_huge_page(target_hstate, page + i, nid);
- put_page(page + i);
+ prep_compound_page(subpage, target_hstate->order);
+ set_page_private(subpage, 0);
+ prep_new_hugetlb_folio(target_hstate, folio, nid);
+ free_huge_page(subpage);
}
mutex_unlock(&target_hstate->resize_lock);
@@ -3472,7 +3635,8 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
* based on pool changes for the demoted page.
*/
h->max_huge_pages--;
- target_hstate->max_huge_pages += pages_per_huge_page(h);
+ target_hstate->max_huge_pages +=
+ pages_per_huge_page(h) / pages_per_huge_page(target_hstate);
return rc;
}
@@ -3714,7 +3878,7 @@ static ssize_t demote_store(struct kobject *kobj,
unsigned long nr_available;
nodemask_t nodes_allowed, *n_mask;
struct hstate *h;
- int err = 0;
+ int err;
int nid;
err = kstrtoul(buf, 10, &nr_demote);
@@ -3765,8 +3929,7 @@ HSTATE_ATTR_WO(demote);
static ssize_t demote_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- int nid;
- struct hstate *h = kobj_to_hstate(kobj, &nid);
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
return sysfs_emit(buf, "%lukB\n", demote_size);
@@ -3779,7 +3942,6 @@ static ssize_t demote_size_store(struct kobject *kobj,
struct hstate *h, *demote_hstate;
unsigned long demote_size;
unsigned int demote_order;
- int nid;
demote_size = (unsigned long)memparse(buf, NULL);
@@ -3791,7 +3953,7 @@ static ssize_t demote_size_store(struct kobject *kobj,
return -EINVAL;
/* demote order must be smaller than hstate order */
- h = kobj_to_hstate(kobj, &nid);
+ h = kobj_to_hstate(kobj, NULL);
if (demote_order >= h->order)
return -EINVAL;
@@ -3845,35 +4007,26 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
if (retval) {
kobject_put(hstate_kobjs[hi]);
hstate_kobjs[hi] = NULL;
+ return retval;
}
if (h->demote_order) {
- if (sysfs_create_group(hstate_kobjs[hi],
- &hstate_demote_attr_group))
+ retval = sysfs_create_group(hstate_kobjs[hi],
+ &hstate_demote_attr_group);
+ if (retval) {
pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
+ sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group);
+ kobject_put(hstate_kobjs[hi]);
+ hstate_kobjs[hi] = NULL;
+ return retval;
+ }
}
- return retval;
-}
-
-static void __init hugetlb_sysfs_init(void)
-{
- struct hstate *h;
- int err;
-
- hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
- if (!hugepages_kobj)
- return;
-
- for_each_hstate(h) {
- err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
- hstate_kobjs, &hstate_attr_group);
- if (err)
- pr_err("HugeTLB: Unable to add hstate %s", h->name);
- }
+ return 0;
}
#ifdef CONFIG_NUMA
+static bool hugetlb_sysfs_initialized __ro_after_init;
/*
* node_hstate/s - associate per node hstate attributes, via their kobjects,
@@ -3929,7 +4082,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
* Unregister hstate attributes from a single node device.
* No-op if no hstate attributes attached.
*/
-static void hugetlb_unregister_node(struct node *node)
+void hugetlb_unregister_node(struct node *node)
{
struct hstate *h;
struct node_hstate *nhs = &node_hstates[node->dev.id];
@@ -3939,10 +4092,15 @@ static void hugetlb_unregister_node(struct node *node)
for_each_hstate(h) {
int idx = hstate_index(h);
- if (nhs->hstate_kobjs[idx]) {
- kobject_put(nhs->hstate_kobjs[idx]);
- nhs->hstate_kobjs[idx] = NULL;
- }
+ struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];
+
+ if (!hstate_kobj)
+ continue;
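+ /* Remove the attribute groups explicitly before dropping the kobject. */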
+ if (h->demote_order)
+ sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group);
+ sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group);
+ kobject_put(hstate_kobj);
+ nhs->hstate_kobjs[idx] = NULL;
}
kobject_put(nhs->hugepages_kobj);
@@ -3954,12 +4112,15 @@ static void hugetlb_unregister_node(struct node *node)
* Register hstate attributes for a single node device.
* No-op if attributes already registered.
*/
-static void hugetlb_register_node(struct node *node)
+void hugetlb_register_node(struct node *node)
{
struct hstate *h;
struct node_hstate *nhs = &node_hstates[node->dev.id];
int err;
+ if (!hugetlb_sysfs_initialized)
+ return;
+
if (nhs->hugepages_kobj)
return; /* already allocated */
@@ -3990,18 +4151,8 @@ static void __init hugetlb_register_all_nodes(void)
{
int nid;
- for_each_node_state(nid, N_MEMORY) {
- struct node *node = node_devices[nid];
- if (node->dev.id == nid)
- hugetlb_register_node(node);
- }
-
- /*
- * Let the node device driver know we're here so it can
- * [un]register hstate attributes on node hotplug.
- */
- register_hugetlbfs_with_node(hugetlb_register_node,
- hugetlb_unregister_node);
+ for_each_online_node(nid)
+ hugetlb_register_node(node_devices[nid]);
}
#else /* !CONFIG_NUMA */
@@ -4017,6 +4168,36 @@ static void hugetlb_register_all_nodes(void) { }
#endif
+#ifdef CONFIG_CMA
+static void __init hugetlb_cma_check(void);
+#else
+static inline __init void hugetlb_cma_check(void)
+{
+}
+#endif
+
+static void __init hugetlb_sysfs_init(void)
+{
+ struct hstate *h;
+ int err;
+
+ hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
+ if (!hugepages_kobj)
+ return;
+
+ for_each_hstate(h) {
+ err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
+ hstate_kobjs, &hstate_attr_group);
+ if (err)
+ pr_err("HugeTLB: Unable to add hstate %s", h->name);
+ }
+
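+ /*
+ * Node registration below needs the hstate kobjects created above, so
+ * only now mark sysfs initialization as complete.
+ */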
+#ifdef CONFIG_NUMA
+ hugetlb_sysfs_initialized = true;
+#endif
+ hugetlb_register_all_nodes();
+}
+
static int __init hugetlb_init(void)
{
int i;
@@ -4071,7 +4252,6 @@ static int __init hugetlb_init(void)
report_hugepages();
hugetlb_sysfs_init();
- hugetlb_register_all_nodes();
hugetlb_cgroup_file_init();
#ifdef CONFIG_SMP
@@ -4116,7 +4296,7 @@ void __init hugetlb_add_hstate(unsigned int order)
h->next_nid_to_alloc = first_memory_node;
h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
- huge_page_size(h)/1024);
+ huge_page_size(h)/SZ_1K);
parsed_hstate = h;
}
@@ -4131,11 +4311,11 @@ static void __init hugepages_clear_pages_in_node(void)
if (!hugetlb_max_hstate) {
default_hstate_max_huge_pages = 0;
memset(default_hugepages_in_node, 0,
- MAX_NUMNODES * sizeof(unsigned int));
+ sizeof(default_hugepages_in_node));
} else {
parsed_hstate->max_huge_pages = 0;
memset(parsed_hstate->max_huge_pages_node, 0,
- MAX_NUMNODES * sizeof(unsigned int));
+ sizeof(parsed_hstate->max_huge_pages_node));
}
}
@@ -4330,18 +4510,34 @@ static int __init default_hugepagesz_setup(char *s)
}
__setup("default_hugepagesz=", default_hugepagesz_setup);
+static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
+{
+#ifdef CONFIG_NUMA
+ struct mempolicy *mpol = get_task_policy(current);
+
+ /*
+ * Only enforce the MPOL_BIND policy, which overlaps with the cpuset
+ * policy (from policy_nodemask), specifically for the hugetlb case.
+ */
+ if (mpol->mode == MPOL_BIND &&
+ (apply_policy_zone(mpol, gfp_zone(gfp)) &&
+ cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
+ return &mpol->nodes;
+#endif
+ return NULL;
+}
+
static unsigned int allowed_mems_nr(struct hstate *h)
{
int node;
unsigned int nr = 0;
- nodemask_t *mpol_allowed;
+ nodemask_t *mbind_nodemask;
unsigned int *array = h->free_huge_pages_node;
gfp_t gfp_mask = htlb_alloc_mask(h);
- mpol_allowed = policy_nodemask_current(gfp_mask);
-
+ mbind_nodemask = policy_mbind_nodemask(gfp_mask);
for_each_node_mask(node, cpuset_current_mems_allowed) {
- if (!mpol_allowed || node_isset(node, *mpol_allowed))
+ if (!mbind_nodemask || node_isset(node, *mbind_nodemask))
nr += array[node];
}
@@ -4570,6 +4766,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
struct resv_map *resv = vma_resv_map(vma);
/*
+ * HPAGE_RESV_OWNER indicates a private mapping.
* This new VMA should share its siblings reservation map if present.
* The VMA will only ever have a valid reservation map pointer where
* it is being copied for another still existing VMA. As that VMA
@@ -4581,16 +4778,38 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
kref_get(&resv->refs);
}
+
+ /*
+ * The vma_lock structure for shareable mappings is vma specific.
+ * Clear the old pointer (if copied via vm_area_dup) and allocate
+ * a new structure. Before clearing, make sure the vma_lock is not
+ * owned by this vma.
+ */
+ if (vma->vm_flags & VM_MAYSHARE) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ if (vma_lock) {
+ if (vma_lock->vma != vma) {
+ vma->vm_private_data = NULL;
+ hugetlb_vma_lock_alloc(vma);
+ } else
+ pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__);
+ } else
+ hugetlb_vma_lock_alloc(vma);
+ }
}
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
- struct resv_map *resv = vma_resv_map(vma);
+ struct resv_map *resv;
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve, start, end;
long gbl_reserve;
+ hugetlb_vma_lock_free(vma);
+
+ resv = vma_resv_map(vma);
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return;
@@ -4713,7 +4932,6 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr
hugepage_add_new_anon_rmap(new_page, vma, addr);
set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
- ClearHPageRestoreReserve(new_page);
SetHPageMigratable(new_page);
}
@@ -4721,14 +4939,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma)
{
- pte_t *src_pte, *dst_pte, entry, dst_entry;
+ pte_t *src_pte, *dst_pte, entry;
struct page *ptepage;
unsigned long addr;
bool cow = is_cow_mapping(src_vma->vm_flags);
struct hstate *h = hstate_vma(src_vma);
unsigned long sz = huge_page_size(h);
unsigned long npages = pages_per_huge_page(h);
- struct address_space *mapping = src_vma->vm_file->f_mapping;
struct mmu_notifier_range range;
unsigned long last_addr_mask;
int ret = 0;
@@ -4742,12 +4959,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
raw_write_seqcount_begin(&src->write_protect_seq);
} else {
/*
- * For shared mappings i_mmap_rwsem must be held to call
- * huge_pte_alloc, otherwise the returned ptep could go
- * away if part of a shared pmd and another thread calls
- * huge_pmd_unshare.
+ * For shared mappings the vma lock must be held before
+ * calling huge_pte_offset in the src vma. Otherwise, the
+ * returned ptep could go away if part of a shared pmd and
+ * another thread calls huge_pmd_unshare.
*/
- i_mmap_lock_read(mapping);
+ hugetlb_vma_lock_read(src_vma);
}
last_addr_mask = hugetlb_mask_last_page(h);
@@ -4766,15 +4983,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
/*
* If the pagetables are shared don't copy or take references.
- * dst_pte == src_pte is the common case of src/dest sharing.
*
+ * dst_pte == src_pte is the common case of src/dest sharing.
* However, src could have 'unshared' and dst shares with
- * another vma. If dst_pte !none, this implies sharing.
- * Check here before taking page table lock, and once again
- * after taking the lock below.
+ * another vma. So the page_count of the ptep page is checked instead
+ * to reliably determine whether the pte is shared.
*/
- dst_entry = huge_ptep_get(dst_pte);
- if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
+ if (page_count(virt_to_page(dst_pte)) > 1) {
addr |= last_addr_mask;
continue;
}
@@ -4783,13 +4998,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
- dst_entry = huge_ptep_get(dst_pte);
again:
- if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+ if (huge_pte_none(entry)) {
/*
- * Skip if src entry none. Also, skip in the
- * unlikely case dst entry !none as this implies
- * sharing with another vma.
+ * Skip if src entry none.
*/
;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
@@ -4868,7 +5080,7 @@ again:
restore_reserve_on_error(h, dst_vma, addr,
new);
put_page(new);
- /* dst_entry won't change as in child */
+ /* huge_ptep of dst_pte won't change as in child */
goto again;
}
hugetlb_install_page(dst_vma, dst_pte, addr, new);
@@ -4900,7 +5112,7 @@ again:
raw_write_seqcount_end(&src->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
} else {
- i_mmap_unlock_read(mapping);
+ hugetlb_vma_unlock_read(src_vma);
}
return ret;
@@ -4959,6 +5171,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
last_addr_mask = hugetlb_mask_last_page(h);
/* Prevent race with file truncation */
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(mapping);
for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
src_pte = huge_pte_offset(mm, old_addr, sz);
@@ -4990,6 +5203,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
flush_tlb_range(vma, old_end - len, old_end);
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
+ hugetlb_vma_unlock_write(vma);
return len + old_addr - old_end;
}
@@ -5006,7 +5220,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- struct mmu_notifier_range range;
unsigned long last_addr_mask;
bool force_flush = false;
@@ -5021,13 +5234,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
tlb_change_page_size(tlb, sz);
tlb_start_vma(tlb, vma);
- /*
- * If sharing possible, alert mmu notifiers of worst case.
- */
- mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
- end);
- adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
- mmu_notifier_invalidate_range_start(&range);
last_addr_mask = hugetlb_mask_last_page(h);
address = start;
for (; address < end; address += sz) {
@@ -5112,7 +5318,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
if (ref_page)
break;
}
- mmu_notifier_invalidate_range_end(&range);
tlb_end_vma(tlb, vma);
/*
@@ -5137,29 +5342,46 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
unsigned long end, struct page *ref_page,
zap_flags_t zap_flags)
{
+ hugetlb_vma_lock_write(vma);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+
+ /* mmu notification performed in caller */
__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
- /*
- * Clear this flag so that x86's huge_pmd_share page_table_shareable
- * test will fail on a vma being torn down, and not grab a page table
- * on its way out. We're lucky that the flag has such an appropriate
- * name, and can in fact be safely cleared here. We could clear it
- * before the __unmap_hugepage_range above, but all that's necessary
- * is to clear it before releasing the i_mmap_rwsem. This works
- * because in the context this is called, the VMA is about to be
- * destroyed and the i_mmap_rwsem is held.
- */
- vma->vm_flags &= ~VM_MAYSHARE;
+ if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */
+ /*
+ * Unlock and free the vma lock before releasing i_mmap_rwsem.
+ * When the vma_lock is freed, this makes the vma ineligible
+ * for pmd sharing. And, i_mmap_rwsem is required to set up
+ * pmd sharing. This is important as page tables for this
+ * unmapped range will be asynchronously deleted. If the page
+ * tables are shared, there will be issues when accessed by
+ * someone else.
+ */
+ __hugetlb_vma_unlock_write_free(vma);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page,
zap_flags_t zap_flags)
{
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ start, end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+ mmu_notifier_invalidate_range_start(&range);
tlb_gather_mmu(&tlb, vma->vm_mm);
+
__unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
+
+ mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
@@ -5238,9 +5460,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long haddr = address & huge_page_mask(h);
struct mmu_notifier_range range;
- VM_BUG_ON(unshare && (flags & FOLL_WRITE));
- VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
-
/*
* hugetlb does not support FOLL_FORCE-style write faults that keep the
* PTE mapped R/O such as maybe_mkwrite() would do.
@@ -5250,8 +5469,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
/* Let's take out MAP_SHARED mappings first. */
if (vma->vm_flags & VM_MAYSHARE) {
- if (unlikely(unshare))
- return 0;
set_huge_ptep_writable(vma, haddr, ptep);
return 0;
}
@@ -5314,11 +5531,10 @@ retry_avoidcopy:
u32 hash;
put_page(old_page);
- BUG_ON(huge_pte_none(pte));
/*
- * Drop hugetlb_fault_mutex and i_mmap_rwsem before
- * unmapping. unmapping needs to hold i_mmap_rwsem
- * in write mode. Dropping i_mmap_rwsem in read mode
+ * Drop hugetlb_fault_mutex and vma_lock before
+ * unmapping. unmapping needs to hold vma_lock
+ * in write mode. Dropping vma_lock in read mode
* here is OK as COW mappings do not interact with
* PMD sharing.
*
@@ -5326,13 +5542,13 @@ retry_avoidcopy:
*/
idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(mapping, idx);
+ hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
unmap_ref_private(mm, vma, old_page, haddr);
- i_mmap_lock_read(mapping);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_vma_lock_read(vma);
spin_lock(ptl);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep &&
@@ -5374,8 +5590,6 @@ retry_avoidcopy:
spin_lock(ptl);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
- ClearHPageRestoreReserve(new_page);
-
/* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
@@ -5406,19 +5620,6 @@ out_release_old:
return ret;
}
-/* Return the pagecache page at a given address within a VMA */
-static struct page *hugetlbfs_pagecache_page(struct hstate *h,
- struct vm_area_struct *vma, unsigned long address)
-{
- struct address_space *mapping;
- pgoff_t idx;
-
- mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, address);
-
- return find_lock_page(mapping, idx);
-}
-
/*
* Return whether there is a pagecache page to back given address within VMA.
* Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
@@ -5439,7 +5640,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
return page != NULL;
}
-int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t idx)
{
struct folio *folio = page_folio(page);
@@ -5476,7 +5677,6 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
unsigned long addr,
unsigned long reason)
{
- vm_fault_t ret;
u32 hash;
struct vm_fault vmf = {
.vma = vma,
@@ -5494,18 +5694,31 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
};
/*
- * hugetlb_fault_mutex and i_mmap_rwsem must be
- * dropped before handling userfault. Reacquire
- * after handling fault to make calling code simpler.
+ * vma_lock and hugetlb_fault_mutex must be dropped before handling
+ * userfault. Also, mmap_lock could be dropped while handling
+ * userfault, so any vma operation after this point must be careful.
*/
+ hugetlb_vma_unlock_read(vma);
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
- ret = handle_userfault(&vmf, reason);
- i_mmap_lock_read(mapping);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ return handle_userfault(&vmf, reason);
+}
- return ret;
+/*
+ * Recheck pte with pgtable lock. Returns true if pte didn't change, or
+ * false if pte changed or is changing.
+ */
+static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,
+ pte_t *ptep, pte_t old_pte)
+{
+ spinlock_t *ptl;
+ bool same;
+
+ ptl = huge_pte_lock(h, mm, ptep);
+ same = pte_same(huge_ptep_get(ptep), old_pte);
+ spin_unlock(ptl);
+
+ return same;
}
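
For reference, a minimal userspace sketch (not part of the patch; all names are illustrative) of the recheck-under-lock pattern that hugetlb_pte_stable() introduces above, using a pthread mutex and a plain integer in place of the page table lock and the pte:

/* Illustrative only: re-test a value observed without the lock once
 * the lock is held, so callers can tell whether it changed or was
 * caught mid-update. */
#include <pthread.h>
#include <stdbool.h>

struct slot {
	pthread_mutex_t lock;
	unsigned long val;	/* stands in for the pte */
};

static bool slot_stable(struct slot *s, unsigned long old_val)
{
	bool same;

	pthread_mutex_lock(&s->lock);
	same = (s->val == old_val);
	pthread_mutex_unlock(&s->lock);

	return same;
}
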
static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
@@ -5523,6 +5736,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
bool new_page, new_pagecache_page = false;
+ u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
/*
* Currently, we are forced to kill the process in the event the
@@ -5533,28 +5747,46 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
current->pid);
- return ret;
+ goto out;
}
/*
- * We can not race with truncation due to holding i_mmap_rwsem.
- * i_size is modified when holding i_mmap_rwsem, so check here
- * once for faults beyond end of file.
+ * Use page lock to guard against racing truncation
+ * before we get page_table_lock.
*/
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto out;
-
-retry:
new_page = false;
page = find_lock_page(mapping, idx);
if (!page) {
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto out;
/* Check for page in userfault range */
if (userfaultfd_missing(vma)) {
- ret = hugetlb_handle_userfault(vma, mapping, idx,
- flags, haddr, address,
- VM_UFFD_MISSING);
- goto out;
+ /*
+ * Since hugetlb_no_page() was examining pte
+ * without pgtable lock, we need to re-test under
+ * lock because the pte may not be stable and could
+ * have changed from under us. Try to detect
+ * either changed or during-changing ptes and retry
+ * properly when needed.
+ *
+ * Note that userfaultfd is actually fine with
+ * false positives (e.g. caused by pte changed),
+ * but not wrong logical events (e.g. caused by
+ * reading a pte during changing). The latter can
+ * confuse the userspace, so the strictness is very
+ * much preferred. E.g., MISSING event should
+ * never happen on the page after UFFDIO_COPY has
+ * correctly installed the page and returned.
+ */
+ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+ ret = 0;
+ goto out;
+ }
+
+ return hugetlb_handle_userfault(vma, mapping, idx, flags,
+ haddr, address,
+ VM_UFFD_MISSING);
}
page = alloc_huge_page(vma, haddr, 0);
@@ -5571,11 +5803,10 @@ retry:
* here. Before returning error, get ptl and make
* sure there really is no pte entry.
*/
- ptl = huge_pte_lock(h, mm, ptep);
- ret = 0;
- if (huge_pte_none(huge_ptep_get(ptep)))
+ if (hugetlb_pte_stable(h, mm, ptep, old_pte))
ret = vmf_error(PTR_ERR(page));
- spin_unlock(ptl);
+ else
+ ret = 0;
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
@@ -5583,11 +5814,17 @@ retry:
new_page = true;
if (vma->vm_flags & VM_MAYSHARE) {
- int err = huge_add_to_page_cache(page, mapping, idx);
+ int err = hugetlb_add_to_page_cache(page, mapping, idx);
if (err) {
+ /*
+ * err can't be -EEXIST which implies someone
+ * else consumed the reservation since hugetlb
+ * fault mutex is held when adding a hugetlb page
+ * to the page cache. So it's safe to call
+ * restore_reserve_on_error() here.
+ */
+ restore_reserve_on_error(h, vma, haddr, page);
put_page(page);
- if (err == -EEXIST)
- goto retry;
goto out;
}
new_pagecache_page = true;
@@ -5615,10 +5852,14 @@ retry:
if (userfaultfd_minor(vma)) {
unlock_page(page);
put_page(page);
- ret = hugetlb_handle_userfault(vma, mapping, idx,
- flags, haddr, address,
- VM_UFFD_MINOR);
- goto out;
+ /* See comment in userfaultfd_missing() block above */
+ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+ ret = 0;
+ goto out;
+ }
+ return hugetlb_handle_userfault(vma, mapping, idx, flags,
+ haddr, address,
+ VM_UFFD_MINOR);
}
}
@@ -5643,10 +5884,9 @@ retry:
if (!pte_same(huge_ptep_get(ptep), old_pte))
goto backout;
- if (anon_rmap) {
- ClearHPageRestoreReserve(page);
+ if (anon_rmap)
hugepage_add_new_anon_rmap(page, vma, haddr);
- } else
+ else
page_dup_file_rmap(page, true);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
@@ -5676,15 +5916,17 @@ retry:
unlock_page(page);
out:
+ hugetlb_vma_unlock_read(vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return ret;
backout:
spin_unlock(ptl);
backout_unlocked:
- unlock_page(page);
- /* restore reserve for newly allocated pages not in page cache */
if (new_page && !new_pagecache_page)
restore_reserve_on_error(h, vma, haddr, page);
+
+ unlock_page(page);
put_page(page);
goto out;
}
@@ -5745,40 +5987,41 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
/*
- * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
- * until finished with ptep. This serves two purposes:
- * 1) It prevents huge_pmd_unshare from being called elsewhere
- * and making the ptep no longer valid.
- * 2) It synchronizes us with i_size modifications during truncation.
+ * Serialize hugepage allocation and instantiation, so that we don't
+ * get spurious allocation failures if two CPUs race to instantiate
+ * the same page in the page cache.
+ */
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, haddr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ /*
+ * Acquire vma lock before calling huge_pte_alloc and hold
+ * until finished with ptep. This prevents huge_pmd_unshare from
+ * being called elsewhere and making the ptep no longer valid.
*
* ptep could have already been assigned via huge_pte_offset. That
* is OK, as huge_pte_alloc will return the same value unless
* something has changed.
*/
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_read(mapping);
+ hugetlb_vma_lock_read(vma);
ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
if (!ptep) {
- i_mmap_unlock_read(mapping);
+ hugetlb_vma_unlock_read(vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return VM_FAULT_OOM;
}
- /*
- * Serialize hugepage allocation and instantiation, so that we don't
- * get spurious allocation failures if two CPUs race to instantiate
- * the same page in the page cache.
- */
- idx = vma_hugecache_offset(h, vma, haddr);
- hash = hugetlb_fault_mutex_hash(mapping, idx);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
entry = huge_ptep_get(ptep);
/* PTE markers should be handled the same way as none pte */
- if (huge_pte_none_mostly(entry)) {
- ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
+ if (huge_pte_none_mostly(entry))
+ /*
+ * hugetlb_no_page will drop vma lock and hugetlb fault
+ * mutex internally, so we can return its result immediately.
+ */
+ return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
entry, flags);
- goto out_mutex;
- }
ret = 0;
@@ -5808,7 +6051,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, haddr);
- pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr);
+ pagecache_page = find_lock_page(mapping, idx);
}
ptl = huge_pte_lock(h, mm, ptep);
@@ -5832,8 +6075,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unlock_page(pagecache_page);
put_page(pagecache_page);
}
+ hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
return handle_userfault(&vmf, VM_UFFD_WP);
}
@@ -5876,8 +6119,8 @@ out_ptl:
put_page(pagecache_page);
}
out_mutex:
+ hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
@@ -6005,48 +6248,35 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
/*
* Serialization between remove_inode_hugepages() and
- * huge_add_to_page_cache() below happens through the
+ * hugetlb_add_to_page_cache() below happens through the
* hugetlb_fault_mutex_table that here must be held by
* the caller.
*/
- ret = huge_add_to_page_cache(page, mapping, idx);
+ ret = hugetlb_add_to_page_cache(page, mapping, idx);
if (ret)
goto out_release_nounlock;
page_in_pagecache = true;
}
- ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
- spin_lock(ptl);
+ ptl = huge_pte_lock(h, dst_mm, dst_pte);
- /*
- * Recheck the i_size after holding PT lock to make sure not
- * to leave any page mapped (as page_mapped()) beyond the end
- * of the i_size (remove_inode_hugepages() is strict about
- * enforcing that). If we bail out here, we'll also leave a
- * page in the radix tree in the vm_shared case beyond the end
- * of the i_size, but remove_inode_hugepages() will take care
- * of it as soon as we drop the hugetlb_fault_mutex_table.
- */
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- ret = -EFAULT;
- if (idx >= size)
+ ret = -EIO;
+ if (PageHWPoison(page))
goto out_release_unlock;
- ret = -EEXIST;
/*
* We allow to overwrite a pte marker: consider when both MISSING|WP
* registered, we firstly wr-protect a none pte which has no page cache
* page backing it, then access the page.
*/
+ ret = -EEXIST;
if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
goto out_release_unlock;
- if (page_in_pagecache) {
+ if (page_in_pagecache)
page_dup_file_rmap(page, true);
- } else {
- ClearHPageRestoreReserve(page);
+ else
hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
- }
/*
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
@@ -6105,13 +6335,14 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
for (nr = 0; nr < refs; nr++) {
if (likely(pages))
- pages[nr] = mem_map_offset(page, nr);
+ pages[nr] = nth_page(page, nr);
if (vmas)
vmas[nr] = vma;
}
}
-static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
+static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma,
+ unsigned int flags, pte_t *pte,
bool *unshare)
{
pte_t pteval = huge_ptep_get(pte);
@@ -6123,13 +6354,69 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
return false;
if (flags & FOLL_WRITE)
return true;
- if (gup_must_unshare(flags, pte_page(pteval))) {
+ if (gup_must_unshare(vma, flags, pte_page(pteval))) {
*unshare = true;
return true;
}
return false;
}
+struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long haddr = address & huge_page_mask(h);
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ pte_t *pte, entry;
+
+ /*
+ * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
+ * follow_hugetlb_page().
+ */
+ if (WARN_ON_ONCE(flags & FOLL_PIN))
+ return NULL;
+
+retry:
+ pte = huge_pte_offset(mm, haddr, huge_page_size(h));
+ if (!pte)
+ return NULL;
+
+ ptl = huge_pte_lock(h, mm, pte);
+ entry = huge_ptep_get(pte);
+ if (pte_present(entry)) {
+ page = pte_page(entry) +
+ ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+ /*
+ * Note that page may be a sub-page, and with vmemmap
+ * optimizations the page struct may be read only.
+ * try_grab_page() will increase the ref count on the
+ * head page, so this will be OK.
+ *
+ * try_grab_page() should always be able to get the page here,
+ * because we hold the ptl lock and have verified pte_present().
+ */
+ if (try_grab_page(page, flags)) {
+ page = NULL;
+ goto out;
+ }
+ } else {
+ if (is_hugetlb_entry_migration(entry)) {
+ spin_unlock(ptl);
+ __migration_entry_wait_huge(pte, ptl);
+ goto retry;
+ }
+ /*
+ * hwpoisoned entry is treated as no_page_table in
+ * follow_page_mask().
+ */
+ }
+out:
+ spin_unlock(ptl);
+ return page;
+}
+
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
@@ -6196,7 +6483,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* directly from any kind of swap entries.
*/
if (absent ||
- __follow_hugetlb_must_fault(flags, pte, &unshare)) {
+ __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) {
vm_fault_t ret;
unsigned int fault_flags = 0;
@@ -6206,9 +6493,12 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
fault_flags |= FAULT_FLAG_WRITE;
else if (unshare)
fault_flags |= FAULT_FLAG_UNSHARE;
- if (locked)
+ if (locked) {
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
FAULT_FLAG_KILLABLE;
+ if (flags & FOLL_INTERRUPTIBLE)
+ fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
+ }
if (flags & FOLL_NOWAIT)
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
FAULT_FLAG_RETRY_NOWAIT;
@@ -6269,7 +6559,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
(vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
if (pages || vmas)
- record_subpages_vmas(mem_map_offset(page, pfn_offset),
+ record_subpages_vmas(nth_page(page, pfn_offset),
vma, refs,
likely(pages) ? pages + i : NULL,
vmas ? vmas + i : NULL);
@@ -6282,8 +6572,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* tables. If the huge page is present, then the tail
* pages must also be present. The ptl prevents the
* head page and tail pages from being rearranged in
- * any way. So this page must be available at this
- * point, unless the page refcount overflowed:
+ * any way. As this is hugetlb, the pages will never
+ * be p2pdma pages or pages that cannot be pinned
+ * long-term. So this page must be available at this
+ * point, unless the page refcount overflowed:
*/
if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
flags))) {
@@ -6340,8 +6632,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
flush_cache_range(vma, range.start, range.end);
mmu_notifier_invalidate_range_start(&range);
- last_addr_mask = hugetlb_mask_last_page(h);
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
+ last_addr_mask = hugetlb_mask_last_page(h);
for (; address < end; address += psize) {
spinlock_t *ptl;
ptep = huge_pte_offset(mm, address, psize);
@@ -6440,6 +6733,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* See Documentation/mm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
mmu_notifier_invalidate_range_end(&range);
return pages << h->order;
@@ -6465,6 +6759,12 @@ bool hugetlb_reserve_pages(struct inode *inode,
}
/*
+ * vma specific semaphore used for pmd sharing and fault/truncation
+ * synchronization
+ */
+ hugetlb_vma_lock_alloc(vma);
+
+ /*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
* without using reserves
@@ -6487,12 +6787,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
resv_map = inode_resv_map(inode);
chg = region_chg(resv_map, from, to, &regions_needed);
-
} else {
/* Private mapping. */
resv_map = resv_map_alloc();
if (!resv_map)
- return false;
+ goto out_err;
chg = to - from;
@@ -6587,6 +6886,7 @@ out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
out_err:
+ hugetlb_vma_lock_free(vma);
if (!vma || vma->vm_flags & VM_MAYSHARE)
/* Only call region_abort if the region_chg succeeded but the
* region_add failed or didn't run.
@@ -6656,35 +6956,37 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
/*
* match the virtual addresses, permission and the alignment of the
* page table page.
+ *
+ * Also, vma_lock (vm_private_data) is required for sharing.
*/
if (pmd_index(addr) != pmd_index(saddr) ||
vm_flags != svm_flags ||
- !range_in_vma(svma, sbase, s_end))
+ !range_in_vma(svma, sbase, s_end) ||
+ !svma->vm_private_data)
return 0;
return saddr;
}
-static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
-{
- unsigned long base = addr & PUD_MASK;
- unsigned long end = base + PUD_SIZE;
-
- /*
- * check on proper vm_flags and page table alignment
- */
- if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
- return true;
- return false;
-}
-
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
+ unsigned long start = addr & PUD_MASK;
+ unsigned long end = start + PUD_SIZE;
+
#ifdef CONFIG_USERFAULTFD
if (uffd_disable_huge_pmd_share(vma))
return false;
#endif
- return vma_shareable(vma, addr);
+ /*
+ * check on proper vm_flags and page table alignment
+ */
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ return false;
+ if (!vma->vm_private_data) /* vma lock required for sharing */
+ return false;
+ if (!range_in_vma(vma, start, end))
+ return false;
+ return true;
}
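
For reference, a self-contained sketch (not part of the patch; EX_PUD_SIZE and the function name are assumptions for the example) of the alignment/containment test that want_pmd_share() performs: sharing is only attempted when the vma spans the whole PUD-aligned region containing the faulting address.

#include <stdbool.h>

#define EX_PUD_SIZE (1UL << 30)		/* e.g. 1 GiB on x86-64 */
#define EX_PUD_MASK (~(EX_PUD_SIZE - 1))

static bool range_covers_full_pud(unsigned long vm_start,
				  unsigned long vm_end,
				  unsigned long addr)
{
	unsigned long start = addr & EX_PUD_MASK;
	unsigned long end = start + EX_PUD_SIZE;

	/* Only worthwhile if the vma covers the whole PUD-aligned
	 * region containing addr. */
	return vm_start <= start && end <= vm_end;
}
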
/*
@@ -6718,12 +7020,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner.
- *
- * This routine must be called with i_mmap_rwsem held in at least read mode if
- * sharing is possible. For hugetlbfs, this prevents removal of any page
- * table entries associated with the address space. This is important as we
- * are setting up sharing based on existing page table entries (mappings).
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
*/
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
@@ -6737,7 +7037,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *pte;
spinlock_t *ptl;
- i_mmap_assert_locked(mapping);
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -6767,6 +7067,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ i_mmap_unlock_read(mapping);
return pte;
}
@@ -6777,7 +7078,7 @@ out:
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
- * Called with page table lock held and i_mmap_rwsem held in write mode.
+ * Called with page table lock held.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
@@ -6790,6 +7091,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
pud_t *pud = pud_offset(p4d, addr);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ hugetlb_vma_assert_locked(vma);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
return 0;
@@ -6801,6 +7103,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
}
#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
{
@@ -6928,123 +7231,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
* These functions are overwritable if your architecture needs its own
* behavior.
*/
-struct page * __weak
-follow_huge_addr(struct mm_struct *mm, unsigned long address,
- int write)
-{
- return ERR_PTR(-EINVAL);
-}
-
-struct page * __weak
-follow_huge_pd(struct vm_area_struct *vma,
- unsigned long address, hugepd_t hpd, int flags, int pdshift)
-{
- WARN(1, "hugepd follow called with no support for hugepage directory format\n");
- return NULL;
-}
-
-struct page * __weak
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int flags)
-{
- struct page *page = NULL;
- spinlock_t *ptl;
- pte_t pte;
-
- /*
- * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
- * follow_hugetlb_page().
- */
- if (WARN_ON_ONCE(flags & FOLL_PIN))
- return NULL;
-
-retry:
- ptl = pmd_lockptr(mm, pmd);
- spin_lock(ptl);
- /*
- * make sure that the address range covered by this pmd is not
- * unmapped from other threads.
- */
- if (!pmd_huge(*pmd))
- goto out;
- pte = huge_ptep_get((pte_t *)pmd);
- if (pte_present(pte)) {
- page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
- /*
- * try_grab_page() should always succeed here, because: a) we
- * hold the pmd (ptl) lock, and b) we've just checked that the
- * huge pmd (head) page is present in the page tables. The ptl
- * prevents the head page and tail pages from being rearranged
- * in any way. So this page must be available at this point,
- * unless the page refcount overflowed:
- */
- if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
- page = NULL;
- goto out;
- }
- } else {
- if (is_hugetlb_entry_migration(pte)) {
- spin_unlock(ptl);
- __migration_entry_wait_huge((pte_t *)pmd, ptl);
- goto retry;
- }
- /*
- * hwpoisoned entry is treated as no_page_table in
- * follow_page_mask().
- */
- }
-out:
- spin_unlock(ptl);
- return page;
-}
-
-struct page * __weak
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int flags)
-{
- struct page *page = NULL;
- spinlock_t *ptl;
- pte_t pte;
-
- if (WARN_ON_ONCE(flags & FOLL_PIN))
- return NULL;
-
-retry:
- ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
- if (!pud_huge(*pud))
- goto out;
- pte = huge_ptep_get((pte_t *)pud);
- if (pte_present(pte)) {
- page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
- if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
- page = NULL;
- goto out;
- }
- } else {
- if (is_hugetlb_entry_migration(pte)) {
- spin_unlock(ptl);
- __migration_entry_wait(mm, (pte_t *)pud, ptl);
- goto retry;
- }
- /*
- * hwpoisoned entry is treated as no_page_table in
- * follow_page_mask().
- */
- }
-out:
- spin_unlock(ptl);
- return page;
-}
-
-struct page * __weak
-follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
-{
- if (flags & (FOLL_GET | FOLL_PIN))
- return NULL;
-
- return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
-}
-
int isolate_hugetlb(struct page *page, struct list_head *list)
{
int ret = 0;
@@ -7063,7 +7249,7 @@ unlock:
return ret;
}
-int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
+int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison)
{
int ret = 0;
@@ -7073,7 +7259,7 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
*hugetlb = true;
if (HPageFreed(page))
ret = 0;
- else if (HPageMigratable(page))
+ else if (HPageMigratable(page) || unpoison)
ret = get_page_unless_zero(page);
else
ret = -EBUSY;
@@ -7082,12 +7268,13 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
return ret;
}
-int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared)
{
int ret;
spin_lock_irq(&hugetlb_lock);
- ret = __get_huge_page_for_hwpoison(pfn, flags);
+ ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
spin_unlock_irq(&hugetlb_lock);
return ret;
}
@@ -7101,15 +7288,15 @@ void putback_active_hugepage(struct page *page)
put_page(page);
}
-void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
+void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason)
{
- struct hstate *h = page_hstate(oldpage);
+ struct hstate *h = folio_hstate(old_folio);
- hugetlb_cgroup_migrate(oldpage, newpage);
- set_page_owner_migrate_reason(newpage, reason);
+ hugetlb_cgroup_migrate(old_folio, new_folio);
+ set_page_owner_migrate_reason(&new_folio->page, reason);
/*
- * transfer temporary state of the new huge page. This is
+ * transfer temporary state of the new hugetlb folio. This is
* reverse to other transitions because the newpage is going to
* be final while the old one will be freed so it takes over
* the temporary status.
@@ -7118,12 +7305,13 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
* here as well otherwise the global surplus count will not match
* the per-node's.
*/
- if (HPageTemporary(newpage)) {
- int old_nid = page_to_nid(oldpage);
- int new_nid = page_to_nid(newpage);
+ if (folio_test_hugetlb_temporary(new_folio)) {
+ int old_nid = folio_nid(old_folio);
+ int new_nid = folio_nid(new_folio);
+
+ folio_set_hugetlb_temporary(old_folio);
+ folio_clear_hugetlb_temporary(new_folio);
- SetHPageTemporary(oldpage);
- ClearHPageTemporary(newpage);
/*
* There is no need to transfer the per-node surplus state
@@ -7171,6 +7359,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (address = start; address < end; address += PUD_SIZE) {
ptep = huge_pte_offset(mm, address, sz);
@@ -7182,6 +7371,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
}
flush_hugetlb_tlb_range(vma, start, end);
i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
/*
* No need to call mmu_notifier_invalidate_range(), see
* Documentation/mm/mmu_notifier.rst.
@@ -7332,7 +7522,7 @@ void __init hugetlb_cma_reserve(int order)
hugetlb_cma_size = 0;
}
-void __init hugetlb_cma_check(void)
+static void __init hugetlb_cma_check(void)
{
if (!hugetlb_cma_size || cma_reserve_called)
return;
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index c86691c431fd..d9e4425d81ac 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -75,11 +75,11 @@ parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
- int idx;
+ struct hstate *h;
- for (idx = 0; idx < hugetlb_max_hstate; idx++) {
+ for_each_hstate(h) {
if (page_counter_read(
- hugetlb_cgroup_counter_from_cgroup(h_cg, idx)))
+ hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h))))
return true;
}
return false;
@@ -154,9 +154,9 @@ hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
* function.
*/
for_each_node(node) {
- /* Set node_to_alloc to -1 for offline nodes. */
+ /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */
int node_to_alloc =
- node_state(node, N_NORMAL_MEMORY) ? node : -1;
+ node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
h_cgroup->nodeinfo[node] =
kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
GFP_KERNEL, node_to_alloc);
@@ -191,8 +191,9 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
struct page_counter *counter;
struct hugetlb_cgroup *page_hcg;
struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
+ struct folio *folio = page_folio(page);
- page_hcg = hugetlb_cgroup_from_page(page);
+ page_hcg = hugetlb_cgroup_from_folio(folio);
/*
* We can have pages in active list without any cgroup
* ie, hugepage with less than 3 pages. We can safely
@@ -211,7 +212,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
/* Take the pages off the local counter */
page_counter_cancel(counter, nr_pages);
- set_hugetlb_cgroup(page, parent);
+ set_hugetlb_cgroup(folio, parent);
out:
return;
}
@@ -225,17 +226,14 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
struct hstate *h;
struct page *page;
- int idx;
do {
- idx = 0;
for_each_hstate(h) {
spin_lock_irq(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_activelist, lru)
- hugetlb_cgroup_move_parent(idx, h_cg, page);
+ hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page);
spin_unlock_irq(&hugetlb_lock);
- idx++;
}
cond_resched();
} while (hugetlb_cgroup_have_usage(h_cg));
@@ -312,21 +310,21 @@ int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
/* Should be called with hugetlb_lock held */
static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
- struct page *page, bool rsvd)
+ struct folio *folio, bool rsvd)
{
if (hugetlb_cgroup_disabled() || !h_cg)
return;
- __set_hugetlb_cgroup(page, h_cg, rsvd);
+ __set_hugetlb_cgroup(folio, h_cg, rsvd);
if (!rsvd) {
unsigned long usage =
- h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
+ h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
/*
* This write is not atomic due to fetching usage and writing
* to it, but that's fine because we call this with
* hugetlb_lock held anyway.
*/
- WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
+ WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
usage + nr_pages);
}
}
@@ -335,31 +333,35 @@ void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
struct page *page)
{
- __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, false);
+ struct folio *folio = page_folio(page);
+
+ __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false);
}
void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
struct page *page)
{
- __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, true);
+ struct folio *folio = page_folio(page);
+
+ __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true);
}
/*
* Should be called with hugetlb_lock held
*/
-static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
- struct page *page, bool rsvd)
+static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
+ struct folio *folio, bool rsvd)
{
struct hugetlb_cgroup *h_cg;
if (hugetlb_cgroup_disabled())
return;
lockdep_assert_held(&hugetlb_lock);
- h_cg = __hugetlb_cgroup_from_page(page, rsvd);
+ h_cg = __hugetlb_cgroup_from_folio(folio, rsvd);
if (unlikely(!h_cg))
return;
- __set_hugetlb_cgroup(page, NULL, rsvd);
+ __set_hugetlb_cgroup(folio, NULL, rsvd);
page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
rsvd),
@@ -369,27 +371,27 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
css_put(&h_cg->css);
else {
unsigned long usage =
- h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
+ h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
/*
* This write is not atomic due to fetching usage and writing
* to it, but that's fine because we call this with
* hugetlb_lock held anyway.
*/
- WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
+ WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
usage - nr_pages);
}
}
-void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
- struct page *page)
+void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
+ struct folio *folio)
{
- __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, false);
+ __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false);
}
-void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages,
- struct page *page)
+void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages,
+ struct folio *folio)
{
- __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, true);
+ __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true);
}
static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
@@ -442,7 +444,7 @@ void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
return;
- if (rg->reservation_counter && resv->pages_per_hpage && nr_pages > 0 &&
+ if (rg->reservation_counter && resv->pages_per_hpage &&
!resv->reservation_counter) {
page_counter_uncharge(rg->reservation_counter,
nr_pages * resv->pages_per_hpage);
@@ -675,12 +677,12 @@ static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
- if (hsize >= (1UL << 30))
- snprintf(buf, size, "%luGB", hsize >> 30);
- else if (hsize >= (1UL << 20))
- snprintf(buf, size, "%luMB", hsize >> 20);
+ if (hsize >= SZ_1G)
+ snprintf(buf, size, "%luGB", hsize / SZ_1G);
+ else if (hsize >= SZ_1M)
+ snprintf(buf, size, "%luMB", hsize / SZ_1M);
else
- snprintf(buf, size, "%luKB", hsize >> 10);
+ snprintf(buf, size, "%luKB", hsize / SZ_1K);
return buf;
}
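
For reference, a self-contained userspace analogue of mem_fmt() (not part of the patch) showing the SZ_*-based formatting and the strings it produces for typical hugepage sizes:

#include <stdio.h>

#define SZ_1K 1024UL
#define SZ_1M (SZ_1K * 1024)
#define SZ_1G (SZ_1M * 1024)

static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
	if (hsize >= SZ_1G)
		snprintf(buf, size, "%luGB", hsize / SZ_1G);
	else if (hsize >= SZ_1M)
		snprintf(buf, size, "%luMB", hsize / SZ_1M);
	else
		snprintf(buf, size, "%luKB", hsize / SZ_1K);
	return buf;
}

int main(void)
{
	char buf[32];

	/* Prints "64KB", "2MB", "1GB". */
	printf("%s\n", mem_fmt(buf, sizeof(buf), 64 * SZ_1K));
	printf("%s\n", mem_fmt(buf, sizeof(buf), 2 * SZ_1M));
	printf("%s\n", mem_fmt(buf, sizeof(buf), SZ_1G));
	return 0;
}
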
@@ -886,25 +888,25 @@ void __init hugetlb_cgroup_file_init(void)
* hugetlb_lock will make sure a parallel cgroup rmdir won't happen
* when we migrate hugepages
*/
-void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
+void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio)
{
struct hugetlb_cgroup *h_cg;
struct hugetlb_cgroup *h_cg_rsvd;
- struct hstate *h = page_hstate(oldhpage);
+ struct hstate *h = folio_hstate(old_folio);
if (hugetlb_cgroup_disabled())
return;
spin_lock_irq(&hugetlb_lock);
- h_cg = hugetlb_cgroup_from_page(oldhpage);
- h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage);
- set_hugetlb_cgroup(oldhpage, NULL);
- set_hugetlb_cgroup_rsvd(oldhpage, NULL);
+ h_cg = hugetlb_cgroup_from_folio(old_folio);
+ h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio);
+ set_hugetlb_cgroup(old_folio, NULL);
+ set_hugetlb_cgroup_rsvd(old_folio, NULL);
/* move the h_cg details to new cgroup */
- set_hugetlb_cgroup(newhpage, h_cg);
- set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd);
- list_move(&newhpage->lru, &h->hugepage_activelist);
+ set_hugetlb_cgroup(new_folio, h_cg);
+ set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd);
+ list_move(&new_folio->lru, &h->hugepage_activelist);
spin_unlock_irq(&hugetlb_lock);
return;
}
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 20f414c0379f..45e93a545dd7 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -11,6 +11,7 @@
#define pr_fmt(fmt) "HugeTLB: " fmt
#include <linux/pgtable.h>
+#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
@@ -202,12 +203,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
return ret;
} while (pgd++, addr = next, addr != end);
- /*
- * We only change the mapping of the vmemmap virtual address range
- * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
- * belongs to the range.
- */
- flush_tlb_kernel_range(start + PAGE_SIZE, end);
+ flush_tlb_kernel_range(start, end);
return 0;
}
@@ -231,10 +227,8 @@ static void free_vmemmap_page_list(struct list_head *list)
{
struct page *page, *next;
- list_for_each_entry_safe(page, next, list, lru) {
- list_del(&page->lru);
+ list_for_each_entry_safe(page, next, list, lru)
free_vmemmap_page(page);
- }
}
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
@@ -245,9 +239,23 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
* to the tail pages.
*/
pgprot_t pgprot = PAGE_KERNEL_RO;
- pte_t entry = mk_pte(walk->reuse_page, pgprot);
struct page *page = pte_page(*pte);
+ pte_t entry;
+
+ /* Remapping the head page requires r/w */
+ if (unlikely(addr == walk->reuse_addr)) {
+ pgprot = PAGE_KERNEL;
+ list_del(&walk->reuse_page->lru);
+
+ /*
+ * Makes sure that preceding stores to the page contents from
+ * vmemmap_remap_free() become visible before the set_pte_at()
+ * write.
+ */
+ smp_wmb();
+ }
+ entry = mk_pte(walk->reuse_page, pgprot);
list_add_tail(&page->lru, walk->vmemmap_pages);
set_pte_at(&init_mm, addr, pte, entry);
}
@@ -265,11 +273,10 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
static inline void reset_struct_pages(struct page *start)
{
- int i;
struct page *from = start + NR_RESET_STRUCT_PAGE;
- for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
- memcpy(start + i, from, sizeof(*from));
+ BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
+ memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}
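
For reference, a sketch (not part of the patch; struct and constant names are stand-ins) of the change in reset_struct_pages(): the per-element memcpy loop collapses into one bulk memcpy, which is safe because the source range starts NR_RESET_STRUCT_PAGE elements past the destination and therefore cannot overlap it.

#include <string.h>

struct fake_page { unsigned long flags; void *priv; };
#define NRESET 3	/* stand-in for NR_RESET_STRUCT_PAGE */

static void reset_fake_pages(struct fake_page *start)
{
	struct fake_page *from = start + NRESET;

	/* [start, start+NRESET) and [from, from+NRESET) do not
	 * overlap, so one bulk copy is equivalent to the old loop. */
	memcpy(start, from, sizeof(*from) * NRESET);
}
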
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
@@ -287,6 +294,11 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
copy_page(to, (void *)walk->reuse_addr);
reset_struct_pages(to);
+ /*
+ * Makes sure that preceding stores to the page contents become visible
+ * before the set_pte_at() write.
+ */
+ smp_wmb();
set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}
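
For reference, a hedged userspace sketch (not part of the patch) of the ordering the added smp_wmb() enforces before set_pte_at(): initialize the page contents first, then publish the mapping, expressed here with a C11 release store standing in for the barrier-plus-pte-write.

#include <stdatomic.h>

struct payload { int data[16]; };

static _Atomic(struct payload *) published;

static void publish(struct payload *p)
{
	for (int i = 0; i < 16; i++)
		p->data[i] = i;		/* fill contents first */
	/* Release ordering plays the role of smp_wmb(): readers that
	 * observe the pointer also observe the initialized contents. */
	atomic_store_explicit(&published, p, memory_order_release);
}
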
@@ -312,6 +324,24 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
.reuse_addr = reuse,
.vmemmap_pages = &vmemmap_pages,
};
+ int nid = page_to_nid((struct page *)start);
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY |
+ __GFP_NOWARN;
+
+ /*
+ * Allocate a new head vmemmap page to avoid breaking a contiguous
+ * block of struct page memory when freeing it back to the page
+ * allocator in free_vmemmap_page_list(). This keeps the likely
+ * contiguous struct page backing memory contiguous, allowing
+ * more hugepage allocations. Fall back to the currently mapped
+ * head page if the allocation fails.
+ */
+ walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
+ if (walk.reuse_page) {
+ copy_page(page_to_virt(walk.reuse_page),
+ (void *)walk.reuse_addr);
+ list_add(&walk.reuse_page->lru, &vmemmap_pages);
+ }
/*
* In order to make remapping routine most efficient for the huge pages,
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 65e242b5a432..d0548e382b6b 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -63,13 +63,13 @@ static int hwpoison_unpoison(void *data, u64 val)
DEFINE_DEBUGFS_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
-static void pfn_inject_exit(void)
+static void __exit pfn_inject_exit(void)
{
hwpoison_filter_enable = 0;
debugfs_remove_recursive(hwpoison_dir);
}
-static int pfn_inject_init(void)
+static int __init pfn_inject_init(void)
{
hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index fbe7844d0912..c9327abb771c 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm_types.h>
-#include <linux/rbtree.h>
+#include <linux/maple_tree.h>
#include <linux/rwsem.h>
#include <linux/spinlock.h>
#include <linux/list.h>
@@ -28,7 +28,7 @@
* and size this cpu_bitmask to NR_CPUS.
*/
struct mm_struct init_mm = {
- .mm_rb = RB_ROOT,
+ .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock),
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
diff --git a/mm/internal.h b/mm/internal.h
index 785409805ed7..bcf75a8b032d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -83,9 +83,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf);
void folio_rotate_reclaimable(struct folio *folio);
bool __folio_end_writeback(struct folio *folio);
void deactivate_file_folio(struct folio *folio);
+void folio_activate(struct folio *folio);
-void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
- unsigned long floor, unsigned long ceiling);
+void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+ struct vm_area_struct *start_vma, unsigned long floor,
+ unsigned long ceiling);
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
struct zap_details;
@@ -104,9 +106,9 @@ static inline void force_page_cache_readahead(struct address_space *mapping,
force_page_cache_ra(&ractl, nr_to_read);
}
-unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
+unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
-unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
+unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
void filemap_free_folio(struct address_space *mapping, struct folio *folio);
int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
@@ -187,7 +189,7 @@ extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason
/*
* in mm/rmap.c:
*/
-extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
+pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
/*
* in mm/page_alloc.c
@@ -365,7 +367,6 @@ extern int user_min_free_kbytes;
extern void free_unref_page(struct page *page, unsigned int order);
extern void free_unref_page_list(struct list_head *list);
-extern void zone_pcp_update(struct zone *zone, int cpu_online);
extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
extern void zone_pcp_enable(struct zone *zone);
@@ -479,9 +480,6 @@ static inline bool is_data_mapping(vm_flags_t flags)
}
/* mm/util.c */
-void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev);
-void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
struct anon_vma *folio_anon_vma(struct folio *folio);
#ifdef CONFIG_MMU
@@ -639,34 +637,6 @@ static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
}
#endif /* !CONFIG_MMU */
-/*
- * Return the mem_map entry representing the 'offset' subpage within
- * the maximally aligned gigantic page 'base'. Handle any discontiguity
- * in the mem_map at MAX_ORDER_NR_PAGES boundaries.
- */
-static inline struct page *mem_map_offset(struct page *base, int offset)
-{
- if (unlikely(offset >= MAX_ORDER_NR_PAGES))
- return nth_page(base, offset);
- return base + offset;
-}
-
-/*
- * Iterator over all subpages within the maximally aligned gigantic
- * page 'base'. Handle any discontiguity in the mem_map.
- */
-static inline struct page *mem_map_next(struct page *iter,
- struct page *base, int offset)
-{
- if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
- unsigned long pfn = page_to_pfn(base) + offset;
- if (!pfn_valid(pfn))
- return NULL;
- return pfn_to_page(pfn);
- }
- return iter + 1;
-}
-
/* Memory initialisation debug and verification */
enum mminit_level {
MMINIT_WARNING,
@@ -738,14 +708,6 @@ extern u64 hwpoison_filter_flags_value;
extern u64 hwpoison_filter_memcg;
extern u32 hwpoison_filter_enable;
-#ifdef CONFIG_MEMORY_FAILURE
-void clear_hwpoisoned_pages(struct page *memmap, int nr_pages);
-#else
-static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
-{
-}
-#endif
-
extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long,
unsigned long, unsigned long);
@@ -847,8 +809,14 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
}
#endif
+int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages,
+ unsigned int page_shift);
+
void vunmap_range_noflush(unsigned long start, unsigned long end);
+void __vunmap_range_noflush(unsigned long start, unsigned long end);
+
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid, int *flags);
@@ -860,8 +828,6 @@ int migrate_device_coherent_page(struct page *page);
*/
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
-DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
-
extern bool mirrored_kernelcore;
static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 1f84df9c302e..d4837bff3b60 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -35,7 +35,15 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) -fno-builtin $(call cc-disable-warning, vla)
+
+CFLAGS_kasan_test.o := $(CFLAGS_KASAN_TEST)
+CFLAGS_kasan_test_module.o := $(CFLAGS_KASAN_TEST)
+
obj-y := common.o report.o
obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o
obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o
obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o
+
+obj-$(CONFIG_KASAN_KUNIT_TEST) += kasan_test.o
+obj-$(CONFIG_KASAN_MODULE_TEST) += kasan_test_module.o
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 69f583855c8b..833bf2cfd2a3 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -30,13 +30,20 @@
#include "kasan.h"
#include "../slab.h"
+struct slab *kasan_addr_to_slab(const void *addr)
+{
+ if (virt_addr_valid(addr))
+ return virt_to_slab(addr);
+ return NULL;
+}
+
depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc)
{
unsigned long entries[KASAN_STACK_DEPTH];
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
- return __stack_depot_save(entries, nr_entries, flags, can_alloc);
+ return __stack_depot_save(entries, nr_entries, 0, flags, can_alloc);
}
void kasan_set_track(struct kasan_track *track, gfp_t flags)
@@ -88,17 +95,6 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
}
#endif /* CONFIG_KASAN_STACK */
-/*
- * Only allow cache merging when stack collection is disabled and no metadata
- * is present.
- */
-slab_flags_t __kasan_never_merge(void)
-{
- if (kasan_stack_collection_enabled())
- return SLAB_KASAN;
- return 0;
-}
-
void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init)
{
u8 tag;
@@ -121,132 +117,11 @@ void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
KASAN_PAGE_FREE, init);
}
-/*
- * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
- * For larger allocations larger redzones are used.
- */
-static inline unsigned int optimal_redzone(unsigned int object_size)
-{
- return
- object_size <= 64 - 16 ? 16 :
- object_size <= 128 - 32 ? 32 :
- object_size <= 512 - 64 ? 64 :
- object_size <= 4096 - 128 ? 128 :
- object_size <= (1 << 14) - 256 ? 256 :
- object_size <= (1 << 15) - 512 ? 512 :
- object_size <= (1 << 16) - 1024 ? 1024 : 2048;
-}
-
-void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
- slab_flags_t *flags)
-{
- unsigned int ok_size;
- unsigned int optimal_size;
-
- /*
- * SLAB_KASAN is used to mark caches as ones that are sanitized by
- * KASAN. Currently this flag is used in two places:
- * 1. In slab_ksize() when calculating the size of the accessible
- * memory within the object.
- * 2. In slab_common.c to prevent merging of sanitized caches.
- */
- *flags |= SLAB_KASAN;
-
- if (!kasan_stack_collection_enabled())
- return;
-
- ok_size = *size;
-
- /* Add alloc meta into redzone. */
- cache->kasan_info.alloc_meta_offset = *size;
- *size += sizeof(struct kasan_alloc_meta);
-
- /*
- * If alloc meta doesn't fit, don't add it.
- * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal
- * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for
- * larger sizes.
- */
- if (*size > KMALLOC_MAX_SIZE) {
- cache->kasan_info.alloc_meta_offset = 0;
- *size = ok_size;
- /* Continue, since free meta might still fit. */
- }
-
- /* Only the generic mode uses free meta or flexible redzones. */
- if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) {
- cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META;
- return;
- }
-
- /*
- * Add free meta into redzone when it's not possible to store
- * it in the object. This is the case when:
- * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can
- * be touched after it was freed, or
- * 2. Object has a constructor, which means it's expected to
- * retain its content until the next allocation, or
- * 3. Object is too small.
- * Otherwise cache->kasan_info.free_meta_offset = 0 is implied.
- */
- if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor ||
- cache->object_size < sizeof(struct kasan_free_meta)) {
- ok_size = *size;
-
- cache->kasan_info.free_meta_offset = *size;
- *size += sizeof(struct kasan_free_meta);
-
- /* If free meta doesn't fit, don't add it. */
- if (*size > KMALLOC_MAX_SIZE) {
- cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META;
- *size = ok_size;
- }
- }
-
- /* Calculate size with optimal redzone. */
- optimal_size = cache->object_size + optimal_redzone(cache->object_size);
- /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */
- if (optimal_size > KMALLOC_MAX_SIZE)
- optimal_size = KMALLOC_MAX_SIZE;
- /* Use optimal size if the size with added metas is not large enough. */
- if (*size < optimal_size)
- *size = optimal_size;
-}
-
void __kasan_cache_create_kmalloc(struct kmem_cache *cache)
{
cache->kasan_info.is_kmalloc = true;
}
-size_t __kasan_metadata_size(struct kmem_cache *cache)
-{
- if (!kasan_stack_collection_enabled())
- return 0;
- return (cache->kasan_info.alloc_meta_offset ?
- sizeof(struct kasan_alloc_meta) : 0) +
- (cache->kasan_info.free_meta_offset ?
- sizeof(struct kasan_free_meta) : 0);
-}
-
-struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
- const void *object)
-{
- if (!cache->kasan_info.alloc_meta_offset)
- return NULL;
- return kasan_reset_tag(object) + cache->kasan_info.alloc_meta_offset;
-}
-
-#ifdef CONFIG_KASAN_GENERIC
-struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
- const void *object)
-{
- BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
- if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META)
- return NULL;
- return kasan_reset_tag(object) + cache->kasan_info.free_meta_offset;
-}
-#endif
-
void __kasan_poison_slab(struct slab *slab)
{
struct page *page = slab_page(slab);
@@ -312,13 +187,9 @@ static inline u8 assign_tag(struct kmem_cache *cache,
void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
const void *object)
{
- struct kasan_alloc_meta *alloc_meta;
-
- if (kasan_stack_collection_enabled()) {
- alloc_meta = kasan_get_alloc_meta(cache, object);
- if (alloc_meta)
- __memset(alloc_meta, 0, sizeof(*alloc_meta));
- }
+ /* Initialize per-object metadata if it is present. */
+ if (kasan_requires_meta())
+ kasan_init_object_meta(cache, object);
/* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */
object = set_tag(object, assign_tag(cache, object, true));
@@ -329,13 +200,11 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
unsigned long ip, bool quarantine, bool init)
{
- u8 tag;
void *tagged_object;
if (!kasan_arch_is_ready())
return false;
- tag = get_tag(object);
tagged_object = object;
object = kasan_reset_tag(object);
@@ -364,7 +233,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
return false;
if (kasan_stack_collection_enabled())
- kasan_set_free_info(cache, object, tag);
+ kasan_save_free_info(cache, tagged_object);
return kasan_quarantine_put(cache, object);
}
@@ -423,20 +292,6 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
}
}
-static void set_alloc_info(struct kmem_cache *cache, void *object,
- gfp_t flags, bool is_kmalloc)
-{
- struct kasan_alloc_meta *alloc_meta;
-
- /* Don't save alloc info for kmalloc caches in kasan_slab_alloc(). */
- if (cache->kasan_info.is_kmalloc && !is_kmalloc)
- return;
-
- alloc_meta = kasan_get_alloc_meta(cache, object);
- if (alloc_meta)
- kasan_set_track(&alloc_meta->alloc_track, flags);
-}
-
void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
void *object, gfp_t flags, bool init)
{
@@ -466,8 +321,8 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
kasan_unpoison(tagged_object, cache->object_size, init);
/* Save alloc info (if possible) for non-kmalloc() allocations. */
- if (kasan_stack_collection_enabled())
- set_alloc_info(cache, (void *)object, flags, false);
+ if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc)
+ kasan_save_alloc_info(cache, tagged_object, flags);
return tagged_object;
}
@@ -512,8 +367,8 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache,
* Save alloc info (if possible) for kmalloc() allocations.
* This also rewrites the alloc info when called from kasan_krealloc().
*/
- if (kasan_stack_collection_enabled())
- set_alloc_info(cache, (void *)object, flags, true);
+ if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc)
+ kasan_save_alloc_info(cache, (void *)object, flags);
/* Keep the tag that was set by kasan_slab_alloc(). */
return (void *)object;
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 437fcc7e77cf..b076f597a378 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -328,6 +328,146 @@ DEFINE_ASAN_SET_SHADOW(f3);
DEFINE_ASAN_SET_SHADOW(f5);
DEFINE_ASAN_SET_SHADOW(f8);
+/* Only allow cache merging when no per-object metadata is present. */
+slab_flags_t kasan_never_merge(void)
+{
+ if (!kasan_requires_meta())
+ return 0;
+ return SLAB_KASAN;
+}
+
+/*
+ * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
+ * For larger allocations larger redzones are used.
+ */
+static inline unsigned int optimal_redzone(unsigned int object_size)
+{
+ return
+ object_size <= 64 - 16 ? 16 :
+ object_size <= 128 - 32 ? 32 :
+ object_size <= 512 - 64 ? 64 :
+ object_size <= 4096 - 128 ? 128 :
+ object_size <= (1 << 14) - 256 ? 256 :
+ object_size <= (1 << 15) - 512 ? 512 :
+ object_size <= (1 << 16) - 1024 ? 1024 : 2048;
+}
+
+void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
+ slab_flags_t *flags)
+{
+ unsigned int ok_size;
+ unsigned int optimal_size;
+
+ if (!kasan_requires_meta())
+ return;
+
+ /*
+ * SLAB_KASAN is used to mark caches that are sanitized by KASAN
+ * and that thus have per-object metadata.
+ * Currently this flag is used in two places:
+ * 1. In slab_ksize() to account for per-object metadata when
+ * calculating the size of the accessible memory within the object.
+ * 2. In slab_common.c via kasan_never_merge() to prevent merging of
+ * caches with per-object metadata.
+ */
+ *flags |= SLAB_KASAN;
+
+ ok_size = *size;
+
+ /* Add alloc meta into redzone. */
+ cache->kasan_info.alloc_meta_offset = *size;
+ *size += sizeof(struct kasan_alloc_meta);
+
+ /*
+ * If alloc meta doesn't fit, don't add it.
+ * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal
+ * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for
+ * larger sizes.
+ */
+ if (*size > KMALLOC_MAX_SIZE) {
+ cache->kasan_info.alloc_meta_offset = 0;
+ *size = ok_size;
+ /* Continue, since free meta might still fit. */
+ }
+
+ /*
+ * Add free meta into redzone when it's not possible to store
+ * it in the object. This is the case when:
+ * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can
+ * be touched after it was freed, or
+ * 2. Object has a constructor, which means it's expected to
+ * retain its content until the next allocation, or
+ * 3. Object is too small.
+ * Otherwise cache->kasan_info.free_meta_offset = 0 is implied.
+ */
+ if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor ||
+ cache->object_size < sizeof(struct kasan_free_meta)) {
+ ok_size = *size;
+
+ cache->kasan_info.free_meta_offset = *size;
+ *size += sizeof(struct kasan_free_meta);
+
+ /* If free meta doesn't fit, don't add it. */
+ if (*size > KMALLOC_MAX_SIZE) {
+ cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META;
+ *size = ok_size;
+ }
+ }
+
+ /* Calculate size with optimal redzone. */
+ optimal_size = cache->object_size + optimal_redzone(cache->object_size);
+ /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */
+ if (optimal_size > KMALLOC_MAX_SIZE)
+ optimal_size = KMALLOC_MAX_SIZE;
+ /* Use optimal size if the size with added metas is not large enough. */
+ if (*size < optimal_size)
+ *size = optimal_size;
+}
+
+struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
+ const void *object)
+{
+ if (!cache->kasan_info.alloc_meta_offset)
+ return NULL;
+ return (void *)object + cache->kasan_info.alloc_meta_offset;
+}
+
+struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
+ const void *object)
+{
+ BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
+ if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META)
+ return NULL;
+ return (void *)object + cache->kasan_info.free_meta_offset;
+}
+
+void kasan_init_object_meta(struct kmem_cache *cache, const void *object)
+{
+ struct kasan_alloc_meta *alloc_meta;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (alloc_meta)
+ __memset(alloc_meta, 0, sizeof(*alloc_meta));
+}
+
+size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object)
+{
+ struct kasan_cache *info = &cache->kasan_info;
+
+ if (!kasan_requires_meta())
+ return 0;
+
+ if (in_object)
+ return (info->free_meta_offset ?
+ 0 : sizeof(struct kasan_free_meta));
+ else
+ return (info->alloc_meta_offset ?
+ sizeof(struct kasan_alloc_meta) : 0) +
+ ((info->free_meta_offset &&
+ info->free_meta_offset != KASAN_NO_FREE_META) ?
+ sizeof(struct kasan_free_meta) : 0);
+}
+
static void __kasan_record_aux_stack(void *addr, bool can_alloc)
{
struct slab *slab = kasan_addr_to_slab(addr);
@@ -358,8 +498,16 @@ void kasan_record_aux_stack_noalloc(void *addr)
return __kasan_record_aux_stack(addr, false);
}
-void kasan_set_free_info(struct kmem_cache *cache,
- void *object, u8 tag)
+void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
+{
+ struct kasan_alloc_meta *alloc_meta;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (alloc_meta)
+ kasan_set_track(&alloc_meta->alloc_track, flags);
+}
+
+void kasan_save_free_info(struct kmem_cache *cache, void *object)
{
struct kasan_free_meta *free_meta;
@@ -371,12 +519,3 @@ void kasan_set_free_info(struct kmem_cache *cache,
/* The object was freed and has free track set. */
*(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK;
}
-
-struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
- void *object, u8 tag)
-{
- if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK)
- return NULL;
- /* Free meta must be present with KASAN_SLAB_FREETRACK. */
- return &kasan_get_free_meta(cache, object)->free_track;
-}
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 9ad8eff71b28..b22c4f461cb0 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -38,16 +38,9 @@ enum kasan_arg_vmalloc {
KASAN_ARG_VMALLOC_ON,
};
-enum kasan_arg_stacktrace {
- KASAN_ARG_STACKTRACE_DEFAULT,
- KASAN_ARG_STACKTRACE_OFF,
- KASAN_ARG_STACKTRACE_ON,
-};
-
static enum kasan_arg kasan_arg __ro_after_init;
static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata;
-static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata;
/*
* Whether KASAN is enabled at all.
@@ -66,9 +59,6 @@ EXPORT_SYMBOL_GPL(kasan_mode);
/* Whether to enable vmalloc tagging. */
DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
-/* Whether to collect alloc/free stack traces. */
-DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
-
/* kasan=off/on */
static int __init early_kasan_flag(char *arg)
{
@@ -122,23 +112,6 @@ static int __init early_kasan_flag_vmalloc(char *arg)
}
early_param("kasan.vmalloc", early_kasan_flag_vmalloc);
-/* kasan.stacktrace=off/on */
-static int __init early_kasan_flag_stacktrace(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (!strcmp(arg, "off"))
- kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF;
- else if (!strcmp(arg, "on"))
- kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON;
- else
- return -EINVAL;
-
- return 0;
-}
-early_param("kasan.stacktrace", early_kasan_flag_stacktrace);
-
static inline const char *kasan_mode_info(void)
{
if (kasan_mode == KASAN_MODE_ASYNC)
@@ -213,17 +186,7 @@ void __init kasan_init_hw_tags(void)
break;
}
- switch (kasan_arg_stacktrace) {
- case KASAN_ARG_STACKTRACE_DEFAULT:
- /* Default is specified by kasan_flag_stacktrace definition. */
- break;
- case KASAN_ARG_STACKTRACE_OFF:
- static_branch_disable(&kasan_flag_stacktrace);
- break;
- case KASAN_ARG_STACKTRACE_ON:
- static_branch_enable(&kasan_flag_stacktrace);
- break;
- }
+ kasan_init_tags();
/* KASAN is now initialized, enable it. */
static_branch_enable(&kasan_flag_enabled);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 01c03e45acd4..ea8cf1310b1e 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -2,18 +2,37 @@
#ifndef __MM_KASAN_KASAN_H
#define __MM_KASAN_KASAN_H
+#include <linux/atomic.h>
#include <linux/kasan.h>
#include <linux/kasan-tags.h>
#include <linux/kfence.h>
#include <linux/stackdepot.h>
-#ifdef CONFIG_KASAN_HW_TAGS
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
#include <linux/static_key.h>
+
+DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
+
+static inline bool kasan_stack_collection_enabled(void)
+{
+ return static_branch_unlikely(&kasan_flag_stacktrace);
+}
+
+#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
+
+static inline bool kasan_stack_collection_enabled(void)
+{
+ return true;
+}
+
+#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
+
+#ifdef CONFIG_KASAN_HW_TAGS
+
#include "../slab.h"
DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
-DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
enum kasan_mode {
KASAN_MODE_SYNC,
@@ -28,11 +47,6 @@ static inline bool kasan_vmalloc_enabled(void)
return static_branch_likely(&kasan_flag_vmalloc);
}
-static inline bool kasan_stack_collection_enabled(void)
-{
- return static_branch_unlikely(&kasan_flag_stacktrace);
-}
-
static inline bool kasan_async_fault_possible(void)
{
return kasan_mode == KASAN_MODE_ASYNC || kasan_mode == KASAN_MODE_ASYMM;
@@ -43,12 +57,7 @@ static inline bool kasan_sync_fault_possible(void)
return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM;
}
-#else
-
-static inline bool kasan_stack_collection_enabled(void)
-{
- return true;
-}
+#else /* CONFIG_KASAN_HW_TAGS */
static inline bool kasan_async_fault_possible(void)
{
@@ -60,7 +69,31 @@ static inline bool kasan_sync_fault_possible(void)
return true;
}
-#endif
+#endif /* CONFIG_KASAN_HW_TAGS */
+
+#ifdef CONFIG_KASAN_GENERIC
+
+/* Generic KASAN uses per-object metadata to store stack traces. */
+static inline bool kasan_requires_meta(void)
+{
+ /*
+ * Technically, Generic KASAN always collects stack traces right now.
+ * However, let's use kasan_stack_collection_enabled() in case the
+ * kasan.stacktrace command-line argument is changed to affect
+ * Generic KASAN.
+ */
+ return kasan_stack_collection_enabled();
+}
+
+#else /* CONFIG_KASAN_GENERIC */
+
+/* Tag-based KASAN modes do not use per-object metadata. */
+static inline bool kasan_requires_meta(void)
+{
+ return false;
+}
+
+#endif /* CONFIG_KASAN_GENERIC */
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
#define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
@@ -122,6 +155,13 @@ static inline bool kasan_sync_fault_possible(void)
#define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE)
#define META_ROWS_AROUND_ADDR 2
+#define KASAN_STACK_DEPTH 64
+
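+/* One recorded event: the pid of the task and a stack depot handle for its stack trace. */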
+struct kasan_track {
+ u32 pid;
+ depot_stack_handle_t stack;
+};
+
enum kasan_report_type {
KASAN_REPORT_ACCESS,
KASAN_REPORT_INVALID_FREE,
@@ -129,12 +169,22 @@ enum kasan_report_type {
};
struct kasan_report_info {
+ /* Filled in by kasan_report_*(). */
enum kasan_report_type type;
void *access_addr;
- void *first_bad_addr;
size_t access_size;
bool is_write;
unsigned long ip;
+
+ /* Filled in by the common reporting code. */
+ void *first_bad_addr;
+ struct kmem_cache *cache;
+ void *object;
+
+ /* Filled in by the mode-specific reporting code. */
+ const char *bug_type;
+ struct kasan_track alloc_track;
+ struct kasan_track free_track;
};
/* Do not change the struct layout: compiler ABI. */
@@ -160,33 +210,14 @@ struct kasan_global {
#endif
};
-/* Structures for keeping alloc and free tracks. */
-
-#define KASAN_STACK_DEPTH 64
+/* Structures for keeping alloc and free meta. */
-struct kasan_track {
- u32 pid;
- depot_stack_handle_t stack;
-};
-
-#if defined(CONFIG_KASAN_TAGS_IDENTIFY) && defined(CONFIG_KASAN_SW_TAGS)
-#define KASAN_NR_FREE_STACKS 5
-#else
-#define KASAN_NR_FREE_STACKS 1
-#endif
+#ifdef CONFIG_KASAN_GENERIC
struct kasan_alloc_meta {
struct kasan_track alloc_track;
- /* Generic mode stores free track in kasan_free_meta. */
-#ifdef CONFIG_KASAN_GENERIC
+ /* Free track is stored in kasan_free_meta. */
depot_stack_handle_t aux_stack[2];
-#else
- struct kasan_track free_track[KASAN_NR_FREE_STACKS];
-#endif
-#ifdef CONFIG_KASAN_TAGS_IDENTIFY
- u8 free_pointer_tag[KASAN_NR_FREE_STACKS];
- u8 free_track_idx;
-#endif
};
struct qlist_node {
@@ -205,26 +236,30 @@ struct qlist_node {
* After that, slab allocator stores the freelist pointer in the object.
*/
struct kasan_free_meta {
-#ifdef CONFIG_KASAN_GENERIC
struct qlist_node quarantine_link;
struct kasan_track free_track;
-#endif
};
-#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-/* Used in KUnit-compatible KASAN tests. */
-struct kunit_kasan_status {
- bool report_found;
- bool sync_fault;
+#endif /* CONFIG_KASAN_GENERIC */
+
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
+
+struct kasan_stack_ring_entry {
+ void *ptr;
+ size_t size;
+ u32 pid;
+ depot_stack_handle_t stack;
+ bool is_free;
};
-#endif
-struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
- const void *object);
-#ifdef CONFIG_KASAN_GENERIC
-struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
- const void *object);
-#endif
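+/* Ring buffer that the tag-based modes use to store alloc and free stack trace entries. */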
+struct kasan_stack_ring {
+ rwlock_t lock;
+ size_t size;
+ atomic64_t pos;
+ struct kasan_stack_ring_entry *entries;
+};
+
+#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
@@ -260,34 +295,50 @@ static inline bool addr_has_metadata(const void *addr)
#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
+void *kasan_find_first_bad_addr(void *addr, size_t size);
+void kasan_complete_mode_report_info(struct kasan_report_info *info);
+void kasan_metadata_fetch_row(char *buffer, void *row);
+
#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
void kasan_print_tags(u8 addr_tag, const void *addr);
#else
static inline void kasan_print_tags(u8 addr_tag, const void *addr) { }
#endif
-void *kasan_find_first_bad_addr(void *addr, size_t size);
-const char *kasan_get_bug_type(struct kasan_report_info *info);
-void kasan_metadata_fetch_row(char *buffer, void *row);
-
#if defined(CONFIG_KASAN_STACK)
void kasan_print_address_stack_frame(const void *addr);
#else
static inline void kasan_print_address_stack_frame(const void *addr) { }
#endif
+#ifdef CONFIG_KASAN_GENERIC
+void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object);
+#else
+static inline void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) { }
+#endif
+
bool kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type);
-struct page *kasan_addr_to_page(const void *addr);
struct slab *kasan_addr_to_slab(const void *addr);
+#ifdef CONFIG_KASAN_GENERIC
+void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size);
+void kasan_init_object_meta(struct kmem_cache *cache, const void *object);
+struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
+ const void *object);
+struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
+ const void *object);
+#else
+static inline void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) { }
+static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { }
+#endif
+
depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc);
void kasan_set_track(struct kasan_track *track, gfp_t flags);
-void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag);
-struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
- void *object, u8 tag);
+void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags);
+void kasan_save_free_info(struct kmem_cache *cache, void *object);
#if defined(CONFIG_KASAN_GENERIC) && \
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
@@ -358,6 +409,10 @@ static inline void kasan_enable_tagging(void) { }
#endif /* CONFIG_KASAN_HW_TAGS */
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
+void __init kasan_init_tags(void);
+#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
+
#if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
void kasan_force_async_fault(void);
@@ -486,6 +541,18 @@ static inline bool kasan_arch_is_ready(void) { return true; }
#error kasan_arch_is_ready only works in KASAN generic outline mode!
#endif
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+
+void kasan_kunit_test_suite_start(void);
+void kasan_kunit_test_suite_end(void);
+
+#else /* CONFIG_KASAN_KUNIT_TEST */
+
+static inline void kasan_kunit_test_suite_start(void) { }
+static inline void kasan_kunit_test_suite_end(void) { }
+
+#endif /* CONFIG_KASAN_KUNIT_TEST */
+
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST)
bool kasan_save_enable_multi_shot(void);
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
new file mode 100644
index 000000000000..74cd80c12b25
--- /dev/null
+++ b/mm/kasan/kasan_test.c
@@ -0,0 +1,1570 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ */
+
+#define pr_fmt(fmt) "kasan_test: " fmt
+
+#include <kunit/test.h>
+#include <linux/bitops.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/set_memory.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/tracepoint.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <trace/events/printk.h>
+
+#include <asm/page.h>
+
+#include "kasan.h"
+
+#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE)
+
+static bool multishot;
+
+/* Fields set based on lines observed in the console. */
+static struct {
+ bool report_found;
+ bool async_fault;
+} test_status;
+
+/*
+ * Some tests use these global variables to store return values from function
+ * calls that could otherwise be eliminated by the compiler as dead code.
+ */
+void *kasan_ptr_result;
+int kasan_int_result;
+
+/* Probe for console output: sets test_status fields based on lines of interest. */
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+ if (strnstr(buf, "BUG: KASAN: ", len))
+ WRITE_ONCE(test_status.report_found, true);
+ else if (strnstr(buf, "Asynchronous fault: ", len))
+ WRITE_ONCE(test_status.async_fault, true);
+}
+
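+/* Attach probe_console to the console tracepoint; done dynamically so the test can be built as a module. */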
+static void register_tracepoints(struct tracepoint *tp, void *ignore)
+{
+ check_trace_callback_type_console(probe_console);
+ if (!strcmp(tp->name, "console"))
+ WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
+}
+
+static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
+{
+ if (!strcmp(tp->name, "console"))
+ tracepoint_probe_unregister(tp, probe_console, NULL);
+}
+
+static int kasan_suite_init(struct kunit_suite *suite)
+{
+ if (!kasan_enabled()) {
+ pr_err("Can't run KASAN tests with KASAN disabled");
+ return -1;
+ }
+
+ /* Stop failing KUnit tests on KASAN reports. */
+ kasan_kunit_test_suite_start();
+
+ /*
+ * Temporarily enable multi-shot mode. Otherwise, KASAN would only
+ * report the first detected bug and panic the kernel if panic_on_warn
+ * is enabled.
+ */
+ multishot = kasan_save_enable_multi_shot();
+
+ /*
+ * Because we want to be able to build the test as a module, we need to
+ * iterate through all known tracepoints, since the static registration
+ * won't work here.
+ */
+ for_each_kernel_tracepoint(register_tracepoints, NULL);
+ return 0;
+}
+
+static void kasan_suite_exit(struct kunit_suite *suite)
+{
+ kasan_kunit_test_suite_end();
+ kasan_restore_multi_shot(multishot);
+ for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+ tracepoint_synchronize_unregister();
+}
+
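+/* Fail the test if a KASAN report was produced outside of a KUNIT_EXPECT_KASAN_FAIL check. */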
+static void kasan_test_exit(struct kunit *test)
+{
+ KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found));
+}
+
+/**
+ * KUNIT_EXPECT_KASAN_FAIL() - check that the executed expression produces a
+ * KASAN report; causes a test failure otherwise. This relies on the global
+ * test_status fields, which are set by probe_console(); do not modify those
+ * fields outside of the KASAN tests.
+ *
+ * For hardware tag-based KASAN, when a synchronous tag fault happens, tag
+ * checking is auto-disabled. When this happens, this test handler reenables
+ * tag checking. As tag checking can be only disabled or enabled per CPU,
+ * this handler disables migration (preemption).
+ *
+ * Since the compiler doesn't see that the expression can change the test_status
+ * fields, it can reorder or optimize away the accesses to those fields.
+ * Use READ/WRITE_ONCE() for the accesses and compiler barriers around the
+ * expression to prevent that.
+ *
+ * In between KUNIT_EXPECT_KASAN_FAIL checks, test_status.report_found is kept
+ * as false. This allows detecting KASAN reports that happen outside of the
+ * checks by asserting !test_status.report_found at the start of
+ * KUNIT_EXPECT_KASAN_FAIL and in kasan_test_exit.
+ */
+#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \
+ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \
+ kasan_sync_fault_possible()) \
+ migrate_disable(); \
+ KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found)); \
+ barrier(); \
+ expression; \
+ barrier(); \
+ if (kasan_async_fault_possible()) \
+ kasan_force_async_fault(); \
+ if (!READ_ONCE(test_status.report_found)) { \
+ KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure " \
+ "expected in \"" #expression \
+ "\", but none occurred"); \
+ } \
+ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \
+ kasan_sync_fault_possible()) { \
+ if (READ_ONCE(test_status.report_found) && \
+ !READ_ONCE(test_status.async_fault)) \
+ kasan_enable_tagging(); \
+ migrate_enable(); \
+ } \
+ WRITE_ONCE(test_status.report_found, false); \
+ WRITE_ONCE(test_status.async_fault, false); \
+} while (0)
+
+#define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \
+ if (!IS_ENABLED(config)) \
+ kunit_skip((test), "Test requires " #config "=y"); \
+} while (0)
+
+#define KASAN_TEST_NEEDS_CONFIG_OFF(test, config) do { \
+ if (IS_ENABLED(config)) \
+ kunit_skip((test), "Test requires " #config "=n"); \
+} while (0)
+
+static void kmalloc_oob_right(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE - 5;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ /*
+ * An unaligned access past the requested kmalloc size.
+ * Only generic KASAN can precisely detect these.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'x');
+
+ /*
+ * An aligned access into the first out-of-bounds granule that falls
+ * within the aligned kmalloc object.
+ */
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + 5] = 'y');
+
+ /* Out-of-bounds access past the aligned kmalloc object. */
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] =
+ ptr[size + KASAN_GRANULE_SIZE + 5]);
+
+ kfree(ptr);
+}
+
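+/* Check that an out-of-bounds read one byte before a kmalloc object is detected. */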
+static void kmalloc_oob_left(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 15;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, *ptr = *(ptr - 1));
+ kfree(ptr);
+}
+
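+/* Check that an out-of-bounds access right after a kmalloc_node() object is detected. */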
+static void kmalloc_node_oob_right(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 4096;
+
+ ptr = kmalloc_node(size, GFP_KERNEL, 0);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]);
+ kfree(ptr);
+}
+
+/*
+ * These kmalloc_pagealloc_* tests try allocating a memory chunk that doesn't
+ * fit into a slab cache and therefore is allocated via the page allocator
+ * fallback. Since this kind of fallback is only implemented for SLUB, these
+ * tests are limited to that allocator.
+ */
+static void kmalloc_pagealloc_oob_right(struct kunit *test)
+{
+ char *ptr;
+ size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + OOB_TAG_OFF] = 0);
+
+ kfree(ptr);
+}
+
+static void kmalloc_pagealloc_uaf(struct kunit *test)
+{
+ char *ptr;
+ size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ kfree(ptr);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+}
+
+static void kmalloc_pagealloc_invalid_free(struct kunit *test)
+{
+ char *ptr;
+ size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kfree(ptr + 1));
+}
+
+static void pagealloc_oob_right(struct kunit *test)
+{
+ char *ptr;
+ struct page *pages;
+ size_t order = 4;
+ size_t size = (1UL << (PAGE_SHIFT + order));
+
+ /*
+ * With generic KASAN, page allocations have no redzones, thus
+ * out-of-bounds detection is not guaranteed.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=210503.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ pages = alloc_pages(GFP_KERNEL, order);
+ ptr = page_address(pages);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]);
+ free_pages((unsigned long)ptr, order);
+}
+
+static void pagealloc_uaf(struct kunit *test)
+{
+ char *ptr;
+ struct page *pages;
+ size_t order = 4;
+
+ pages = alloc_pages(GFP_KERNEL, order);
+ ptr = page_address(pages);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ free_pages((unsigned long)ptr, order);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+}
+
+static void kmalloc_large_oob_right(struct kunit *test)
+{
+ char *ptr;
+ size_t size = KMALLOC_MAX_CACHE_SIZE - 256;
+
+ /*
+ * Allocate a chunk that is large enough, but still fits into a slab
+ * and does not trigger the page allocator fallback in SLUB.
+ */
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0);
+ kfree(ptr);
+}
+
+static void krealloc_more_oob_helper(struct kunit *test,
+ size_t size1, size_t size2)
+{
+ char *ptr1, *ptr2;
+ size_t middle;
+
+ KUNIT_ASSERT_LT(test, size1, size2);
+ middle = size1 + (size2 - size1) / 2;
+
+ ptr1 = kmalloc(size1, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+
+ ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+
+ /* Suppress -Warray-bounds warnings. */
+ OPTIMIZER_HIDE_VAR(ptr2);
+
+ /* All offsets up to size2 must be accessible. */
+ ptr2[size1 - 1] = 'x';
+ ptr2[size1] = 'x';
+ ptr2[middle] = 'x';
+ ptr2[size2 - 1] = 'x';
+
+ /* Generic mode is precise, so unaligned size2 must be inaccessible. */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2] = 'x');
+
+ /* For all modes first aligned offset after size2 must be inaccessible. */
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ ptr2[round_up(size2, KASAN_GRANULE_SIZE)] = 'x');
+
+ kfree(ptr2);
+}
+
+static void krealloc_less_oob_helper(struct kunit *test,
+ size_t size1, size_t size2)
+{
+ char *ptr1, *ptr2;
+ size_t middle;
+
+ KUNIT_ASSERT_LT(test, size2, size1);
+ middle = size2 + (size1 - size2) / 2;
+
+ ptr1 = kmalloc(size1, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+
+ ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+
+ /* Suppress -Warray-bounds warnings. */
+ OPTIMIZER_HIDE_VAR(ptr2);
+
+ /* Must be accessible for all modes. */
+ ptr2[size2 - 1] = 'x';
+
+ /* Generic mode is precise, so unaligned size2 must be inaccessible. */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2] = 'x');
+
+ /* For all modes first aligned offset after size2 must be inaccessible. */
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ ptr2[round_up(size2, KASAN_GRANULE_SIZE)] = 'x');
+
+ /*
+ * For all modes all size2, middle, and size1 should land in separate
+ * granules and thus the latter two offsets should be inaccessible.
+ */
+ KUNIT_EXPECT_LE(test, round_up(size2, KASAN_GRANULE_SIZE),
+ round_down(middle, KASAN_GRANULE_SIZE));
+ KUNIT_EXPECT_LE(test, round_up(middle, KASAN_GRANULE_SIZE),
+ round_down(size1, KASAN_GRANULE_SIZE));
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[middle] = 'x');
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size1 - 1] = 'x');
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size1] = 'x');
+
+ kfree(ptr2);
+}
+
+static void krealloc_more_oob(struct kunit *test)
+{
+ krealloc_more_oob_helper(test, 201, 235);
+}
+
+static void krealloc_less_oob(struct kunit *test)
+{
+ krealloc_less_oob_helper(test, 235, 201);
+}
+
+static void krealloc_pagealloc_more_oob(struct kunit *test)
+{
+ /* page_alloc fallback is only implemented for SLUB. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
+
+ krealloc_more_oob_helper(test, KMALLOC_MAX_CACHE_SIZE + 201,
+ KMALLOC_MAX_CACHE_SIZE + 235);
+}
+
+static void krealloc_pagealloc_less_oob(struct kunit *test)
+{
+ /* page_alloc fallback is only implemented for SLUB. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
+
+ krealloc_less_oob_helper(test, KMALLOC_MAX_CACHE_SIZE + 235,
+ KMALLOC_MAX_CACHE_SIZE + 201);
+}
+
+/*
+ * Check that krealloc() detects a use-after-free, returns NULL,
+ * and doesn't unpoison the freed object.
+ */
+static void krealloc_uaf(struct kunit *test)
+{
+ char *ptr1, *ptr2;
+ int size1 = 201;
+ int size2 = 235;
+
+ ptr1 = kmalloc(size1, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+ kfree(ptr1);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL));
+ KUNIT_ASSERT_NULL(test, ptr2);
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)ptr1);
+}
+
+static void kmalloc_oob_16(struct kunit *test)
+{
+ struct {
+ u64 words[2];
+ } *ptr1, *ptr2;
+
+ /* This test is specifically crafted for the generic mode. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+
+ ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+
+ ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+
+ OPTIMIZER_HIDE_VAR(ptr1);
+ OPTIMIZER_HIDE_VAR(ptr2);
+ KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2);
+ kfree(ptr1);
+ kfree(ptr2);
+}
+
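+/* Check that a 16-byte read from a freed object is detected. */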
+static void kmalloc_uaf_16(struct kunit *test)
+{
+ struct {
+ u64 words[2];
+ } *ptr1, *ptr2;
+
+ ptr1 = kmalloc(sizeof(*ptr1), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+
+ ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+ kfree(ptr2);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2);
+ kfree(ptr1);
+}
+
+/*
+ * Note: in the memset tests below, the written range touches both valid and
+ * invalid memory. This makes sure that the instrumentation does not only check
+ * the starting address but the whole range.
+ */
+
+static void kmalloc_oob_memset_2(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 1, 0, 2));
+ kfree(ptr);
+}
+
+static void kmalloc_oob_memset_4(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 3, 0, 4));
+ kfree(ptr);
+}
+
+static void kmalloc_oob_memset_8(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 7, 0, 8));
+ kfree(ptr);
+}
+
+static void kmalloc_oob_memset_16(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 15, 0, 16));
+ kfree(ptr);
+}
+
+static void kmalloc_oob_in_memset(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ memset(ptr, 0, size + KASAN_GRANULE_SIZE));
+ kfree(ptr);
+}
+
+static void kmalloc_memmove_negative_size(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 64;
+ size_t invalid_size = -2;
+
+ /*
+ * Hardware tag-based mode doesn't check memmove for negative size.
+ * As a result, this test introduces a side-effect memory corruption,
+ * which can result in a crash.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_HW_TAGS);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ memset((char *)ptr, 0, 64);
+ OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(invalid_size);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ memmove((char *)ptr, (char *)ptr + 4, invalid_size));
+ kfree(ptr);
+}
+
+static void kmalloc_memmove_invalid_size(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 64;
+ size_t invalid_size = size;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ memset((char *)ptr, 0, 64);
+ OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(invalid_size);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ memmove((char *)ptr, (char *)ptr + 4, invalid_size));
+ kfree(ptr);
+}
+
+static void kmalloc_uaf(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 10;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ kfree(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[8]);
+}
+
+static void kmalloc_uaf_memset(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 33;
+
+ /*
+ * Only generic KASAN uses quarantine, which is required to avoid the
+ * kernel memory corruption that this test causes.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ kfree(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size));
+}
+
+static void kmalloc_uaf2(struct kunit *test)
+{
+ char *ptr1, *ptr2;
+ size_t size = 43;
+ int counter = 0;
+
+again:
+ ptr1 = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+
+ kfree(ptr1);
+
+ ptr2 = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+
+ /*
+ * For tag-based KASAN ptr1 and ptr2 tags might happen to be the same.
+ * Allow up to 16 attempts at generating different tags.
+ */
+ if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && ptr1 == ptr2 && counter++ < 16) {
+ kfree(ptr2);
+ goto again;
+ }
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[40]);
+ KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2);
+
+ kfree(ptr2);
+}
+
+/*
+ * Check that KASAN detects use-after-free when another object was allocated in
+ * the same slot. Relevant for the tag-based modes, which do not use quarantine.
+ */
+static void kmalloc_uaf3(struct kunit *test)
+{
+ char *ptr1, *ptr2;
+ size_t size = 100;
+
+ /* This test is specifically crafted for tag-based modes. */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ ptr1 = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+ kfree(ptr1);
+
+ ptr2 = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+ kfree(ptr2);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]);
+}
+
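+/* Check that kfree() on a pointer recovered via virt_to_page()/page_address() does not produce a false-positive report. */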
+static void kfree_via_page(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 8;
+ struct page *page;
+ unsigned long offset;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ page = virt_to_page(ptr);
+ offset = offset_in_page(ptr);
+ kfree(page_address(page) + offset);
+}
+
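+/* Check that kfree() on a pointer recovered via virt_to_phys()/phys_to_virt() does not produce a false-positive report. */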
+static void kfree_via_phys(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 8;
+ phys_addr_t phys;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ phys = virt_to_phys(ptr);
+ kfree(phys_to_virt(phys));
+}
+
+static void kmem_cache_oob(struct kunit *test)
+{
+ char *p;
+ size_t size = 200;
+ struct kmem_cache *cache;
+
+ cache = kmem_cache_create("test_cache", size, 0, 0, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ kmem_cache_destroy(cache);
+ return;
+ }
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *p = p[size + OOB_TAG_OFF]);
+
+ kmem_cache_free(cache, p);
+ kmem_cache_destroy(cache);
+}
+
+static void kmem_cache_accounted(struct kunit *test)
+{
+ int i;
+ char *p;
+ size_t size = 200;
+ struct kmem_cache *cache;
+
+ cache = kmem_cache_create("test_cache", size, 0, SLAB_ACCOUNT, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ /*
+ * Several allocations with a delay to allow for lazy per memcg kmem
+ * cache creation.
+ */
+ for (i = 0; i < 5; i++) {
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p)
+ goto free_cache;
+
+ kmem_cache_free(cache, p);
+ msleep(100);
+ }
+
+free_cache:
+ kmem_cache_destroy(cache);
+}
+
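+/* Check that bulk-allocated objects are unpoisoned: in-bounds writes must not be reported. */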
+static void kmem_cache_bulk(struct kunit *test)
+{
+ struct kmem_cache *cache;
+ size_t size = 200;
+ char *p[10];
+ bool ret;
+ int i;
+
+ cache = kmem_cache_create("test_cache", size, 0, 0, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ ret = kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(p), (void **)&p);
+ if (!ret) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ kmem_cache_destroy(cache);
+ return;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(p); i++)
+ p[i][0] = p[i][size - 1] = 42;
+
+ kmem_cache_free_bulk(cache, ARRAY_SIZE(p), (void **)&p);
+ kmem_cache_destroy(cache);
+}
+
+static char global_array[10];
+
+static void kasan_global_oob_right(struct kunit *test)
+{
+ /*
+ * Deliberate out-of-bounds access. To prevent CONFIG_UBSAN_LOCAL_BOUNDS
+ * from failing here and panicking the kernel, access the array via a
+ * volatile pointer, which will prevent the compiler from being able to
+ * determine the array bounds.
+ *
+ * This access uses a volatile pointer to char (char *volatile) rather
+ * than the more conventional pointer to volatile char (volatile char *)
+ * because we want to prevent the compiler from making inferences about
+ * the pointer itself (i.e. its array bounds), not the data that it
+ * refers to.
+ */
+ char *volatile array = global_array;
+ char *p = &array[ARRAY_SIZE(global_array) + 3];
+
+ /* Only generic mode instruments globals. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
+}
+
+static void kasan_global_oob_left(struct kunit *test)
+{
+ char *volatile array = global_array;
+ char *p = array - 3;
+
+ /*
+ * GCC is known to fail this test, skip it.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=215051.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_CC_IS_CLANG);
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
+}
+
+/* Check that ksize() does NOT unpoison the whole object. */
+static void ksize_unpoisons_memory(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE - 5;
+ size_t real_size;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ real_size = ksize(ptr);
+ KUNIT_EXPECT_GT(test, real_size, size);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+
+ /* These accesses shouldn't trigger a KASAN report. */
+ ptr[0] = 'x';
+ ptr[size - 1] = 'x';
+
+ /* These must trigger a KASAN report. */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]);
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size + 5]);
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size - 1]);
+
+ kfree(ptr);
+}
+
+/*
+ * Check that a use-after-free is detected by ksize() and via normal accesses
+ * after it.
+ */
+static void ksize_uaf(struct kunit *test)
+{
+ char *ptr;
+ int size = 128 - KASAN_GRANULE_SIZE;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ kfree(ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr));
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]);
+}
+
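+/* Check that an out-of-bounds access to a stack array is detected (requires CONFIG_KASAN_STACK). */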
+static void kasan_stack_oob(struct kunit *test)
+{
+ char stack_array[10];
+ /* See comment in kasan_global_oob_right. */
+ char *volatile array = stack_array;
+ char *p = &array[ARRAY_SIZE(stack_array) + OOB_TAG_OFF];
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
+}
+
+static void kasan_alloca_oob_left(struct kunit *test)
+{
+ volatile int i = 10;
+ char alloca_array[i];
+ /* See comment in kasan_global_oob_right. */
+ char *volatile array = alloca_array;
+ char *p = array - 1;
+
+ /* Only generic mode instruments dynamic allocas. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
+}
+
+static void kasan_alloca_oob_right(struct kunit *test)
+{
+ volatile int i = 10;
+ char alloca_array[i];
+ /* See comment in kasan_global_oob_right. */
+ char *volatile array = alloca_array;
+ char *p = array + i;
+
+ /* Only generic mode instruments dynamic allocas. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
+}
+
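+/* Check that a double-free of a slab object is detected. */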
+static void kmem_cache_double_free(struct kunit *test)
+{
+ char *p;
+ size_t size = 200;
+ struct kmem_cache *cache;
+
+ cache = kmem_cache_create("test_cache", size, 0, 0, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ kmem_cache_destroy(cache);
+ return;
+ }
+
+ kmem_cache_free(cache, p);
+ KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_free(cache, p));
+ kmem_cache_destroy(cache);
+}
+
+static void kmem_cache_invalid_free(struct kunit *test)
+{
+ char *p;
+ size_t size = 200;
+ struct kmem_cache *cache;
+
+ cache = kmem_cache_create("test_cache", size, 0, SLAB_TYPESAFE_BY_RCU,
+ NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ kmem_cache_destroy(cache);
+ return;
+ }
+
+ /* Trigger invalid free, the object doesn't get freed. */
+ KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_free(cache, p + 1));
+
+ /*
+ * Properly free the object to prevent the "Objects remaining in
+ * test_cache on __kmem_cache_shutdown" BUG failure.
+ */
+ kmem_cache_free(cache, p);
+
+ kmem_cache_destroy(cache);
+}
+
+static void empty_cache_ctor(void *object) { }
+
+static void kmem_cache_double_destroy(struct kunit *test)
+{
+ struct kmem_cache *cache;
+
+ /* Provide a constructor to prevent cache merging. */
+ cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+ kmem_cache_destroy(cache);
+ KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache));
+}
+
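+/* Check that an out-of-bounds access within memchr() is detected. */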
+static void kasan_memchr(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 24;
+
+ /*
+ * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT);
+
+ if (OOB_TAG_OFF)
+ size = round_up(size, OOB_TAG_OFF);
+
+ ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ kasan_ptr_result = memchr(ptr, '1', size + 1));
+
+ kfree(ptr);
+}
+
+static void kasan_memcmp(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 24;
+ int arr[9];
+
+ /*
+ * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT);
+
+ if (OOB_TAG_OFF)
+ size = round_up(size, OOB_TAG_OFF);
+
+ ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ memset(arr, 0, sizeof(arr));
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ kasan_int_result = memcmp(ptr, arr, size+1));
+ kfree(ptr);
+}
+
+static void kasan_strings(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 24;
+
+ /*
+ * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT);
+
+ ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ kfree(ptr);
+
+ /*
+ * Try to cause only one invalid access (less spam in dmesg).
+ * For that, ptr needs to point to a zeroed byte. Skip the metadata
+ * that could be stored in the freed object, so that ptr will likely
+ * point to a zeroed byte.
+ */
+ ptr += 16;
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strchr(ptr, '1'));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strrchr(ptr, '1'));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strcmp(ptr, "2"));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strncmp(ptr, "2", 1));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strlen(ptr));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strnlen(ptr, 1));
+}
+
+static void kasan_bitops_modify(struct kunit *test, int nr, void *addr)
+{
+ KUNIT_EXPECT_KASAN_FAIL(test, set_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __set_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, clear_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, clear_bit_unlock(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit_unlock(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, change_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __change_bit(nr, addr));
+}
+
+static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr)
+{
+ KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __test_and_set_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit_lock(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, test_and_clear_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __test_and_clear_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, test_and_change_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr));
+
+#if defined(clear_bit_unlock_is_negative_byte)
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result =
+ clear_bit_unlock_is_negative_byte(nr, addr));
+#endif
+}
+
+static void kasan_bitops_generic(struct kunit *test)
+{
+ long *bits;
+
+ /* This test is specifically crafted for the generic mode. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+
+ /*
+ * Allocate 1 more byte, which causes kzalloc to round up to 16 bytes;
+ * this way we do not actually corrupt other memory.
+ */
+ bits = kzalloc(sizeof(*bits) + 1, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bits);
+
+ /*
+ * Below calls try to access bit within allocated memory; however, the
+ * below accesses are still out-of-bounds, since bitops are defined to
+ * operate on the whole long the bit is in.
+ */
+ kasan_bitops_modify(test, BITS_PER_LONG, bits);
+
+ /*
+ * Below calls try to access bit beyond allocated memory.
+ */
+ kasan_bitops_test_and_modify(test, BITS_PER_LONG + BITS_PER_BYTE, bits);
+
+ kfree(bits);
+}
+
+static void kasan_bitops_tags(struct kunit *test)
+{
+ long *bits;
+
+ /* This test is specifically crafted for tag-based modes. */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ /* kmalloc-64 cache will be used and the last 16 bytes will be the redzone. */
+ bits = kzalloc(48, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bits);
+
+ /* Do the accesses past the 48 allocated bytes, but within the redzone. */
+ kasan_bitops_modify(test, BITS_PER_LONG, (void *)bits + 48);
+ kasan_bitops_test_and_modify(test, BITS_PER_LONG + BITS_PER_BYTE, (void *)bits + 48);
+
+ kfree(bits);
+}
+
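+/* Check that a second kfree_sensitive() of the same object is detected. */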
+static void kmalloc_double_kzfree(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 16;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ kfree_sensitive(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr));
+}
+
+/*
+ * The two tests below check that Generic KASAN prints auxiliary stack traces
+ * for RCU callbacks and workqueues. The reports need to be inspected manually.
+ *
+ * These tests are still enabled for other KASAN modes to make sure that all
+ * modes report bad accesses in tested scenarios.
+ */
+
+static struct kasan_rcu_info {
+ int i;
+ struct rcu_head rcu;
+} *global_rcu_ptr;
+
+static void rcu_uaf_reclaim(struct rcu_head *rp)
+{
+ struct kasan_rcu_info *fp =
+ container_of(rp, struct kasan_rcu_info, rcu);
+
+ kfree(fp);
+ ((volatile struct kasan_rcu_info *)fp)->i;
+}
+
+static void rcu_uaf(struct kunit *test)
+{
+ struct kasan_rcu_info *ptr;
+
+ ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ global_rcu_ptr = rcu_dereference_protected(
+ (struct kasan_rcu_info __rcu *)ptr, NULL);
+
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ call_rcu(&global_rcu_ptr->rcu, rcu_uaf_reclaim);
+ rcu_barrier());
+}
+
+static void workqueue_uaf_work(struct work_struct *work)
+{
+ kfree(work);
+}
+
+static void workqueue_uaf(struct kunit *test)
+{
+ struct workqueue_struct *workqueue;
+ struct work_struct *work;
+
+ workqueue = create_workqueue("kasan_workqueue_test");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, workqueue);
+
+ work = kmalloc(sizeof(struct work_struct), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, work);
+
+ INIT_WORK(work, workqueue_uaf_work);
+ queue_work(workqueue, work);
+ destroy_workqueue(workqueue);
+
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ ((volatile struct work_struct *)work)->data);
+}
+
+static void vmalloc_helpers_tags(struct kunit *test)
+{
+ void *ptr;
+
+ /* This test is intended for tag-based modes. */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC);
+
+ ptr = vmalloc(PAGE_SIZE);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ /* Check that the returned pointer is tagged. */
+ KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* Make sure exported vmalloc helpers handle tagged pointers. */
+ KUNIT_ASSERT_TRUE(test, is_vmalloc_addr(ptr));
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, vmalloc_to_page(ptr));
+
+#if !IS_MODULE(CONFIG_KASAN_KUNIT_TEST)
+ {
+ int rv;
+
+ /* Make sure vmalloc'ed memory permissions can be changed. */
+ rv = set_memory_ro((unsigned long)ptr, 1);
+ KUNIT_ASSERT_GE(test, rv, 0);
+ rv = set_memory_rw((unsigned long)ptr, 1);
+ KUNIT_ASSERT_GE(test, rv, 0);
+ }
+#endif
+
+ vfree(ptr);
+}
+
+static void vmalloc_oob(struct kunit *test)
+{
+ char *v_ptr, *p_ptr;
+ struct page *page;
+ size_t size = PAGE_SIZE / 2 - KASAN_GRANULE_SIZE - 5;
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC);
+
+ v_ptr = vmalloc(size);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
+
+ OPTIMIZER_HIDE_VAR(v_ptr);
+
+ /*
+ * We have to be careful not to hit the guard page in vmalloc tests.
+ * The MMU will catch that and crash us.
+ */
+
+ /* Make sure in-bounds accesses are valid. */
+ v_ptr[0] = 0;
+ v_ptr[size - 1] = 0;
+
+ /*
+ * An unaligned access past the requested vmalloc size.
+ * Only generic KASAN can precisely detect these.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]);
+
+ /* An aligned access into the first out-of-bounds granule. */
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size + 5]);
+
+ /* Check that in-bounds accesses to the physical page are valid. */
+ page = vmalloc_to_page(v_ptr);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page);
+ p_ptr = page_address(page);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr);
+ p_ptr[0] = 0;
+
+ vfree(v_ptr);
+
+ /*
+ * We can't check for use-after-unmap bugs in this nor in the following
+ * vmalloc tests, as the page might be fully unmapped and accessing it
+ * will crash the kernel.
+ */
+}
+
+static void vmap_tags(struct kunit *test)
+{
+ char *p_ptr, *v_ptr;
+ struct page *p_page, *v_page;
+
+ /*
+ * This test is specifically crafted for the software tag-based mode,
+ * the only tag-based mode that poisons vmap mappings.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC);
+
+ p_page = alloc_pages(GFP_KERNEL, 1);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_page);
+ p_ptr = page_address(p_page);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr);
+
+ v_ptr = vmap(&p_page, 1, VM_MAP, PAGE_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
+
+ /*
+ * We can't check for out-of-bounds bugs in this nor in the following
+ * vmalloc tests, as allocations have page granularity and accessing
+ * the guard page will crash the kernel.
+ */
+
+ KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* Make sure that in-bounds accesses through both pointers work. */
+ *p_ptr = 0;
+ *v_ptr = 0;
+
+ /* Make sure vmalloc_to_page() correctly recovers the page pointer. */
+ v_page = vmalloc_to_page(v_ptr);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_page);
+ KUNIT_EXPECT_PTR_EQ(test, p_page, v_page);
+
+ vunmap(v_ptr);
+ free_pages((unsigned long)p_ptr, 1);
+}
+
+static void vm_map_ram_tags(struct kunit *test)
+{
+ char *p_ptr, *v_ptr;
+ struct page *page;
+
+ /*
+ * This test is specifically crafted for the software tag-based mode,
+ * the only tag-based mode that poisons vm_map_ram mappings.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
+
+ page = alloc_pages(GFP_KERNEL, 1);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page);
+ p_ptr = page_address(page);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr);
+
+ v_ptr = vm_map_ram(&page, 1, -1);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
+
+ KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* Make sure that in-bounds accesses through both pointers work. */
+ *p_ptr = 0;
+ *v_ptr = 0;
+
+ vm_unmap_ram(v_ptr, 1);
+ free_pages((unsigned long)p_ptr, 1);
+}
+
+static void vmalloc_percpu(struct kunit *test)
+{
+ char __percpu *ptr;
+ int cpu;
+
+ /*
+ * This test is specifically crafted for the software tag-based mode,
+ * the only tag-based mode that poisons percpu mappings.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
+
+ ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+
+ for_each_possible_cpu(cpu) {
+ char *c_ptr = per_cpu_ptr(ptr, cpu);
+
+ KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* Make sure that in-bounds accesses don't crash the kernel. */
+ *c_ptr = 0;
+ }
+
+ free_percpu(ptr);
+}
+
+/*
+ * Check that the assigned pointer tag falls within the [KASAN_TAG_MIN,
+ * KASAN_TAG_KERNEL) range (note: excluding the match-all tag) for tag-based
+ * modes.
+ */
+static void match_all_not_assigned(struct kunit *test)
+{
+ char *ptr;
+ struct page *pages;
+ int i, size, order;
+
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ for (i = 0; i < 256; i++) {
+ size = get_random_u32_inclusive(1, 1024);
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+ kfree(ptr);
+ }
+
+ for (i = 0; i < 256; i++) {
+ order = get_random_u32_inclusive(1, 4);
+ pages = alloc_pages(GFP_KERNEL, order);
+ ptr = page_address(pages);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+ free_pages((unsigned long)ptr, order);
+ }
+
+ if (!IS_ENABLED(CONFIG_KASAN_VMALLOC))
+ return;
+
+ for (i = 0; i < 256; i++) {
+ size = get_random_u32_inclusive(1, 1024);
+ ptr = vmalloc(size);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+ vfree(ptr);
+ }
+}
+
+/* Check that 0xff works as a match-all pointer tag for tag-based modes. */
+static void match_all_ptr_tag(struct kunit *test)
+{
+ char *ptr;
+ u8 tag;
+
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ ptr = kmalloc(128, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ /* Backup the assigned tag. */
+ tag = get_tag(ptr);
+ KUNIT_EXPECT_NE(test, tag, (u8)KASAN_TAG_KERNEL);
+
+ /* Reset the tag to 0xff. */
+ ptr = set_tag(ptr, KASAN_TAG_KERNEL);
+
+ /* This access shouldn't trigger a KASAN report. */
+ *ptr = 0;
+
+ /* Recover the pointer tag and free. */
+ ptr = set_tag(ptr, tag);
+ kfree(ptr);
+}
+
+/* Check that there are no match-all memory tags for tag-based modes. */
+static void match_all_mem_tag(struct kunit *test)
+{
+ char *ptr;
+ int tag;
+
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ ptr = kmalloc(128, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_NE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* For each possible tag value not matching the pointer tag. */
+ for (tag = KASAN_TAG_MIN; tag <= KASAN_TAG_KERNEL; tag++) {
+ if (tag == get_tag(ptr))
+ continue;
+
+ /* Mark the first memory granule with the chosen memory tag. */
+ kasan_poison(ptr, KASAN_GRANULE_SIZE, (u8)tag, false);
+
+ /* This access must cause a KASAN report. */
+ KUNIT_EXPECT_KASAN_FAIL(test, *ptr = 0);
+ }
+
+ /* Recover the memory tag and free. */
+ kasan_poison(ptr, KASAN_GRANULE_SIZE, get_tag(ptr), false);
+ kfree(ptr);
+}
+
+static struct kunit_case kasan_kunit_test_cases[] = {
+ KUNIT_CASE(kmalloc_oob_right),
+ KUNIT_CASE(kmalloc_oob_left),
+ KUNIT_CASE(kmalloc_node_oob_right),
+ KUNIT_CASE(kmalloc_pagealloc_oob_right),
+ KUNIT_CASE(kmalloc_pagealloc_uaf),
+ KUNIT_CASE(kmalloc_pagealloc_invalid_free),
+ KUNIT_CASE(pagealloc_oob_right),
+ KUNIT_CASE(pagealloc_uaf),
+ KUNIT_CASE(kmalloc_large_oob_right),
+ KUNIT_CASE(krealloc_more_oob),
+ KUNIT_CASE(krealloc_less_oob),
+ KUNIT_CASE(krealloc_pagealloc_more_oob),
+ KUNIT_CASE(krealloc_pagealloc_less_oob),
+ KUNIT_CASE(krealloc_uaf),
+ KUNIT_CASE(kmalloc_oob_16),
+ KUNIT_CASE(kmalloc_uaf_16),
+ KUNIT_CASE(kmalloc_oob_in_memset),
+ KUNIT_CASE(kmalloc_oob_memset_2),
+ KUNIT_CASE(kmalloc_oob_memset_4),
+ KUNIT_CASE(kmalloc_oob_memset_8),
+ KUNIT_CASE(kmalloc_oob_memset_16),
+ KUNIT_CASE(kmalloc_memmove_negative_size),
+ KUNIT_CASE(kmalloc_memmove_invalid_size),
+ KUNIT_CASE(kmalloc_uaf),
+ KUNIT_CASE(kmalloc_uaf_memset),
+ KUNIT_CASE(kmalloc_uaf2),
+ KUNIT_CASE(kmalloc_uaf3),
+ KUNIT_CASE(kfree_via_page),
+ KUNIT_CASE(kfree_via_phys),
+ KUNIT_CASE(kmem_cache_oob),
+ KUNIT_CASE(kmem_cache_accounted),
+ KUNIT_CASE(kmem_cache_bulk),
+ KUNIT_CASE(kasan_global_oob_right),
+ KUNIT_CASE(kasan_global_oob_left),
+ KUNIT_CASE(kasan_stack_oob),
+ KUNIT_CASE(kasan_alloca_oob_left),
+ KUNIT_CASE(kasan_alloca_oob_right),
+ KUNIT_CASE(ksize_unpoisons_memory),
+ KUNIT_CASE(ksize_uaf),
+ KUNIT_CASE(kmem_cache_double_free),
+ KUNIT_CASE(kmem_cache_invalid_free),
+ KUNIT_CASE(kmem_cache_double_destroy),
+ KUNIT_CASE(kasan_memchr),
+ KUNIT_CASE(kasan_memcmp),
+ KUNIT_CASE(kasan_strings),
+ KUNIT_CASE(kasan_bitops_generic),
+ KUNIT_CASE(kasan_bitops_tags),
+ KUNIT_CASE(kmalloc_double_kzfree),
+ KUNIT_CASE(rcu_uaf),
+ KUNIT_CASE(workqueue_uaf),
+ KUNIT_CASE(vmalloc_helpers_tags),
+ KUNIT_CASE(vmalloc_oob),
+ KUNIT_CASE(vmap_tags),
+ KUNIT_CASE(vm_map_ram_tags),
+ KUNIT_CASE(vmalloc_percpu),
+ KUNIT_CASE(match_all_not_assigned),
+ KUNIT_CASE(match_all_ptr_tag),
+ KUNIT_CASE(match_all_mem_tag),
+ {}
+};
+
+static struct kunit_suite kasan_kunit_test_suite = {
+ .name = "kasan",
+ .test_cases = kasan_kunit_test_cases,
+ .exit = kasan_test_exit,
+ .suite_init = kasan_suite_init,
+ .suite_exit = kasan_suite_exit,
+};
+
+kunit_test_suite(kasan_kunit_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c
new file mode 100644
index 000000000000..7be7bed456ef
--- /dev/null
+++ b/mm/kasan/kasan_test_module.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ */
+
+#define pr_fmt(fmt) "kasan test: %s " fmt, __func__
+
+#include <linux/mman.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "kasan.h"
+
+static noinline void __init copy_user_test(void)
+{
+ char *kmem;
+ char __user *usermem;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+ int __maybe_unused unused;
+
+ kmem = kmalloc(size, GFP_KERNEL);
+ if (!kmem)
+ return;
+
+ usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0);
+ if (IS_ERR(usermem)) {
+ pr_err("Failed to allocate user memory\n");
+ kfree(kmem);
+ return;
+ }
+
+ OPTIMIZER_HIDE_VAR(size);
+
+ pr_info("out-of-bounds in copy_from_user()\n");
+ unused = copy_from_user(kmem, usermem, size + 1);
+
+ pr_info("out-of-bounds in copy_to_user()\n");
+ unused = copy_to_user(usermem, kmem, size + 1);
+
+ pr_info("out-of-bounds in __copy_from_user()\n");
+ unused = __copy_from_user(kmem, usermem, size + 1);
+
+ pr_info("out-of-bounds in __copy_to_user()\n");
+ unused = __copy_to_user(usermem, kmem, size + 1);
+
+ pr_info("out-of-bounds in __copy_from_user_inatomic()\n");
+ unused = __copy_from_user_inatomic(kmem, usermem, size + 1);
+
+ pr_info("out-of-bounds in __copy_to_user_inatomic()\n");
+ unused = __copy_to_user_inatomic(usermem, kmem, size + 1);
+
+ pr_info("out-of-bounds in strncpy_from_user()\n");
+ unused = strncpy_from_user(kmem, usermem, size + 1);
+
+ vm_munmap((unsigned long)usermem, PAGE_SIZE);
+ kfree(kmem);
+}
+
+static int __init test_kasan_module_init(void)
+{
+ /*
+ * Temporarily enable multi-shot mode. Otherwise, KASAN would only
+ * report the first detected bug and panic the kernel if panic_on_warn
+ * is enabled.
+ */
+ bool multishot = kasan_save_enable_multi_shot();
+
+ copy_user_test();
+
+ kasan_restore_multi_shot(multishot);
+ return -EAGAIN;
+}
+
+module_init(test_kasan_module_init);
+MODULE_LICENSE("GPL");
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index fe3f606b3a98..1d02757e90a3 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -9,6 +9,7 @@
* Andrey Konovalov <andreyknvl@gmail.com>
*/
+#include <kunit/test.h>
#include <linux/bitops.h>
#include <linux/ftrace.h>
#include <linux/init.h>
@@ -30,8 +31,6 @@
#include <asm/sections.h>
-#include <kunit/test.h>
-
#include "kasan.h"
#include "../slab.h"
@@ -115,40 +114,63 @@ EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
#endif
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-static void update_kunit_status(bool sync)
+
+/*
+ * Whether the KASAN KUnit test suite is currently being executed.
+ * Updated in kasan_test.c.
+ */
+bool kasan_kunit_executing;
+
+void kasan_kunit_test_suite_start(void)
+{
+ WRITE_ONCE(kasan_kunit_executing, true);
+}
+EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_start);
+
+void kasan_kunit_test_suite_end(void)
+{
+ WRITE_ONCE(kasan_kunit_executing, false);
+}
+EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_end);
+
+static bool kasan_kunit_test_suite_executing(void)
+{
+ return READ_ONCE(kasan_kunit_executing);
+}
+
+#else /* CONFIG_KASAN_KUNIT_TEST */
+
+static inline bool kasan_kunit_test_suite_executing(void) { return false; }
+
+#endif /* CONFIG_KASAN_KUNIT_TEST */
+
+#if IS_ENABLED(CONFIG_KUNIT)
+
+static void fail_non_kasan_kunit_test(void)
{
struct kunit *test;
- struct kunit_resource *resource;
- struct kunit_kasan_status *status;
- test = current->kunit_test;
- if (!test)
+ if (kasan_kunit_test_suite_executing())
return;
- resource = kunit_find_named_resource(test, "kasan_status");
- if (!resource) {
+ test = current->kunit_test;
+ if (test)
kunit_set_failure(test);
- return;
- }
+}
- status = (struct kunit_kasan_status *)resource->data;
- WRITE_ONCE(status->report_found, true);
- WRITE_ONCE(status->sync_fault, sync);
+#else /* CONFIG_KUNIT */
- kunit_put_resource(resource);
-}
-#else
-static void update_kunit_status(bool sync) { }
-#endif
+static inline void fail_non_kasan_kunit_test(void) { }
+
+#endif /* CONFIG_KUNIT */
static DEFINE_SPINLOCK(report_lock);
static void start_report(unsigned long *flags, bool sync)
{
+ fail_non_kasan_kunit_test();
/* Respect the /proc/sys/kernel/traceoff_on_warning interface. */
disable_trace_on_warning();
- /* Update status of the currently running KASAN test. */
- update_kunit_status(sync);
/* Do not allow LOCKDEP mangling KASAN reports. */
lockdep_off();
/* Make sure we don't end up in loop. */
@@ -164,8 +186,8 @@ static void end_report(unsigned long *flags, void *addr)
(unsigned long)addr);
pr_err("==================================================================\n");
spin_unlock_irqrestore(&report_lock, *flags);
- if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
- panic("panic_on_warn set ...\n");
+ if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
+ check_panic_on_warn("KASAN");
if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC)
panic("kasan.fault=panic set ...\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
@@ -175,18 +197,14 @@ static void end_report(unsigned long *flags, void *addr)
static void print_error_description(struct kasan_report_info *info)
{
- if (info->type == KASAN_REPORT_INVALID_FREE) {
- pr_err("BUG: KASAN: invalid-free in %pS\n", (void *)info->ip);
- return;
- }
+ pr_err("BUG: KASAN: %s in %pS\n", info->bug_type, (void *)info->ip);
- if (info->type == KASAN_REPORT_DOUBLE_FREE) {
- pr_err("BUG: KASAN: double-free in %pS\n", (void *)info->ip);
+ if (info->type != KASAN_REPORT_ACCESS) {
+ pr_err("Free of addr %px by task %s/%d\n",
+ info->access_addr, current->comm, task_pid_nr(current));
return;
}
- pr_err("BUG: KASAN: %s in %pS\n",
- kasan_get_bug_type(info), (void *)info->ip);
if (info->access_size)
pr_err("%s of size %zu at addr %px by task %s/%d\n",
info->is_write ? "Write" : "Read", info->access_size,
@@ -200,31 +218,21 @@ static void print_error_description(struct kasan_report_info *info)
static void print_track(struct kasan_track *track, const char *prefix)
{
pr_err("%s by task %u:\n", prefix, track->pid);
- if (track->stack) {
+ if (track->stack)
stack_depot_print(track->stack);
- } else {
+ else
pr_err("(stack is not available)\n");
- }
}
-struct page *kasan_addr_to_page(const void *addr)
+static inline struct page *addr_to_page(const void *addr)
{
- if ((addr >= (void *)PAGE_OFFSET) &&
- (addr < high_memory))
+ if (virt_addr_valid(addr))
return virt_to_head_page(addr);
return NULL;
}
-struct slab *kasan_addr_to_slab(const void *addr)
-{
- if ((addr >= (void *)PAGE_OFFSET) &&
- (addr < high_memory))
- return virt_to_slab(addr);
- return NULL;
-}
-
-static void describe_object_addr(struct kmem_cache *cache, void *object,
- const void *addr)
+static void describe_object_addr(const void *addr, struct kmem_cache *cache,
+ void *object)
{
unsigned long access_addr = (unsigned long)addr;
unsigned long object_addr = (unsigned long)object;
@@ -252,46 +260,26 @@ static void describe_object_addr(struct kmem_cache *cache, void *object,
(void *)(object_addr + cache->object_size));
}
-static void describe_object_stacks(struct kmem_cache *cache, void *object,
- const void *addr, u8 tag)
+static void describe_object_stacks(struct kasan_report_info *info)
{
- struct kasan_alloc_meta *alloc_meta;
- struct kasan_track *free_track;
-
- alloc_meta = kasan_get_alloc_meta(cache, object);
- if (alloc_meta) {
- print_track(&alloc_meta->alloc_track, "Allocated");
+ if (info->alloc_track.stack) {
+ print_track(&info->alloc_track, "Allocated");
pr_err("\n");
}
- free_track = kasan_get_free_track(cache, object, tag);
- if (free_track) {
- print_track(free_track, "Freed");
+ if (info->free_track.stack) {
+ print_track(&info->free_track, "Freed");
pr_err("\n");
}
-#ifdef CONFIG_KASAN_GENERIC
- if (!alloc_meta)
- return;
- if (alloc_meta->aux_stack[0]) {
- pr_err("Last potentially related work creation:\n");
- stack_depot_print(alloc_meta->aux_stack[0]);
- pr_err("\n");
- }
- if (alloc_meta->aux_stack[1]) {
- pr_err("Second to last potentially related work creation:\n");
- stack_depot_print(alloc_meta->aux_stack[1]);
- pr_err("\n");
- }
-#endif
+ kasan_print_aux_stacks(info->cache, info->object);
}
-static void describe_object(struct kmem_cache *cache, void *object,
- const void *addr, u8 tag)
+static void describe_object(const void *addr, struct kasan_report_info *info)
{
if (kasan_stack_collection_enabled())
- describe_object_stacks(cache, object, addr, tag);
- describe_object_addr(cache, object, addr);
+ describe_object_stacks(info);
+ describe_object_addr(addr, info->cache, info->object);
}
static inline bool kernel_or_module_addr(const void *addr)
@@ -310,19 +298,16 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
-static void print_address_description(void *addr, u8 tag)
+static void print_address_description(void *addr, u8 tag,
+ struct kasan_report_info *info)
{
- struct page *page = kasan_addr_to_page(addr);
+ struct page *page = addr_to_page(addr);
dump_stack_lvl(KERN_ERR);
pr_err("\n");
- if (page && PageSlab(page)) {
- struct slab *slab = page_slab(page);
- struct kmem_cache *cache = slab->slab_cache;
- void *object = nearest_obj(cache, slab, addr);
-
- describe_object(cache, object, addr, tag);
+ if (info->cache && info->object) {
+ describe_object(addr, info);
pr_err("\n");
}
@@ -420,23 +405,56 @@ static void print_memory_metadata(const void *addr)
static void print_report(struct kasan_report_info *info)
{
- void *tagged_addr = info->access_addr;
- void *untagged_addr = kasan_reset_tag(tagged_addr);
- u8 tag = get_tag(tagged_addr);
+ void *addr = kasan_reset_tag(info->access_addr);
+ u8 tag = get_tag(info->access_addr);
print_error_description(info);
- if (addr_has_metadata(untagged_addr))
+ if (addr_has_metadata(addr))
kasan_print_tags(tag, info->first_bad_addr);
pr_err("\n");
- if (addr_has_metadata(untagged_addr)) {
- print_address_description(untagged_addr, tag);
+ if (addr_has_metadata(addr)) {
+ print_address_description(addr, tag, info);
print_memory_metadata(info->first_bad_addr);
} else {
dump_stack_lvl(KERN_ERR);
}
}
+static void complete_report_info(struct kasan_report_info *info)
+{
+ void *addr = kasan_reset_tag(info->access_addr);
+ struct slab *slab;
+
+ if (info->type == KASAN_REPORT_ACCESS)
+ info->first_bad_addr = kasan_find_first_bad_addr(
+ info->access_addr, info->access_size);
+ else
+ info->first_bad_addr = addr;
+
+ slab = kasan_addr_to_slab(addr);
+ if (slab) {
+ info->cache = slab->slab_cache;
+ info->object = nearest_obj(info->cache, slab, addr);
+ } else
+ info->cache = info->object = NULL;
+
+ switch (info->type) {
+ case KASAN_REPORT_INVALID_FREE:
+ info->bug_type = "invalid-free";
+ break;
+ case KASAN_REPORT_DOUBLE_FREE:
+ info->bug_type = "double-free";
+ break;
+ default:
+ /* bug_type filled in by kasan_complete_mode_report_info. */
+ break;
+ }
+
+ /* Fill in mode-specific report info fields. */
+ kasan_complete_mode_report_info(info);
+}
+
void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type)
{
unsigned long flags;
@@ -452,13 +470,15 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty
start_report(&flags, true);
+ memset(&info, 0, sizeof(info));
info.type = type;
info.access_addr = ptr;
- info.first_bad_addr = kasan_reset_tag(ptr);
info.access_size = 0;
info.is_write = false;
info.ip = ip;
+ complete_report_info(&info);
+
print_report(&info);
end_report(&flags, ptr);
@@ -485,13 +505,15 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write,
start_report(&irq_flags, true);
+ memset(&info, 0, sizeof(info));
info.type = KASAN_REPORT_ACCESS;
info.access_addr = ptr;
- info.first_bad_addr = kasan_find_first_bad_addr(ptr, size);
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
+ complete_report_info(&info);
+
print_report(&info);
end_report(&irq_flags, ptr);
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
index 6689fb9a919b..043c94b04605 100644
--- a/mm/kasan/report_generic.c
+++ b/mm/kasan/report_generic.c
@@ -109,7 +109,7 @@ static const char *get_wild_bug_type(struct kasan_report_info *info)
return bug_type;
}
-const char *kasan_get_bug_type(struct kasan_report_info *info)
+static const char *get_bug_type(struct kasan_report_info *info)
{
/*
* If access_size is a negative number, then it has reason to be
@@ -127,11 +127,55 @@ const char *kasan_get_bug_type(struct kasan_report_info *info)
return get_wild_bug_type(info);
}
+void kasan_complete_mode_report_info(struct kasan_report_info *info)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ struct kasan_free_meta *free_meta;
+
+ if (!info->bug_type)
+ info->bug_type = get_bug_type(info);
+
+ if (!info->cache || !info->object)
+ return;
+
+ alloc_meta = kasan_get_alloc_meta(info->cache, info->object);
+ if (alloc_meta)
+ memcpy(&info->alloc_track, &alloc_meta->alloc_track,
+ sizeof(info->alloc_track));
+
+ if (*(u8 *)kasan_mem_to_shadow(info->object) == KASAN_SLAB_FREETRACK) {
+ /* Free meta must be present with KASAN_SLAB_FREETRACK. */
+ free_meta = kasan_get_free_meta(info->cache, info->object);
+ memcpy(&info->free_track, &free_meta->free_track,
+ sizeof(info->free_track));
+ }
+}
+
void kasan_metadata_fetch_row(char *buffer, void *row)
{
memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW);
}
+void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object)
+{
+ struct kasan_alloc_meta *alloc_meta;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (!alloc_meta)
+ return;
+
+ if (alloc_meta->aux_stack[0]) {
+ pr_err("Last potentially related work creation:\n");
+ stack_depot_print(alloc_meta->aux_stack[0]);
+ pr_err("\n");
+ }
+ if (alloc_meta->aux_stack[1]) {
+ pr_err("Second to last potentially related work creation:\n");
+ stack_depot_print(alloc_meta->aux_stack[1]);
+ pr_err("\n");
+ }
+}
+
#ifdef CONFIG_KASAN_STACK
static bool __must_check tokenize_frame_descr(const char **frame_descr,
char *token, size_t max_tok_len,
diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c
index e25d2166e813..ecede06ef374 100644
--- a/mm/kasan/report_tags.c
+++ b/mm/kasan/report_tags.c
@@ -4,38 +4,14 @@
* Copyright (c) 2020 Google, Inc.
*/
+#include <linux/atomic.h>
+
#include "kasan.h"
-#include "../slab.h"
-const char *kasan_get_bug_type(struct kasan_report_info *info)
-{
-#ifdef CONFIG_KASAN_TAGS_IDENTIFY
- struct kasan_alloc_meta *alloc_meta;
- struct kmem_cache *cache;
- struct slab *slab;
- const void *addr;
- void *object;
- u8 tag;
- int i;
-
- tag = get_tag(info->access_addr);
- addr = kasan_reset_tag(info->access_addr);
- slab = kasan_addr_to_slab(addr);
- if (slab) {
- cache = slab->slab_cache;
- object = nearest_obj(cache, slab, (void *)addr);
- alloc_meta = kasan_get_alloc_meta(cache, object);
-
- if (alloc_meta) {
- for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
- if (alloc_meta->free_pointer_tag[i] == tag)
- return "use-after-free";
- }
- }
- return "out-of-bounds";
- }
-#endif
+extern struct kasan_stack_ring stack_ring;
+static const char *get_common_bug_type(struct kasan_report_info *info)
+{
/*
* If access_size is a negative number, then it has reason to be
* defined as out-of-bounds bug type.
@@ -49,3 +25,92 @@ const char *kasan_get_bug_type(struct kasan_report_info *info)
return "invalid-access";
}
+
+void kasan_complete_mode_report_info(struct kasan_report_info *info)
+{
+ unsigned long flags;
+ u64 pos;
+ struct kasan_stack_ring_entry *entry;
+ void *ptr;
+ u32 pid;
+ depot_stack_handle_t stack;
+ bool is_free;
+ bool alloc_found = false, free_found = false;
+
+ if ((!info->cache || !info->object) && !info->bug_type) {
+ info->bug_type = get_common_bug_type(info);
+ return;
+ }
+
+ write_lock_irqsave(&stack_ring.lock, flags);
+
+ pos = atomic64_read(&stack_ring.pos);
+
+ /*
+ * The loop below tries to find stack ring entries relevant to the
+ * buggy object. This is a best-effort process.
+ *
+ * First, another object with the same tag can be allocated in place of
+ * the buggy object. Also, since the number of entries is limited, the
+ * entries relevant to the buggy object can be overwritten.
+ */
+
+ for (u64 i = pos - 1; i != pos - 1 - stack_ring.size; i--) {
+ if (alloc_found && free_found)
+ break;
+
+ entry = &stack_ring.entries[i % stack_ring.size];
+
+ /* Paired with smp_store_release() in save_stack_info(). */
+ ptr = (void *)smp_load_acquire(&entry->ptr);
+
+ if (kasan_reset_tag(ptr) != info->object ||
+ get_tag(ptr) != get_tag(info->access_addr))
+ continue;
+
+ pid = READ_ONCE(entry->pid);
+ stack = READ_ONCE(entry->stack);
+ is_free = READ_ONCE(entry->is_free);
+
+ if (is_free) {
+ /*
+ * Second free of the same object.
+ * Give up on trying to find the alloc entry.
+ */
+ if (free_found)
+ break;
+
+ info->free_track.pid = pid;
+ info->free_track.stack = stack;
+ free_found = true;
+
+ /*
+ * If a free entry is found first, the bug is likely
+ * a use-after-free.
+ */
+ if (!info->bug_type)
+ info->bug_type = "use-after-free";
+ } else {
+ /* Second alloc of the same object. Give up. */
+ if (alloc_found)
+ break;
+
+ info->alloc_track.pid = pid;
+ info->alloc_track.stack = stack;
+ alloc_found = true;
+
+ /*
+ * If an alloc entry is found first, the bug is likely
+ * an out-of-bounds.
+ */
+ if (!info->bug_type)
+ info->bug_type = "slab-out-of-bounds";
+ }
+ }
+
+ write_unlock_irqrestore(&stack_ring.lock, flags);
+
+ /* Assign the common bug type if no entries were found. */
+ if (!info->bug_type)
+ info->bug_type = get_common_bug_type(info);
+}
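The walk above is easier to follow with the ring viewed in isolation: writers only ever append at pos and wrap modulo size, so the reader starts at the most recent slot (pos - 1) and visits exactly size slots, newest first, stopping early once both an alloc and a free entry for the object have been seen. A reduced, userspace-only illustration of that traversal follows (it matches on the raw pointer only; the kernel code above also compares tags and records pid and stack handles):

#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 8			/* kernel default is 32 << 10 entries */

struct entry {
	const void *ptr;
	int is_free;
};

static struct entry ring[RING_SIZE];
static uint64_t pos;			/* total writes, monotonically increasing */

static void record(const void *ptr, int is_free)
{
	ring[pos % RING_SIZE] = (struct entry){ .ptr = ptr, .is_free = is_free };
	pos++;
}

static void find(const void *ptr)
{
	/* Visit at most RING_SIZE slots, newest first. */
	for (uint64_t i = pos - 1; i != pos - 1 - RING_SIZE; i--) {
		struct entry *e = &ring[i % RING_SIZE];

		if (e->ptr != ptr)
			continue;
		printf("%p: most recent entry is %s\n",
		       ptr, e->is_free ? "a free" : "an alloc");
		return;
	}
	printf("%p: no entry (never recorded or already overwritten)\n", ptr);
}

int main(void)
{
	int objects[2];

	record(&objects[0], 0);		/* alloc */
	record(&objects[1], 0);		/* alloc */
	record(&objects[1], 1);		/* free */

	find(&objects[1]);		/* free entry found first: likely use-after-free */
	find(&objects[0]);		/* only an alloc entry: likely out-of-bounds */
	return 0;
}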
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 0e3648b603a6..2fba1f51f042 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -244,7 +244,7 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb,
static int __init kasan_memhotplug_init(void)
{
- hotplug_memory_notifier(kasan_mem_notifier, 0);
+ hotplug_memory_notifier(kasan_mem_notifier, DEFAULT_CALLBACK_PRI);
return 0;
}
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index 77f13f391b57..a3afaf2ad1b1 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -42,7 +42,10 @@ void __init kasan_init_sw_tags(void)
for_each_possible_cpu(cpu)
per_cpu(prng_state, cpu) = (u32)get_cycles();
- pr_info("KernelAddressSanitizer initialized (sw-tags)\n");
+ kasan_init_tags();
+
+ pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n",
+ kasan_stack_collection_enabled() ? "on" : "off");
}
/*
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 8f48b9502a17..67a222586846 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -6,9 +6,11 @@
* Copyright (c) 2020 Google, Inc.
*/
+#include <linux/atomic.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
+#include <linux/memblock.h>
#include <linux/memory.h>
#include <linux/mm.h>
#include <linux/static_key.h>
@@ -16,44 +18,127 @@
#include <linux/types.h>
#include "kasan.h"
+#include "../slab.h"
-void kasan_set_free_info(struct kmem_cache *cache,
- void *object, u8 tag)
-{
- struct kasan_alloc_meta *alloc_meta;
- u8 idx = 0;
+#define KASAN_STACK_RING_SIZE_DEFAULT (32 << 10)
+
+enum kasan_arg_stacktrace {
+ KASAN_ARG_STACKTRACE_DEFAULT,
+ KASAN_ARG_STACKTRACE_OFF,
+ KASAN_ARG_STACKTRACE_ON,
+};
+
+static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata;
+
+/* Whether to collect alloc/free stack traces. */
+DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
- alloc_meta = kasan_get_alloc_meta(cache, object);
- if (!alloc_meta)
- return;
+/* Non-zero, as initial pointer values are 0. */
+#define STACK_RING_BUSY_PTR ((void *)1)
+
+struct kasan_stack_ring stack_ring = {
+ .lock = __RW_LOCK_UNLOCKED(stack_ring.lock)
+};
+
+/* kasan.stacktrace=off/on */
+static int __init early_kasan_flag_stacktrace(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
-#ifdef CONFIG_KASAN_TAGS_IDENTIFY
- idx = alloc_meta->free_track_idx;
- alloc_meta->free_pointer_tag[idx] = tag;
- alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
-#endif
+ if (!strcmp(arg, "off"))
+ kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF;
+ else if (!strcmp(arg, "on"))
+ kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON;
+ else
+ return -EINVAL;
- kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
+ return 0;
}
+early_param("kasan.stacktrace", early_kasan_flag_stacktrace);
-struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
- void *object, u8 tag)
+/* kasan.stack_ring_size=<number of entries> */
+static int __init early_kasan_flag_stack_ring_size(char *arg)
{
- struct kasan_alloc_meta *alloc_meta;
- int i = 0;
+ if (!arg)
+ return -EINVAL;
- alloc_meta = kasan_get_alloc_meta(cache, object);
- if (!alloc_meta)
- return NULL;
+ return kstrtoul(arg, 0, &stack_ring.size);
+}
+early_param("kasan.stack_ring_size", early_kasan_flag_stack_ring_size);
+
+void __init kasan_init_tags(void)
+{
+ switch (kasan_arg_stacktrace) {
+ case KASAN_ARG_STACKTRACE_DEFAULT:
+ /* Default is specified by kasan_flag_stacktrace definition. */
+ break;
+ case KASAN_ARG_STACKTRACE_OFF:
+ static_branch_disable(&kasan_flag_stacktrace);
+ break;
+ case KASAN_ARG_STACKTRACE_ON:
+ static_branch_enable(&kasan_flag_stacktrace);
+ break;
+ }
-#ifdef CONFIG_KASAN_TAGS_IDENTIFY
- for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
- if (alloc_meta->free_pointer_tag[i] == tag)
- break;
+ if (kasan_stack_collection_enabled()) {
+ if (!stack_ring.size)
+ stack_ring.size = KASAN_STACK_RING_SIZE_DEFAULT;
+ stack_ring.entries = memblock_alloc(
+ sizeof(stack_ring.entries[0]) * stack_ring.size,
+ SMP_CACHE_BYTES);
+ if (WARN_ON(!stack_ring.entries))
+ static_branch_disable(&kasan_flag_stacktrace);
}
- if (i == KASAN_NR_FREE_STACKS)
- i = alloc_meta->free_track_idx;
-#endif
+}
+
+static void save_stack_info(struct kmem_cache *cache, void *object,
+ gfp_t gfp_flags, bool is_free)
+{
+ unsigned long flags;
+ depot_stack_handle_t stack;
+ u64 pos;
+ struct kasan_stack_ring_entry *entry;
+ void *old_ptr;
+
+ stack = kasan_save_stack(gfp_flags, true);
+
+ /*
+ * Prevent save_stack_info() from modifying stack ring
+ * when kasan_complete_mode_report_info() is walking it.
+ */
+ read_lock_irqsave(&stack_ring.lock, flags);
+
+next:
+ pos = atomic64_fetch_add(1, &stack_ring.pos);
+ entry = &stack_ring.entries[pos % stack_ring.size];
+
+ /* Detect stack ring entry slots that are being written to. */
+ old_ptr = READ_ONCE(entry->ptr);
+ if (old_ptr == STACK_RING_BUSY_PTR)
+ goto next; /* Busy slot. */
+ if (!try_cmpxchg(&entry->ptr, &old_ptr, STACK_RING_BUSY_PTR))
+ goto next; /* Busy slot. */
+
+ WRITE_ONCE(entry->size, cache->object_size);
+ WRITE_ONCE(entry->pid, current->pid);
+ WRITE_ONCE(entry->stack, stack);
+ WRITE_ONCE(entry->is_free, is_free);
+
+ /*
+ * Paired with smp_load_acquire() in kasan_complete_mode_report_info().
+ */
+ smp_store_release(&entry->ptr, (void *)object);
- return &alloc_meta->free_track[i];
+ read_unlock_irqrestore(&stack_ring.lock, flags);
+}
+
+void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
+{
+ save_stack_info(cache, object, flags, false);
+}
+
+void kasan_save_free_info(struct kmem_cache *cache, void *object)
+{
+ save_stack_info(cache, object, GFP_NOWAIT, true);
}
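Two details of the stack ring code above are easy to miss. First, the rwlock is used in an inverted sense: save_stack_info() (potentially many concurrent producers) takes the lock for reading, so producers never serialize against each other, while the reporting walk in report_tags.c takes it for writing and thereby excludes all producers for the duration of the walk; ownership of an individual slot among producers is handled separately by the STACK_RING_BUSY_PTR try_cmpxchg(). Second, both new knobs are early parameters and can be set on the kernel command line, e.g. kasan.stacktrace=on kasan.stack_ring_size=65536 (the ring size is a number of entries, not bytes; the values here are only examples).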
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index c252081b11df..5349c37a5dac 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -26,7 +26,6 @@
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/sched/clock.h>
-#include <linux/sched/sysctl.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -360,9 +359,9 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
unsigned long flags;
struct slab *slab;
void *addr;
- const bool random_right_allocate = prandom_u32_max(2);
+ const bool random_right_allocate = get_random_u32_below(2);
const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS &&
- !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS);
+ !get_random_u32_below(CONFIG_KFENCE_STRESS_TEST_FAULTS);
/* Try to obtain a free object. */
raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
@@ -719,24 +718,13 @@ static int show_object(struct seq_file *seq, void *v)
return 0;
}
-static const struct seq_operations object_seqops = {
+static const struct seq_operations objects_sops = {
.start = start_object,
.next = next_object,
.stop = stop_object,
.show = show_object,
};
-
-static int open_objects(struct inode *inode, struct file *file)
-{
- return seq_open(file, &object_seqops);
-}
-
-static const struct file_operations objects_fops = {
- .open = open_objects,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+DEFINE_SEQ_ATTRIBUTE(objects);
static int __init kfence_debugfs_init(void)
{
@@ -810,16 +798,7 @@ static void toggle_allocation_gate(struct work_struct *work)
/* Enable static key, and await allocation to happen. */
static_branch_enable(&kfence_allocation_key);
- if (sysctl_hung_task_timeout_secs) {
- /*
- * During low activity with no allocations we might wait a
- * while; let's avoid the hung task warning.
- */
- wait_event_idle_timeout(allocation_wait, atomic_read(&kfence_allocation_gate),
- sysctl_hung_task_timeout_secs * HZ / 2);
- } else {
- wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate));
- }
+ wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate));
/* Disable static key and reset timer. */
static_branch_disable(&kfence_allocation_key);
@@ -864,7 +843,7 @@ static void kfence_init_enable(void)
void __init kfence_init(void)
{
- stack_hash_seed = (u32)random_get_entropy();
+ stack_hash_seed = get_random_u32();
/* Setting kfence_sample_interval to 0 on boot disables KFENCE. */
if (!kfence_sample_interval)
@@ -1003,6 +982,13 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
return NULL;
}
+ /*
+ * Skip allocations for this slab, if KFENCE has been disabled for
+ * this slab.
+ */
+ if (s->flags & SLAB_SKIP_KFENCE)
+ return NULL;
+
if (atomic_inc_return(&kfence_allocation_gate) > 1)
return NULL;
#ifdef CONFIG_KFENCE_STATIC_KEYS
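The random-API conversions above and in the kfence_test.c hunk below are mechanical but worth spelling out once: prandom_u32_max(n) returned a value in [0, n), so get_random_u32_below(n) is a drop-in replacement, and an expression like 8 + prandom_u32_max(300) (range [8, 307]) becomes get_random_u32_inclusive(8, 307), which names the closed range directly instead of encoding it as a base plus an exclusive bound.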
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index a97bffe0cc3e..b5d66a69200d 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -532,8 +532,8 @@ static void test_free_bulk(struct kunit *test)
int iter;
for (iter = 0; iter < 5; iter++) {
- const size_t size = setup_test_cache(test, 8 + prandom_u32_max(300), 0,
- (iter & 1) ? ctor_set_x : NULL);
+ const size_t size = setup_test_cache(test, get_random_u32_inclusive(8, 307),
+ 0, (iter & 1) ? ctor_set_x : NULL);
void *objects[] = {
test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT),
test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE),
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index f5a6d8ba3e21..60205f1257ef 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -75,15 +75,21 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfence_") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "__kfence_") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmem_cache_free") ||
!strncmp(buf, ARCH_FUNC_PREFIX "__slab_free", len)) {
/*
- * In case of tail calls from any of the below
- * to any of the above.
+ * In case of tail calls from any of the below to any of
+ * the above, optimized by the compiler such that the
+ * stack trace would omit the initial entry point below.
*/
fallback = skipnr + 1;
}
- /* Also the *_bulk() variants by only checking prefixes. */
+ /*
+ * The below list should only include the initial entry points
+ * into the slab allocators. Includes the *_bulk() variants by
+ * checking prefixes.
+ */
if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") ||
@@ -267,8 +273,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r
lockdep_on();
- if (panic_on_warn)
- panic("panic_on_warn set ...\n");
+ check_panic_on_warn("KFENCE");
/* We encountered a memory safety error, taint the kernel! */
add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 01f71786d530..5cb401aa2b9d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -23,16 +23,20 @@
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
+#include "mm_slot.h"
enum scan_result {
SCAN_FAIL,
SCAN_SUCCEED,
SCAN_PMD_NULL,
+ SCAN_PMD_NONE,
+ SCAN_PMD_MAPPED,
SCAN_EXCEED_NONE_PTE,
SCAN_EXCEED_SWAP_PTE,
SCAN_EXCEED_SHARED_PTE,
SCAN_PTE_NON_PRESENT,
SCAN_PTE_UFFD_WP,
+ SCAN_PTE_MAPPED_HUGEPAGE,
SCAN_PAGE_RO,
SCAN_LACK_REFERENCED_PAGE,
SCAN_PAGE_NULL,
@@ -73,6 +77,8 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
* default collapse hugepages if there is at least one pte mapped like
* it would have happened if the vma was large enough during page
* fault.
+ *
+ * Note that these are only respected if collapse was initiated by khugepaged.
*/
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
@@ -85,18 +91,24 @@ static struct kmem_cache *mm_slot_cache __read_mostly;
#define MAX_PTE_MAPPED_THP 8
+struct collapse_control {
+ bool is_khugepaged;
+
+ /* Num pages scanned per node */
+ u32 node_load[MAX_NUMNODES];
+
+ /* nodemask for allocation fallback */
+ nodemask_t alloc_nmask;
+};
+
/**
- * struct mm_slot - hash lookup from mm to mm_slot
- * @hash: hash collision list
- * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
- * @mm: the mm that this information is valid for
+ * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
+ * @slot: hash lookup from mm to mm_slot
* @nr_pte_mapped_thp: number of pte mapped THP
* @pte_mapped_thp: address array corresponding pte mapped THP
*/
-struct mm_slot {
- struct hlist_node hash;
- struct list_head mm_node;
- struct mm_struct *mm;
+struct khugepaged_mm_slot {
+ struct mm_slot slot;
/* pte-mapped THP in this mm */
int nr_pte_mapped_thp;
@@ -113,7 +125,7 @@ struct mm_slot {
*/
struct khugepaged_scan {
struct list_head mm_head;
- struct mm_slot *mm_slot;
+ struct khugepaged_mm_slot *mm_slot;
unsigned long address;
};
@@ -377,8 +389,9 @@ int hugepage_madvise(struct vm_area_struct *vma,
int __init khugepaged_init(void)
{
mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
- sizeof(struct mm_slot),
- __alignof__(struct mm_slot), 0, NULL);
+ sizeof(struct khugepaged_mm_slot),
+ __alignof__(struct khugepaged_mm_slot),
+ 0, NULL);
if (!mm_slot_cache)
return -ENOMEM;
@@ -395,65 +408,38 @@ void __init khugepaged_destroy(void)
kmem_cache_destroy(mm_slot_cache);
}
-static inline struct mm_slot *alloc_mm_slot(void)
-{
- if (!mm_slot_cache) /* initialization failed */
- return NULL;
- return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
-}
-
-static inline void free_mm_slot(struct mm_slot *mm_slot)
-{
- kmem_cache_free(mm_slot_cache, mm_slot);
-}
-
-static struct mm_slot *get_mm_slot(struct mm_struct *mm)
-{
- struct mm_slot *mm_slot;
-
- hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
- if (mm == mm_slot->mm)
- return mm_slot;
-
- return NULL;
-}
-
-static void insert_to_mm_slots_hash(struct mm_struct *mm,
- struct mm_slot *mm_slot)
-{
- mm_slot->mm = mm;
- hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
-}
-
-static inline int khugepaged_test_exit(struct mm_struct *mm)
+static inline int hpage_collapse_test_exit(struct mm_struct *mm)
{
return atomic_read(&mm->mm_users) == 0;
}
void __khugepaged_enter(struct mm_struct *mm)
{
- struct mm_slot *mm_slot;
+ struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *slot;
int wakeup;
- mm_slot = alloc_mm_slot();
+ mm_slot = mm_slot_alloc(mm_slot_cache);
if (!mm_slot)
return;
+ slot = &mm_slot->slot;
+
/* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
+ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
- free_mm_slot(mm_slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
return;
}
spin_lock(&khugepaged_mm_lock);
- insert_to_mm_slots_hash(mm, mm_slot);
+ mm_slot_insert(mm_slots_hash, mm, slot);
/*
* Insert just behind the scanning cursor, to let the area settle
* down a little.
*/
wakeup = list_empty(&khugepaged_scan.mm_head);
- list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
+ list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
spin_unlock(&khugepaged_mm_lock);
mmgrab(mm);
@@ -466,37 +452,38 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
{
if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
hugepage_flags_enabled()) {
- if (hugepage_vma_check(vma, vm_flags, false, false))
+ if (hugepage_vma_check(vma, vm_flags, false, false, true))
__khugepaged_enter(vma->vm_mm);
}
}
void __khugepaged_exit(struct mm_struct *mm)
{
- struct mm_slot *mm_slot;
+ struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *slot;
int free = 0;
spin_lock(&khugepaged_mm_lock);
- mm_slot = get_mm_slot(mm);
+ slot = mm_slot_lookup(mm_slots_hash, mm);
+ mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
- hash_del(&mm_slot->hash);
- list_del(&mm_slot->mm_node);
+ hash_del(&slot->hash);
+ list_del(&slot->mm_node);
free = 1;
}
spin_unlock(&khugepaged_mm_lock);
if (free) {
clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
- free_mm_slot(mm_slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
mmdrop(mm);
} else if (mm_slot) {
/*
* This is required to serialize against
- * khugepaged_test_exit() (which is guaranteed to run
- * under mmap sem read mode). Stop here (after we
- * return all pagetables will be destroyed) until
- * khugepaged has finished working on the pagetables
- * under the mmap_lock.
+ * hpage_collapse_test_exit() (which is guaranteed to run
+ * under mmap sem read mode). Stop here (after we return all
+ * pagetables will be destroyed) until khugepaged has finished
+ * working on the pagetables under the mmap_lock.
*/
mmap_write_lock(mm);
mmap_write_unlock(mm);
@@ -546,11 +533,12 @@ static bool is_refcount_suitable(struct page *page)
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte,
+ struct collapse_control *cc,
struct list_head *compound_pagelist)
{
struct page *page = NULL;
pte_t *_pte;
- int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
+ int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
bool writable = false;
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
@@ -558,8 +546,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
pte_t pteval = *_pte;
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
+ ++none_or_zero;
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none) {
+ (!cc->is_khugepaged ||
+ none_or_zero <= khugepaged_max_ptes_none)) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
@@ -579,11 +569,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
VM_BUG_ON_PAGE(!PageAnon(page), page);
- if (page_mapcount(page) > 1 &&
- ++shared > khugepaged_max_ptes_shared) {
- result = SCAN_EXCEED_SHARED_PTE;
- count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
- goto out;
+ if (page_mapcount(page) > 1) {
+ ++shared;
+ if (cc->is_khugepaged &&
+ shared > khugepaged_max_ptes_shared) {
+ result = SCAN_EXCEED_SHARED_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+ goto out;
+ }
}
if (PageCompound(page)) {
@@ -646,10 +639,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (PageCompound(page))
list_add_tail(&page->lru, compound_pagelist);
next:
- /* There should be enough young pte to collapse the page */
- if (pte_young(pteval) ||
- page_is_young(page) || PageReferenced(page) ||
- mmu_notifier_test_young(vma->vm_mm, address))
+ /*
+ * If collapse was initiated by khugepaged, check that there is
+ * enough young pte to justify collapsing the page
+ */
+ if (cc->is_khugepaged &&
+ (pte_young(pteval) || page_is_young(page) ||
+ PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+ address)))
referenced++;
if (pte_write(pteval))
@@ -658,19 +655,19 @@ next:
if (unlikely(!writable)) {
result = SCAN_PAGE_RO;
- } else if (unlikely(!referenced)) {
+ } else if (unlikely(cc->is_khugepaged && !referenced)) {
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
referenced, writable, result);
- return 1;
+ return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
referenced, writable, result);
- return 0;
+ return result;
}
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
@@ -730,14 +727,16 @@ static void khugepaged_alloc_sleep(void)
DEFINE_WAIT(wait);
add_wait_queue(&khugepaged_wait, &wait);
- freezable_schedule_timeout_interruptible(
- msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
remove_wait_queue(&khugepaged_wait, &wait);
}
-static int khugepaged_node_load[MAX_NUMNODES];
+struct collapse_control khugepaged_collapse_control = {
+ .is_khugepaged = true,
+};
-static bool khugepaged_scan_abort(int nid)
+static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
{
int i;
@@ -749,11 +748,11 @@ static bool khugepaged_scan_abort(int nid)
return false;
/* If there is a count for this node already, it must be acceptable */
- if (khugepaged_node_load[nid])
+ if (cc->node_load[nid])
return false;
for (i = 0; i < MAX_NUMNODES; i++) {
- if (!khugepaged_node_load[i])
+ if (!cc->node_load[i])
continue;
if (node_distance(nid, i) > node_reclaim_distance)
return true;
@@ -772,146 +771,59 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
}
#ifdef CONFIG_NUMA
-static int khugepaged_find_target_node(void)
+static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
- static int last_khugepaged_target_node = NUMA_NO_NODE;
int nid, target_node = 0, max_value = 0;
/* find first node with max normal pages hit */
for (nid = 0; nid < MAX_NUMNODES; nid++)
- if (khugepaged_node_load[nid] > max_value) {
- max_value = khugepaged_node_load[nid];
+ if (cc->node_load[nid] > max_value) {
+ max_value = cc->node_load[nid];
target_node = nid;
}
- /* do some balance if several nodes have the same hit record */
- if (target_node <= last_khugepaged_target_node)
- for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
- nid++)
- if (max_value == khugepaged_node_load[nid]) {
- target_node = nid;
- break;
- }
+ for_each_online_node(nid) {
+ if (max_value == cc->node_load[nid])
+ node_set(nid, cc->alloc_nmask);
+ }
- last_khugepaged_target_node = target_node;
return target_node;
}
-
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+#else
+static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
- if (IS_ERR(*hpage)) {
- if (!*wait)
- return false;
-
- *wait = false;
- *hpage = NULL;
- khugepaged_alloc_sleep();
- } else if (*hpage) {
- put_page(*hpage);
- *hpage = NULL;
- }
-
- return true;
+ return 0;
}
+#endif
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
+static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
+ nodemask_t *nmask)
{
- VM_BUG_ON_PAGE(*hpage, *hpage);
-
- *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
+ *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask);
if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- *hpage = ERR_PTR(-ENOMEM);
- return NULL;
+ return false;
}
prep_transhuge_page(*hpage);
count_vm_event(THP_COLLAPSE_ALLOC);
- return *hpage;
-}
-#else
-static int khugepaged_find_target_node(void)
-{
- return 0;
-}
-
-static inline struct page *alloc_khugepaged_hugepage(void)
-{
- struct page *page;
-
- page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
- HPAGE_PMD_ORDER);
- if (page)
- prep_transhuge_page(page);
- return page;
-}
-
-static struct page *khugepaged_alloc_hugepage(bool *wait)
-{
- struct page *hpage;
-
- do {
- hpage = alloc_khugepaged_hugepage();
- if (!hpage) {
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- if (!*wait)
- return NULL;
-
- *wait = false;
- khugepaged_alloc_sleep();
- } else
- count_vm_event(THP_COLLAPSE_ALLOC);
- } while (unlikely(!hpage) && likely(hugepage_flags_enabled()));
-
- return hpage;
-}
-
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
-{
- /*
- * If the hpage allocated earlier was briefly exposed in page cache
- * before collapse_file() failed, it is possible that racing lookups
- * have not yet completed, and would then be unpleasantly surprised by
- * finding the hpage reused for the same mapping at a different offset.
- * Just release the previous allocation if there is any danger of that.
- */
- if (*hpage && page_count(*hpage) > 1) {
- put_page(*hpage);
- *hpage = NULL;
- }
-
- if (!*hpage)
- *hpage = khugepaged_alloc_hugepage(wait);
-
- if (unlikely(!*hpage))
- return false;
-
return true;
}
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
-{
- VM_BUG_ON(!*hpage);
-
- return *hpage;
-}
-#endif
-
/*
* If mmap_lock temporarily dropped, revalidate vma
* before taking mmap_lock.
- * Return 0 if succeeds, otherwise return none-zero
- * value (scan code).
+ * Returns enum scan_result value.
*/
static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
- struct vm_area_struct **vmap)
+ bool expect_anon,
+ struct vm_area_struct **vmap,
+ struct collapse_control *cc)
{
struct vm_area_struct *vma;
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(hpage_collapse_test_exit(mm)))
return SCAN_ANY_PROCESS;
*vmap = vma = find_vma(mm, address);
@@ -920,7 +832,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
if (!transhuge_vma_suitable(vma, address))
return SCAN_ADDRESS_RANGE;
- if (!hugepage_vma_check(vma, vma->vm_flags, false, false))
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
+ cc->is_khugepaged))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
@@ -929,23 +842,62 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
* hugepage_vma_check may return true for qualified file
* vmas.
*/
- if (!vma->anon_vma || !vma_is_anonymous(vma))
- return SCAN_VMA_CHECK;
- return 0;
+ if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
+ return SCAN_PAGE_ANON;
+ return SCAN_SUCCEED;
+}
+
+static int find_pmd_or_thp_or_none(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t **pmd)
+{
+ pmd_t pmde;
+
+ *pmd = mm_find_pmd(mm, address);
+ if (!*pmd)
+ return SCAN_PMD_NULL;
+
+ pmde = pmdp_get_lockless(*pmd);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /* See comments in pmd_none_or_trans_huge_or_clear_bad() */
+ barrier();
+#endif
+ if (pmd_none(pmde))
+ return SCAN_PMD_NONE;
+ if (pmd_trans_huge(pmde))
+ return SCAN_PMD_MAPPED;
+ if (pmd_bad(pmde))
+ return SCAN_PMD_NULL;
+ return SCAN_SUCCEED;
+}
+
+static int check_pmd_still_valid(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t *pmd)
+{
+ pmd_t *new_pmd;
+ int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);
+
+ if (result != SCAN_SUCCEED)
+ return result;
+ if (new_pmd != pmd)
+ return SCAN_FAIL;
+ return SCAN_SUCCEED;
}
/*
* Bring missing pages in from swap, to complete THP collapse.
- * Only done if khugepaged_scan_pmd believes it is worthwhile.
+ * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
*
* Called and returns without pte mapped or spinlocks held.
* Note that if false is returned, mmap_lock will be released.
*/
-static bool __collapse_huge_page_swapin(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long haddr, pmd_t *pmd,
- int referenced)
+static int __collapse_huge_page_swapin(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long haddr, pmd_t *pmd,
+ int referenced)
{
int swapped_in = 0;
vm_fault_t ret = 0;
@@ -976,12 +928,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
*/
if (ret & VM_FAULT_RETRY) {
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
+ /* Likely, but not guaranteed, that page lock failed */
+ return SCAN_PAGE_LOCK;
}
if (ret & VM_FAULT_ERROR) {
mmap_read_unlock(mm);
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
+ return SCAN_FAIL;
}
swapped_in++;
}
@@ -991,30 +944,40 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
lru_add_drain();
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
- return true;
+ return SCAN_SUCCEED;
}
-static void collapse_huge_page(struct mm_struct *mm,
- unsigned long address,
- struct page **hpage,
- int node, int referenced, int unmapped)
+static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
+ struct collapse_control *cc)
+{
+ gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
+ GFP_TRANSHUGE);
+ int node = hpage_collapse_find_target_node(cc);
+
+ if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
+ return SCAN_ALLOC_HUGE_PAGE_FAIL;
+ if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp)))
+ return SCAN_CGROUP_CHARGE_FAIL;
+ count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
+ return SCAN_SUCCEED;
+}
+
+static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
+ int referenced, int unmapped,
+ struct collapse_control *cc)
{
LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
pte_t *pte;
pgtable_t pgtable;
- struct page *new_page;
+ struct page *hpage;
spinlock_t *pmd_ptl, *pte_ptl;
- int isolated = 0, result = 0;
+ int result = SCAN_FAIL;
struct vm_area_struct *vma;
struct mmu_notifier_range range;
- gfp_t gfp;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- /* Only allocate from the target node */
- gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
-
/*
* Before allocating the hugepage, release the mmap_lock read lock.
* The allocation can take potentially a long time if it involves
@@ -1022,40 +985,34 @@ static void collapse_huge_page(struct mm_struct *mm,
* that. We will recheck the vma after taking it again in write mode.
*/
mmap_read_unlock(mm);
- new_page = khugepaged_alloc_page(hpage, gfp, node);
- if (!new_page) {
- result = SCAN_ALLOC_HUGE_PAGE_FAIL;
- goto out_nolock;
- }
- if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
- result = SCAN_CGROUP_CHARGE_FAIL;
+ result = alloc_charge_hpage(&hpage, mm, cc);
+ if (result != SCAN_SUCCEED)
goto out_nolock;
- }
- count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
mmap_read_lock(mm);
- result = hugepage_vma_revalidate(mm, address, &vma);
- if (result) {
+ result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+ if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
}
- pmd = mm_find_pmd(mm, address);
- if (!pmd) {
- result = SCAN_PMD_NULL;
+ result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
}
- /*
- * __collapse_huge_page_swapin will return with mmap_lock released
- * when it fails. So we jump out_nolock directly in that case.
- * Continuing to collapse causes inconsistency.
- */
- if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
- pmd, referenced)) {
- goto out_nolock;
+ if (unmapped) {
+ /*
+ * __collapse_huge_page_swapin will return with mmap_lock
+ * released when it fails. So we jump out_nolock directly in
+ * that case. Continuing to collapse causes inconsistency.
+ */
+ result = __collapse_huge_page_swapin(mm, vma, address, pmd,
+ referenced);
+ if (result != SCAN_SUCCEED)
+ goto out_nolock;
}
mmap_read_unlock(mm);
@@ -1065,11 +1022,12 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
mmap_write_lock(mm);
- result = hugepage_vma_revalidate(mm, address, &vma);
- if (result)
+ result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+ if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
- if (mm_find_pmd(mm, address) != pmd)
+ result = check_pmd_still_valid(mm, address, pmd);
+ if (result != SCAN_SUCCEED)
goto out_up_write;
anon_vma_lock_write(vma->anon_vma);
@@ -1083,21 +1041,24 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
- * After this gup_fast can't run anymore. This also removes
- * any huge TLB entry from the CPU so we won't allow
- * huge and small TLB entries for the same virtual address
- * to avoid the risk of CPU bugs in that area.
+ * This removes any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address to
+ * avoid the risk of CPU bugs in that area.
+ *
+ * Parallel fast GUP is fine since fast GUP will back off when
+ * it detects PMD is changed.
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
+ tlb_remove_table_sync_one();
spin_lock(pte_ptl);
- isolated = __collapse_huge_page_isolate(vma, address, pte,
- &compound_pagelist);
+ result = __collapse_huge_page_isolate(vma, address, pte, cc,
+ &compound_pagelist);
spin_unlock(pte_ptl);
- if (unlikely(!isolated)) {
+ if (unlikely(result != SCAN_SUCCEED)) {
pte_unmap(pte);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
@@ -1109,7 +1070,6 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
- result = SCAN_FAIL;
goto out_up_write;
}
@@ -1119,8 +1079,8 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
anon_vma_unlock_write(vma->anon_vma);
- __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
- &compound_pagelist);
+ __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl,
+ &compound_pagelist);
pte_unmap(pte);
/*
* spin_lock() below is not the equivalent of smp_wmb(), but
@@ -1128,42 +1088,43 @@ static void collapse_huge_page(struct mm_struct *mm,
* avoid the copy_huge_page writes to become visible after
* the set_pmd_at() write.
*/
- __SetPageUptodate(new_page);
+ __SetPageUptodate(hpage);
pgtable = pmd_pgtable(_pmd);
- _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+ _pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
- page_add_new_anon_rmap(new_page, vma, address);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ page_add_new_anon_rmap(hpage, vma, address);
+ lru_cache_add_inactive_or_unevictable(hpage, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
spin_unlock(pmd_ptl);
- *hpage = NULL;
+ hpage = NULL;
- khugepaged_pages_collapsed++;
result = SCAN_SUCCEED;
out_up_write:
mmap_write_unlock(mm);
out_nolock:
- if (!IS_ERR_OR_NULL(*hpage))
- mem_cgroup_uncharge(page_folio(*hpage));
- trace_mm_collapse_huge_page(mm, isolated, result);
- return;
+ if (hpage) {
+ mem_cgroup_uncharge(page_folio(hpage));
+ put_page(hpage);
+ }
+ trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
+ return result;
}
-static int khugepaged_scan_pmd(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long address,
- struct page **hpage)
+static int hpage_collapse_scan_pmd(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, bool *mmap_locked,
+ struct collapse_control *cc)
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, result = 0, referenced = 0;
+ int result = SCAN_FAIL, referenced = 0;
int none_or_zero = 0, shared = 0;
struct page *page = NULL;
unsigned long _address;
@@ -1173,19 +1134,20 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- pmd = mm_find_pmd(mm, address);
- if (!pmd) {
- result = SCAN_PMD_NULL;
+ result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ if (result != SCAN_SUCCEED)
goto out;
- }
- memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+ memset(cc->node_load, 0, sizeof(cc->node_load));
+ nodes_clear(cc->alloc_nmask);
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (is_swap_pte(pteval)) {
- if (++unmapped <= khugepaged_max_ptes_swap) {
+ ++unmapped;
+ if (!cc->is_khugepaged ||
+ unmapped <= khugepaged_max_ptes_swap) {
/*
* Always be strict with uffd-wp
* enabled swap entries. Please see
@@ -1203,8 +1165,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
}
}
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ ++none_or_zero;
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none) {
+ (!cc->is_khugepaged ||
+ none_or_zero <= khugepaged_max_ptes_none)) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
@@ -1234,27 +1198,30 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
goto out_unmap;
}
- if (page_mapcount(page) > 1 &&
- ++shared > khugepaged_max_ptes_shared) {
- result = SCAN_EXCEED_SHARED_PTE;
- count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
- goto out_unmap;
+ if (page_mapcount(page) > 1) {
+ ++shared;
+ if (cc->is_khugepaged &&
+ shared > khugepaged_max_ptes_shared) {
+ result = SCAN_EXCEED_SHARED_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+ goto out_unmap;
+ }
}
page = compound_head(page);
/*
* Record which node the original page is from and save this
- * information to khugepaged_node_load[].
+ * information to cc->node_load[].
* Khugepaged will allocate the hugepage from the node that has the
* max hit record.
*/
node = page_to_nid(page);
- if (khugepaged_scan_abort(node)) {
+ if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
goto out_unmap;
}
- khugepaged_node_load[node]++;
+ cc->node_load[node]++;
if (!PageLRU(page)) {
result = SCAN_PAGE_LRU;
goto out_unmap;
@@ -1271,15 +1238,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
/*
* Check if the page has any GUP (or other external) pins.
*
- * Here the check is racy it may see total_mapcount > refcount
- * in some cases.
- * For example, one process with one forked child process.
- * The parent has the PMD split due to MADV_DONTNEED, then
- * the child is trying unmap the whole PMD, but khugepaged
- * may be scanning the parent between the child has
- * PageDoubleMap flag cleared and dec the mapcount. So
- * khugepaged may see total_mapcount > refcount.
- *
+ * Here the check may be racy:
+ * it may see total_mapcount > refcount in some cases.
* But such a case is ephemeral; we could always retry collapse
* later. However it may report a false positive if the page
* has excessive GUP pins (i.e. 512). Anyway the same check
@@ -1289,43 +1249,51 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
- if (pte_young(pteval) ||
- page_is_young(page) || PageReferenced(page) ||
- mmu_notifier_test_young(vma->vm_mm, address))
+
+ /*
+ * If collapse was initiated by khugepaged, check that there is
+ * enough young pte to justify collapsing the page
+ */
+ if (cc->is_khugepaged &&
+ (pte_young(pteval) || page_is_young(page) ||
+ PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+ address)))
referenced++;
}
if (!writable) {
result = SCAN_PAGE_RO;
- } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
+ } else if (cc->is_khugepaged &&
+ (!referenced ||
+ (unmapped && referenced < HPAGE_PMD_NR / 2))) {
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
- ret = 1;
}
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret) {
- node = khugepaged_find_target_node();
+ if (result == SCAN_SUCCEED) {
+ result = collapse_huge_page(mm, address, referenced,
+ unmapped, cc);
/* collapse_huge_page will return with the mmap_lock released */
- collapse_huge_page(mm, address, hpage, node,
- referenced, unmapped);
+ *mmap_locked = false;
}
out:
trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
none_or_zero, result, unmapped);
- return ret;
+ return result;
}
-static void collect_mm_slot(struct mm_slot *mm_slot)
+static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
{
- struct mm_struct *mm = mm_slot->mm;
+ struct mm_slot *slot = &mm_slot->slot;
+ struct mm_struct *mm = slot->mm;
lockdep_assert_held(&khugepaged_mm_lock);
- if (khugepaged_test_exit(mm)) {
+ if (hpage_collapse_test_exit(mm)) {
/* free mm_slot */
- hash_del(&mm_slot->hash);
- list_del(&mm_slot->mm_node);
+ hash_del(&slot->hash);
+ list_del(&slot->mm_node);
/*
* Not strictly needed because the mm exited already.
@@ -1334,7 +1302,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
*/
/* khugepaged_mm_lock actually not necessary for the below */
- free_mm_slot(mm_slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
mmdrop(mm);
}
}
@@ -1343,31 +1311,105 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
/*
* Notify khugepaged that given addr of the mm is pte-mapped THP. Then
* khugepaged should try to collapse the page table.
+ *
+ * Note that the following race exists:
+ * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A,
+ * emptying A's ->pte_mapped_thp[] array.
+ * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and
+ * retract_page_tables() finds a VMA in mm_struct A mapping the same extent
+ * (at virtual address X) and adds an entry (for X) into mm_struct A's
+ * ->pte_mapped_thp[] array.
+ * (3) khugepaged calls hpage_collapse_scan_file() for mm_struct A at X,
+ * sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry
+ * (for X) into mm_struct A's ->pte_mapped_thp[] array.
+ * Thus, it's possible the same address is added multiple times for the same
+ * mm_struct. Should this happen, we'll simply attempt
+ * collapse_pte_mapped_thp() multiple times for the same address, under the same
+ * exclusive mmap_lock, and assuming the first call is successful, subsequent
+ * attempts will return quickly (without grabbing any additional locks) when
+ * a huge pmd is found in find_pmd_or_thp_or_none(). Since this is a cheap
+ * check, and since this is a rare occurrence, the cost of preventing this
+ * "multiple-add" is thought to be more expensive than just handling it, should
+ * it occur.
*/
-static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
unsigned long addr)
{
- struct mm_slot *mm_slot;
+ struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *slot;
+ bool ret = false;
VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
spin_lock(&khugepaged_mm_lock);
- mm_slot = get_mm_slot(mm);
- if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
+ slot = mm_slot_lookup(mm_slots_hash, mm);
+ mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
+ if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) {
mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
+ ret = true;
+ }
spin_unlock(&khugepaged_mm_lock);
+ return ret;
}
+/* hpage must be locked, and mmap_lock must be held in write */
+static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmdp, struct page *hpage)
+{
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = addr,
+ .flags = 0,
+ .pmd = pmdp,
+ };
+
+ VM_BUG_ON(!PageTransHuge(hpage));
+ mmap_assert_write_locked(vma->vm_mm);
+
+ if (do_set_pmd(&vmf, hpage))
+ return SCAN_FAIL;
+
+ get_page(hpage);
+ return SCAN_SUCCEED;
+}
+
+/*
+ * A note about locking:
+ * Trying to take the page table spinlocks would be useless here because those
+ * are only used to synchronize:
+ *
+ * - modifying terminal entries (ones that point to a data page, not to another
+ * page table)
+ * - installing *new* non-terminal entries
+ *
+ * Instead, we need roughly the same kind of protection as free_pgtables() or
+ * mm_take_all_locks() (but only for a single VMA):
+ * The mmap lock together with this VMA's rmap locks covers all paths towards
+ * the page table entries we're messing with here, except for hardware page
+ * table walks and lockless_pages_from_mm().
+ */
static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
- spinlock_t *ptl;
pmd_t pmd;
+ struct mmu_notifier_range range;
mmap_assert_write_locked(mm);
- ptl = pmd_lock(vma->vm_mm, pmdp);
+ if (vma->vm_file)
+ lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
+ /*
+ * All anon_vmas attached to the VMA have the same root and are
+ * therefore locked by the same lock.
+ */
+ if (vma->anon_vma)
+ lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr,
+ addr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
pmd = pmdp_collapse_flush(vma, addr, pmdp);
- spin_unlock(ptl);
+ tlb_remove_table_sync_one();
+ mmu_notifier_invalidate_range_end(&range);
mm_dec_nr_ptes(mm);
page_table_check_pte_clear_range(mm, addr, pmd);
pte_free(mm, pmd_pgtable(pmd));
@@ -1379,52 +1421,102 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
*
* @mm: process address space where collapse happens
* @addr: THP collapse address
+ * @install_pmd: If a huge PMD should be installed
*
* This function checks whether all the PTEs in the PMD are pointing to the
* right THP. If so, retract the page table so the THP can refault in with
- * as pmd-mapped.
+ * as pmd-mapped. Possibly install a huge PMD mapping the THP.
*/
-void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
+int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
+ bool install_pmd)
{
unsigned long haddr = addr & HPAGE_PMD_MASK;
- struct vm_area_struct *vma = find_vma(mm, haddr);
+ struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct page *hpage;
pte_t *start_pte, *pte;
pmd_t *pmd;
spinlock_t *ptl;
- int count = 0;
+ int count = 0, result = SCAN_FAIL;
int i;
+ mmap_assert_write_locked(mm);
+
+ /* Fast check before locking page if already PMD-mapped */
+ result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
+ if (result == SCAN_PMD_MAPPED)
+ return result;
+
if (!vma || !vma->vm_file ||
!range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
- return;
+ return SCAN_VMA_CHECK;
/*
- * This vm_flags may not have VM_HUGEPAGE if the page was not
- * collapsed by this mm. But we can still collapse if the page is
- * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
- * will not fail the vma for missing VM_HUGEPAGE
+ * If we are here, we've succeeded in replacing all the native pages
+ * in the page cache with a single hugepage. If a mm were to fault-in
+ * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
+ * and map it by a PMD, regardless of sysfs THP settings. As such, let's
+ * analogously elide sysfs THP settings here.
*/
- if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false))
- return;
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
+ return SCAN_VMA_CHECK;
+
+ /*
+ * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings
+ * that got written to. Without this, we'd have to also lock the
+ * anon_vma if one exists.
+ */
+ if (vma->anon_vma)
+ return SCAN_VMA_CHECK;
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
if (userfaultfd_wp(vma))
- return;
+ return SCAN_PTE_UFFD_WP;
hpage = find_lock_page(vma->vm_file->f_mapping,
linear_page_index(vma, haddr));
if (!hpage)
- return;
+ return SCAN_PAGE_NULL;
- if (!PageHead(hpage))
+ if (!PageHead(hpage)) {
+ result = SCAN_FAIL;
goto drop_hpage;
+ }
- pmd = mm_find_pmd(mm, haddr);
- if (!pmd)
+ if (compound_order(hpage) != HPAGE_PMD_ORDER) {
+ result = SCAN_PAGE_COMPOUND;
goto drop_hpage;
+ }
+
+ switch (result) {
+ case SCAN_SUCCEED:
+ break;
+ case SCAN_PMD_NONE:
+ /*
+ * In MADV_COLLAPSE path, possible race with khugepaged where
+ * all pte entries have been removed and pmd cleared. If so,
+ * skip all the pte checks and just update the pmd mapping.
+ */
+ goto maybe_install_pmd;
+ default:
+ goto drop_hpage;
+ }
+ /*
+ * We need to lock the mapping so that from here on, only GUP-fast and
+ * hardware page walks can access the parts of the page tables that
+ * we're operating on.
+ * See collapse_and_free_pmd().
+ */
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+
+ /*
+ * This spinlock should be unnecessary: Nobody else should be accessing
+ * the page tables under spinlock protection here, only
+ * lockless_pages_from_mm() and the hardware page walker can access page
+ * tables while all the high-level locks are held in write mode.
+ */
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+ result = SCAN_FAIL;
/* step 1: check all mapped PTEs are to the right huge page */
for (i = 0, addr = haddr, pte = start_pte;
@@ -1436,8 +1528,10 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
continue;
/* page swapped out, abort */
- if (!pte_present(*pte))
+ if (!pte_present(*pte)) {
+ result = SCAN_PTE_NON_PRESENT;
goto abort;
+ }
page = vm_normal_page(vma, addr, *pte);
if (WARN_ON_ONCE(page && is_zone_device_page(page)))
@@ -1472,21 +1566,32 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
}
- /* step 4: collapse pmd */
+ /* step 4: remove pte entries */
collapse_and_free_pmd(mm, vma, haddr, pmd);
+
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+
+maybe_install_pmd:
+ /* step 5: install pmd entry */
+ result = install_pmd
+ ? set_huge_pmd(vma, haddr, pmd, hpage)
+ : SCAN_SUCCEED;
+
drop_hpage:
unlock_page(hpage);
put_page(hpage);
- return;
+ return result;
abort:
pte_unmap_unlock(start_pte, ptl);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
goto drop_hpage;
}
-static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
{
- struct mm_struct *mm = mm_slot->mm;
+ struct mm_slot *slot = &mm_slot->slot;
+ struct mm_struct *mm = slot->mm;
int i;
if (likely(mm_slot->nr_pte_mapped_thp == 0))
@@ -1495,26 +1600,33 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
if (!mmap_write_trylock(mm))
return;
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(hpage_collapse_test_exit(mm)))
goto out;
for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
- collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
+ collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false);
out:
mm_slot->nr_pte_mapped_thp = 0;
mmap_write_unlock(mm);
}
-static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
+static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
+ struct mm_struct *target_mm,
+ unsigned long target_addr, struct page *hpage,
+ struct collapse_control *cc)
{
struct vm_area_struct *vma;
- struct mm_struct *mm;
- unsigned long addr;
- pmd_t *pmd;
+ int target_result = SCAN_FAIL;
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ int result = SCAN_FAIL;
+ struct mm_struct *mm = NULL;
+ unsigned long addr = 0;
+ pmd_t *pmd;
+ bool is_target = false;
+
/*
* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
* got written to. These VMAs are likely not worth investing
@@ -1529,27 +1641,37 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* An alternative would be to drop the check, but check that the page
* table is clear before calling pmdp_collapse_flush() under
* ptl. It has a higher chance to recover THP for the VMA, but
- * has higher cost too.
+ * has higher cost too. It would also probably require locking
+ * the anon_vma.
*/
- if (vma->anon_vma)
- continue;
+ if (vma->anon_vma) {
+ result = SCAN_PAGE_ANON;
+ goto next;
+ }
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- if (addr & ~HPAGE_PMD_MASK)
- continue;
- if (vma->vm_end < addr + HPAGE_PMD_SIZE)
- continue;
+ if (addr & ~HPAGE_PMD_MASK ||
+ vma->vm_end < addr + HPAGE_PMD_SIZE) {
+ result = SCAN_VMA_CHECK;
+ goto next;
+ }
mm = vma->vm_mm;
- pmd = mm_find_pmd(mm, addr);
- if (!pmd)
- continue;
+ is_target = mm == target_mm && addr == target_addr;
+ result = find_pmd_or_thp_or_none(mm, addr, &pmd);
+ if (result != SCAN_SUCCEED)
+ goto next;
/*
* We need exclusive mmap_lock to retract page table.
*
* We use trylock due to lock inversion: we need to acquire
* mmap_lock while holding page lock. Fault path does it in
* reverse order. Trylock is a way to avoid deadlock.
+ *
+ * Also, it's not MADV_COLLAPSE's job to collapse other
+ * mappings - let khugepaged take care of them later.
*/
- if (mmap_write_trylock(mm)) {
+ result = SCAN_PTE_MAPPED_HUGEPAGE;
+ if ((cc->is_khugepaged || is_target) &&
+ mmap_write_trylock(mm)) {
/*
* When a vma is registered with uffd-wp, we can't
* recycle the pmd pgtable because there can be pte
@@ -1558,25 +1680,48 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* it'll always be mapped in small page size for uffd-wp
* registered ranges.
*/
- if (!khugepaged_test_exit(mm) && !userfaultfd_wp(vma))
- collapse_and_free_pmd(mm, vma, addr, pmd);
+ if (hpage_collapse_test_exit(mm)) {
+ result = SCAN_ANY_PROCESS;
+ goto unlock_next;
+ }
+ if (userfaultfd_wp(vma)) {
+ result = SCAN_PTE_UFFD_WP;
+ goto unlock_next;
+ }
+ collapse_and_free_pmd(mm, vma, addr, pmd);
+ if (!cc->is_khugepaged && is_target)
+ result = set_huge_pmd(vma, addr, pmd, hpage);
+ else
+ result = SCAN_SUCCEED;
+
+unlock_next:
mmap_write_unlock(mm);
- } else {
- /* Try again later */
+ goto next;
+ }
+ /*
+ * Calling context will handle target mm/addr. Otherwise, let
+ * khugepaged try again later.
+ */
+ if (!is_target) {
khugepaged_add_pte_mapped_thp(mm, addr);
+ continue;
}
+next:
+ if (is_target)
+ target_result = result;
}
i_mmap_unlock_write(mapping);
+ return target_result;
}
/**
* collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
*
* @mm: process address space where collapse happens
+ * @addr: virtual collapse start address
* @file: file that collapse on
* @start: collapse start address
- * @hpage: new allocated huge page for collapse
- * @node: appointed node the new huge page allocate from
+ * @cc: collapse context and scratchpad
*
* Basic scheme is simple, details are more complex:
* - allocate and lock a new huge page;
@@ -1593,37 +1738,25 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* + restore gaps in the page cache;
* + unlock and free huge page;
*/
-static void collapse_file(struct mm_struct *mm,
- struct file *file, pgoff_t start,
- struct page **hpage, int node)
+static int collapse_file(struct mm_struct *mm, unsigned long addr,
+ struct file *file, pgoff_t start,
+ struct collapse_control *cc)
{
struct address_space *mapping = file->f_mapping;
- gfp_t gfp;
- struct page *new_page;
- pgoff_t index, end = start + HPAGE_PMD_NR;
+ struct page *hpage;
+ pgoff_t index = 0, end = start + HPAGE_PMD_NR;
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
int nr_none = 0, result = SCAN_SUCCEED;
bool is_shmem = shmem_file(file);
- int nr;
+ int nr = 0;
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
- /* Only allocate from the target node */
- gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
-
- new_page = khugepaged_alloc_page(hpage, gfp, node);
- if (!new_page) {
- result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+ result = alloc_charge_hpage(&hpage, mm, cc);
+ if (result != SCAN_SUCCEED)
goto out;
- }
-
- if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
- result = SCAN_CGROUP_CHARGE_FAIL;
- goto out;
- }
- count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
/*
* Ensure we have slots for all the pages in the range. This is
@@ -1641,14 +1774,14 @@ static void collapse_file(struct mm_struct *mm,
}
} while (1);
- __SetPageLocked(new_page);
+ __SetPageLocked(hpage);
if (is_shmem)
- __SetPageSwapBacked(new_page);
- new_page->index = start;
- new_page->mapping = mapping;
+ __SetPageSwapBacked(hpage);
+ hpage->index = start;
+ hpage->mapping = mapping;
/*
- * At this point the new_page is locked and not up-to-date.
+ * At this point the hpage is locked and not up-to-date.
* It's safe to insert it into the page cache, because nobody would
* be able to map it or use it in another way until we unlock it.
*/
@@ -1656,6 +1789,7 @@ static void collapse_file(struct mm_struct *mm,
xas_set(&xas, start);
for (index = start; index < end; index++) {
struct page *page = xas_next(&xas);
+ struct folio *folio;
VM_BUG_ON(index != xas.xa_index);
if (is_shmem) {
@@ -1676,7 +1810,7 @@ static void collapse_file(struct mm_struct *mm,
result = SCAN_FAIL;
goto xa_locked;
}
- xas_store(&xas, new_page);
+ xas_store(&xas, hpage);
nr_none++;
continue;
}
@@ -1684,11 +1818,12 @@ static void collapse_file(struct mm_struct *mm,
if (xa_is_value(page) || !PageUptodate(page)) {
xas_unlock_irq(&xas);
/* swap in or instantiate fallocated page */
- if (shmem_getpage(mapping->host, index, &page,
- SGP_NOALLOC)) {
+ if (shmem_get_folio(mapping->host, index,
+ &folio, SGP_NOALLOC)) {
result = SCAN_FAIL;
goto xa_unlocked;
}
+ page = folio_file_page(folio, index);
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
@@ -1755,19 +1890,28 @@ static void collapse_file(struct mm_struct *mm,
/*
* If file was truncated then extended, or hole-punched, before
* we locked the first page, then a THP might be there already.
+ * This will be discovered on the first iteration.
*/
if (PageTransCompound(page)) {
- result = SCAN_PAGE_COMPOUND;
+ struct page *head = compound_head(page);
+
+ result = compound_order(head) == HPAGE_PMD_ORDER &&
+ head->index == start
+ /* Maybe PMD-mapped */
+ ? SCAN_PTE_MAPPED_HUGEPAGE
+ : SCAN_PAGE_COMPOUND;
goto out_unlock;
}
- if (page_mapping(page) != mapping) {
+ folio = page_folio(page);
+
+ if (folio_mapping(folio) != mapping) {
result = SCAN_TRUNCATED;
goto out_unlock;
}
- if (!is_shmem && (PageDirty(page) ||
- PageWriteback(page))) {
+ if (!is_shmem && (folio_test_dirty(folio) ||
+ folio_test_writeback(folio))) {
/*
* khugepaged only works on read-only fd, so this
* page is dirty because it hasn't been flushed
@@ -1777,20 +1921,20 @@ static void collapse_file(struct mm_struct *mm,
goto out_unlock;
}
- if (isolate_lru_page(page)) {
+ if (folio_isolate_lru(folio)) {
result = SCAN_DEL_PAGE_LRU;
goto out_unlock;
}
- if (page_has_private(page) &&
- !try_to_release_page(page, GFP_KERNEL)) {
+ if (folio_has_private(folio) &&
+ !filemap_release_folio(folio, GFP_KERNEL)) {
result = SCAN_PAGE_HAS_PRIVATE;
- putback_lru_page(page);
+ folio_putback_lru(folio);
goto out_unlock;
}
- if (page_mapped(page))
- try_to_unmap(page_folio(page),
+ if (folio_mapped(folio))
+ try_to_unmap(folio,
TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
xas_lock_irq(&xas);
@@ -1818,19 +1962,19 @@ static void collapse_file(struct mm_struct *mm,
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
- xas_store(&xas, new_page);
+ xas_store(&xas, hpage);
continue;
out_unlock:
unlock_page(page);
put_page(page);
goto xa_unlocked;
}
- nr = thp_nr_pages(new_page);
+ nr = thp_nr_pages(hpage);
if (is_shmem)
- __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
+ __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
else {
- __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
+ __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
filemap_nr_thps_inc(mapping);
/*
* Paired with smp_mb() in do_dentry_open() to ensure
@@ -1841,21 +1985,21 @@ out_unlock:
smp_mb();
if (inode_is_open_for_write(mapping->host)) {
result = SCAN_FAIL;
- __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);
+ __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr);
filemap_nr_thps_dec(mapping);
goto xa_locked;
}
}
if (nr_none) {
- __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
+ __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
/* nr_none is always 0 for non-shmem. */
- __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
+ __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
}
/* Join all the small entries into a single multi-index entry */
xas_set_order(&xas, start, HPAGE_PMD_ORDER);
- xas_store(&xas, new_page);
+ xas_store(&xas, hpage);
xa_locked:
xas_unlock_irq(&xas);
xa_unlocked:
@@ -1869,6 +2013,7 @@ xa_unlocked:
if (result == SCAN_SUCCEED) {
struct page *page, *tmp;
+ struct folio *folio;
/*
* Replacing old pages with new one has succeeded, now we
@@ -1877,11 +2022,11 @@ xa_unlocked:
index = start;
list_for_each_entry_safe(page, tmp, &pagelist, lru) {
while (index < page->index) {
- clear_highpage(new_page + (index % HPAGE_PMD_NR));
+ clear_highpage(hpage + (index % HPAGE_PMD_NR));
index++;
}
- copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
- page);
+ copy_highpage(hpage + (page->index % HPAGE_PMD_NR),
+ page);
list_del(&page->lru);
page->mapping = NULL;
page_ref_unfreeze(page, 1);
@@ -1892,23 +2037,25 @@ xa_unlocked:
index++;
}
while (index < end) {
- clear_highpage(new_page + (index % HPAGE_PMD_NR));
+ clear_highpage(hpage + (index % HPAGE_PMD_NR));
index++;
}
- SetPageUptodate(new_page);
- page_ref_add(new_page, HPAGE_PMD_NR - 1);
+ folio = page_folio(hpage);
+ folio_mark_uptodate(folio);
+ folio_ref_add(folio, HPAGE_PMD_NR - 1);
+
if (is_shmem)
- set_page_dirty(new_page);
- lru_cache_add(new_page);
+ folio_mark_dirty(folio);
+ folio_add_lru(folio);
/*
* Remove pte page tables, so we can re-fault the page as huge.
*/
- retract_page_tables(mapping, start);
- *hpage = NULL;
-
- khugepaged_pages_collapsed++;
+ result = retract_page_tables(mapping, start, mm, addr, hpage,
+ cc);
+ unlock_page(hpage);
+ hpage = NULL;
} else {
struct page *page;
@@ -1947,19 +2094,25 @@ xa_unlocked:
VM_BUG_ON(nr_none);
xas_unlock_irq(&xas);
- new_page->mapping = NULL;
+ hpage->mapping = NULL;
}
- unlock_page(new_page);
+ if (hpage)
+ unlock_page(hpage);
out:
VM_BUG_ON(!list_empty(&pagelist));
- if (!IS_ERR_OR_NULL(*hpage))
- mem_cgroup_uncharge(page_folio(*hpage));
- /* TODO: tracepoints */
+ if (hpage) {
+ mem_cgroup_uncharge(page_folio(hpage));
+ put_page(hpage);
+ }
+
+ trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result);
+ return result;
}
-static void khugepaged_scan_file(struct mm_struct *mm,
- struct file *file, pgoff_t start, struct page **hpage)
+static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
+ struct file *file, pgoff_t start,
+ struct collapse_control *cc)
{
struct page *page = NULL;
struct address_space *mapping = file->f_mapping;
@@ -1970,14 +2123,17 @@ static void khugepaged_scan_file(struct mm_struct *mm,
present = 0;
swap = 0;
- memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+ memset(cc->node_load, 0, sizeof(cc->node_load));
+ nodes_clear(cc->alloc_nmask);
rcu_read_lock();
xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
if (xas_retry(&xas, page))
continue;
if (xa_is_value(page)) {
- if (++swap > khugepaged_max_ptes_swap) {
+ ++swap;
+ if (cc->is_khugepaged &&
+ swap > khugepaged_max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
break;
@@ -1986,20 +2142,32 @@ static void khugepaged_scan_file(struct mm_struct *mm,
}
/*
- * XXX: khugepaged should compact smaller compound pages
+ * TODO: khugepaged should compact smaller compound pages
* into a PMD sized page
*/
if (PageTransCompound(page)) {
- result = SCAN_PAGE_COMPOUND;
+ struct page *head = compound_head(page);
+
+ result = compound_order(head) == HPAGE_PMD_ORDER &&
+ head->index == start
+ /* Maybe PMD-mapped */
+ ? SCAN_PTE_MAPPED_HUGEPAGE
+ : SCAN_PAGE_COMPOUND;
+ /*
+ * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
+ * by the caller won't touch the page cache, and so
+ * it's safe to skip LRU and refcount checks before
+ * returning.
+ */
break;
}
node = page_to_nid(page);
- if (khugepaged_scan_abort(node)) {
+ if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
break;
}
- khugepaged_node_load[node]++;
+ cc->node_load[node]++;
if (!PageLRU(page)) {
result = SCAN_PAGE_LRU;
@@ -2028,54 +2196,67 @@ static void khugepaged_scan_file(struct mm_struct *mm,
rcu_read_unlock();
if (result == SCAN_SUCCEED) {
- if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+ if (cc->is_khugepaged &&
+ present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
} else {
- node = khugepaged_find_target_node();
- collapse_file(mm, file, start, hpage, node);
+ result = collapse_file(mm, addr, file, start, cc);
}
}
- /* TODO: tracepoints */
+ trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result);
+ return result;
}
#else
-static void khugepaged_scan_file(struct mm_struct *mm,
- struct file *file, pgoff_t start, struct page **hpage)
+static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
+ struct file *file, pgoff_t start,
+ struct collapse_control *cc)
{
BUILD_BUG();
}
-static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
+{
+}
+
+static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr)
{
+ return false;
}
#endif
-static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
- struct page **hpage)
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
+ struct collapse_control *cc)
__releases(&khugepaged_mm_lock)
__acquires(&khugepaged_mm_lock)
{
- struct mm_slot *mm_slot;
+ struct vma_iterator vmi;
+ struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *slot;
struct mm_struct *mm;
struct vm_area_struct *vma;
int progress = 0;
VM_BUG_ON(!pages);
lockdep_assert_held(&khugepaged_mm_lock);
+ *result = SCAN_FAIL;
- if (khugepaged_scan.mm_slot)
+ if (khugepaged_scan.mm_slot) {
mm_slot = khugepaged_scan.mm_slot;
- else {
- mm_slot = list_entry(khugepaged_scan.mm_head.next,
+ slot = &mm_slot->slot;
+ } else {
+ slot = list_entry(khugepaged_scan.mm_head.next,
struct mm_slot, mm_node);
+ mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
khugepaged_scan.address = 0;
khugepaged_scan.mm_slot = mm_slot;
}
spin_unlock(&khugepaged_mm_lock);
khugepaged_collapse_pte_mapped_thps(mm_slot);
- mm = mm_slot->mm;
+ mm = slot->mm;
/*
* Don't wait for semaphore (to avoid long wait times). Just move to
* the next mm on the list.
@@ -2083,19 +2264,21 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
vma = NULL;
if (unlikely(!mmap_read_trylock(mm)))
goto breakouterloop_mmap_lock;
- if (likely(!khugepaged_test_exit(mm)))
- vma = find_vma(mm, khugepaged_scan.address);
progress++;
- for (; vma; vma = vma->vm_next) {
+ if (unlikely(hpage_collapse_test_exit(mm)))
+ goto breakouterloop;
+
+ vma_iter_init(&vmi, mm, khugepaged_scan.address);
+ for_each_vma(vmi, vma) {
unsigned long hstart, hend;
cond_resched();
- if (unlikely(khugepaged_test_exit(mm))) {
+ if (unlikely(hpage_collapse_test_exit(mm))) {
progress++;
break;
}
- if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) {
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) {
skip:
progress++;
continue;
@@ -2109,9 +2292,10 @@ skip:
VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
while (khugepaged_scan.address < hend) {
- int ret;
+ bool mmap_locked = true;
+
cond_resched();
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(hpage_collapse_test_exit(mm)))
goto breakouterloop;
VM_BUG_ON(khugepaged_scan.address < hstart ||
@@ -2123,19 +2307,48 @@ skip:
khugepaged_scan.address);
mmap_read_unlock(mm);
- ret = 1;
- khugepaged_scan_file(mm, file, pgoff, hpage);
+ *result = hpage_collapse_scan_file(mm,
+ khugepaged_scan.address,
+ file, pgoff, cc);
+ mmap_locked = false;
fput(file);
} else {
- ret = khugepaged_scan_pmd(mm, vma,
- khugepaged_scan.address,
- hpage);
+ *result = hpage_collapse_scan_pmd(mm, vma,
+ khugepaged_scan.address,
+ &mmap_locked,
+ cc);
+ }
+ switch (*result) {
+ case SCAN_PTE_MAPPED_HUGEPAGE: {
+ pmd_t *pmd;
+
+ *result = find_pmd_or_thp_or_none(mm,
+ khugepaged_scan.address,
+ &pmd);
+ if (*result != SCAN_SUCCEED)
+ break;
+ if (!khugepaged_add_pte_mapped_thp(mm,
+ khugepaged_scan.address))
+ break;
+ } fallthrough;
+ case SCAN_SUCCEED:
+ ++khugepaged_pages_collapsed;
+ break;
+ default:
+ break;
}
+
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
progress += HPAGE_PMD_NR;
- if (ret)
- /* we released mmap_lock so break loop */
+ if (!mmap_locked)
+ /*
+ * We released mmap_lock so break loop. Note
+ * that we drop mmap_lock before all hugepage
+ * allocations, so if allocation fails, we are
+ * guaranteed to break here and report the
+ * correct result back to caller.
+ */
goto breakouterloop_mmap_lock;
if (progress >= pages)
goto breakouterloop;
@@ -2151,16 +2364,17 @@ breakouterloop_mmap_lock:
* Release the current mm_slot if this mm is about to die, or
* if we scanned all vmas of this mm.
*/
- if (khugepaged_test_exit(mm) || !vma) {
+ if (hpage_collapse_test_exit(mm) || !vma) {
/*
* Make sure that if mm_users is reaching zero while
* khugepaged runs here, khugepaged_exit will find
* mm_slot not pointing to the exiting mm.
*/
- if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
- khugepaged_scan.mm_slot = list_entry(
- mm_slot->mm_node.next,
- struct mm_slot, mm_node);
+ if (slot->mm_node.next != &khugepaged_scan.mm_head) {
+ slot = list_entry(slot->mm_node.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.mm_slot =
+ mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
khugepaged_scan.address = 0;
} else {
khugepaged_scan.mm_slot = NULL;
@@ -2185,19 +2399,16 @@ static int khugepaged_wait_event(void)
kthread_should_stop();
}
-static void khugepaged_do_scan(void)
+static void khugepaged_do_scan(struct collapse_control *cc)
{
- struct page *hpage = NULL;
unsigned int progress = 0, pass_through_head = 0;
unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
bool wait = true;
+ int result = SCAN_SUCCEED;
lru_add_drain_all();
- while (progress < pages) {
- if (!khugepaged_prealloc_page(&hpage, &wait))
- break;
-
+ while (true) {
cond_resched();
if (unlikely(kthread_should_stop() || try_to_freeze()))
@@ -2209,14 +2420,25 @@ static void khugepaged_do_scan(void)
if (khugepaged_has_work() &&
pass_through_head < 2)
progress += khugepaged_scan_mm_slot(pages - progress,
- &hpage);
+ &result, cc);
else
progress = pages;
spin_unlock(&khugepaged_mm_lock);
- }
- if (!IS_ERR_OR_NULL(hpage))
- put_page(hpage);
+ if (progress >= pages)
+ break;
+
+ if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
+ /*
+ * If the allocation fails the first time, try to sleep for
+ * a while; if it fails again, cancel the scan.
+ */
+ if (!wait)
+ break;
+ wait = false;
+ khugepaged_alloc_sleep();
+ }
+ }
}
static bool khugepaged_should_wakeup(void)
@@ -2247,13 +2469,13 @@ static void khugepaged_wait_work(void)
static int khugepaged(void *none)
{
- struct mm_slot *mm_slot;
+ struct khugepaged_mm_slot *mm_slot;
set_freezable();
set_user_nice(current, MAX_NICE);
while (!kthread_should_stop()) {
- khugepaged_do_scan();
+ khugepaged_do_scan(&khugepaged_collapse_control);
khugepaged_wait_work();
}
@@ -2352,3 +2574,145 @@ void khugepaged_min_free_kbytes_update(void)
set_recommended_min_free_kbytes();
mutex_unlock(&khugepaged_mutex);
}
+
+bool current_is_khugepaged(void)
+{
+ return kthread_func(current) == khugepaged;
+}
+
+static int madvise_collapse_errno(enum scan_result r)
+{
+ /*
+ * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
+ * actionable feedback to caller, so they may take an appropriate
+ * fallback measure depending on the nature of the failure.
+ */
+ switch (r) {
+ case SCAN_ALLOC_HUGE_PAGE_FAIL:
+ return -ENOMEM;
+ case SCAN_CGROUP_CHARGE_FAIL:
+ return -EBUSY;
+ /* Resource temporarily unavailable - trying again might succeed */
+ case SCAN_PAGE_LOCK:
+ case SCAN_PAGE_LRU:
+ case SCAN_DEL_PAGE_LRU:
+ return -EAGAIN;
+ /*
+ * Other: Trying again likely not to succeed / error intrinsic to
+ * specified memory range. khugepaged likely won't be able to collapse
+ * either.
+ */
+ default:
+ return -EINVAL;
+ }
+}
+
+int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ struct collapse_control *cc;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long hstart, hend, addr;
+ int thps = 0, last_fail = SCAN_FAIL;
+ bool mmap_locked = true;
+
+ BUG_ON(vma->vm_start > start);
+ BUG_ON(vma->vm_end < end);
+
+ *prev = vma;
+
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
+ return -EINVAL;
+
+ cc = kmalloc(sizeof(*cc), GFP_KERNEL);
+ if (!cc)
+ return -ENOMEM;
+ cc->is_khugepaged = false;
+
+ mmgrab(mm);
+ lru_add_drain_all();
+
+ hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = end & HPAGE_PMD_MASK;
+
+ for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
+ int result = SCAN_FAIL;
+
+ if (!mmap_locked) {
+ cond_resched();
+ mmap_read_lock(mm);
+ mmap_locked = true;
+ result = hugepage_vma_revalidate(mm, addr, false, &vma,
+ cc);
+ if (result != SCAN_SUCCEED) {
+ last_fail = result;
+ goto out_nolock;
+ }
+
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ }
+ mmap_assert_locked(mm);
+ memset(cc->node_load, 0, sizeof(cc->node_load));
+ nodes_clear(cc->alloc_nmask);
+ if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+ struct file *file = get_file(vma->vm_file);
+ pgoff_t pgoff = linear_page_index(vma, addr);
+
+ mmap_read_unlock(mm);
+ mmap_locked = false;
+ result = hpage_collapse_scan_file(mm, addr, file, pgoff,
+ cc);
+ fput(file);
+ } else {
+ result = hpage_collapse_scan_pmd(mm, vma, addr,
+ &mmap_locked, cc);
+ }
+ if (!mmap_locked)
+ *prev = NULL; /* Tell caller we dropped mmap_lock */
+
+handle_result:
+ switch (result) {
+ case SCAN_SUCCEED:
+ case SCAN_PMD_MAPPED:
+ ++thps;
+ break;
+ case SCAN_PTE_MAPPED_HUGEPAGE:
+ BUG_ON(mmap_locked);
+ BUG_ON(*prev);
+ mmap_write_lock(mm);
+ result = collapse_pte_mapped_thp(mm, addr, true);
+ mmap_write_unlock(mm);
+ goto handle_result;
+ /* Whitelisted set of results where continuing OK */
+ case SCAN_PMD_NULL:
+ case SCAN_PTE_NON_PRESENT:
+ case SCAN_PTE_UFFD_WP:
+ case SCAN_PAGE_RO:
+ case SCAN_LACK_REFERENCED_PAGE:
+ case SCAN_PAGE_NULL:
+ case SCAN_PAGE_COUNT:
+ case SCAN_PAGE_LOCK:
+ case SCAN_PAGE_COMPOUND:
+ case SCAN_PAGE_LRU:
+ case SCAN_DEL_PAGE_LRU:
+ last_fail = result;
+ break;
+ default:
+ last_fail = result;
+ /* Other error, exit */
+ goto out_maybelock;
+ }
+ }
+
+out_maybelock:
+ /* Caller expects us to hold mmap_lock on return */
+ if (!mmap_locked)
+ mmap_read_lock(mm);
+out_nolock:
+ mmap_assert_locked(mm);
+ mmdrop(mm);
+ kfree(cc);
+
+ return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
+ : madvise_collapse_errno(last_fail);
+}
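Illustrative sketch (not part of this diff): the errno mapping in madvise_collapse_errno() above lets user space tell transient failures (-EAGAIN, -EBUSY) from permanent ones, and madvise_collapse() returns 0 only when every PMD-sized range in [start, end) collapsed. A minimal user-space retry sketch follows; the MADV_COLLAPSE fallback define and the retry count are assumptions for illustration, not taken from the patch.

#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* assumed value; provided by recent uapi headers */
#endif

/*
 * Ask the kernel to collapse [addr, addr + len) into huge pages.
 * Returns 0 only if every PMD-sized range collapsed; transient errors
 * (EAGAIN/EBUSY, per madvise_collapse_errno()) are retried a few times.
 */
static int try_collapse(void *addr, size_t len, int max_retries)
{
	for (int attempt = 0; attempt <= max_retries; attempt++) {
		if (madvise(addr, len, MADV_COLLAPSE) == 0)
			return 0;
		if (errno != EAGAIN && errno != EBUSY)
			break;		/* likely permanent for this range */
	}
	perror("MADV_COLLAPSE");
	return -1;
}

This mirrors the thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) check at the end of madvise_collapse(): a partial collapse still reports the last failure.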
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 1eddc0132f7f..92f670edbf51 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -79,6 +79,7 @@
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/stacktrace.h>
+#include <linux/stackdepot.h>
#include <linux/cache.h>
#include <linux/percpu.h>
#include <linux/memblock.h>
@@ -159,8 +160,7 @@ struct kmemleak_object {
u32 checksum;
/* memory ranges to be scanned inside an object (empty for all) */
struct hlist_head area_list;
- unsigned long trace[MAX_TRACE];
- unsigned int trace_len;
+ depot_stack_handle_t trace_handle;
unsigned long jiffies; /* creation timestamp */
pid_t pid; /* pid of the current task */
char comm[TASK_COMM_LEN]; /* executable name */
@@ -346,19 +346,22 @@ static void print_unreferenced(struct seq_file *seq,
struct kmemleak_object *object)
{
int i;
+ unsigned long *entries;
+ unsigned int nr_entries;
unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
+ nr_entries = stack_depot_fetch(object->trace_handle, &entries);
warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
- object->pointer, object->size);
+ object->pointer, object->size);
warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
- object->comm, object->pid, object->jiffies,
- msecs_age / 1000, msecs_age % 1000);
+ object->comm, object->pid, object->jiffies,
+ msecs_age / 1000, msecs_age % 1000);
hex_dump_object(seq, object);
warn_or_seq_printf(seq, " backtrace:\n");
- for (i = 0; i < object->trace_len; i++) {
- void *ptr = (void *)object->trace[i];
- warn_or_seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
+ for (i = 0; i < nr_entries; i++) {
+ void *ptr = (void *)entries[i];
+ warn_or_seq_printf(seq, " [<%pK>] %pS\n", ptr, ptr);
}
}
@@ -370,15 +373,16 @@ static void print_unreferenced(struct seq_file *seq,
static void dump_object_info(struct kmemleak_object *object)
{
pr_notice("Object 0x%08lx (size %zu):\n",
- object->pointer, object->size);
+ object->pointer, object->size);
pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
- object->comm, object->pid, object->jiffies);
+ object->comm, object->pid, object->jiffies);
pr_notice(" min_count = %d\n", object->min_count);
pr_notice(" count = %d\n", object->count);
pr_notice(" flags = 0x%x\n", object->flags);
pr_notice(" checksum = %u\n", object->checksum);
pr_notice(" backtrace:\n");
- stack_trace_print(object->trace, object->trace_len, 4);
+ if (object->trace_handle)
+ stack_depot_print(object->trace_handle);
}
/*
@@ -591,12 +595,18 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali
return object;
}
-/*
- * Save stack trace to the given array of MAX_TRACE size.
- */
-static int __save_stack_trace(unsigned long *trace)
+static noinline depot_stack_handle_t set_track_prepare(void)
{
- return stack_trace_save(trace, MAX_TRACE, 2);
+ depot_stack_handle_t trace_handle;
+ unsigned long entries[MAX_TRACE];
+ unsigned int nr_entries;
+
+ if (!kmemleak_initialized)
+ return 0;
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
+ trace_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
+
+ return trace_handle;
}
/*
@@ -604,9 +614,8 @@ static int __save_stack_trace(unsigned long *trace)
* memory block and add it to the object_list and object_tree_root (or
* object_phys_tree_root).
*/
-static struct kmemleak_object *__create_object(unsigned long ptr, size_t size,
- int min_count, gfp_t gfp,
- bool is_phys)
+static void __create_object(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp, bool is_phys)
{
unsigned long flags;
struct kmemleak_object *object, *parent;
@@ -618,7 +627,7 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size,
if (!object) {
pr_warn("Cannot allocate a kmemleak_object structure\n");
kmemleak_disable();
- return NULL;
+ return;
}
INIT_LIST_HEAD(&object->object_list);
@@ -654,7 +663,7 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size,
}
/* kernel backtrace */
- object->trace_len = __save_stack_trace(object->trace);
+ object->trace_handle = set_track_prepare();
raw_spin_lock_irqsave(&kmemleak_lock, flags);
@@ -687,32 +696,29 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size,
*/
dump_object_info(parent);
kmem_cache_free(object_cache, object);
- object = NULL;
goto out;
}
}
rb_link_node(&object->rb_node, rb_parent, link);
rb_insert_color(&object->rb_node, is_phys ? &object_phys_tree_root :
&object_tree_root);
-
list_add_tail_rcu(&object->object_list, &object_list);
out:
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
- return object;
}
/* Create kmemleak object which allocated with virtual address. */
-static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
- int min_count, gfp_t gfp)
+static void create_object(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp)
{
- return __create_object(ptr, size, min_count, gfp, false);
+ __create_object(ptr, size, min_count, gfp, false);
}
/* Create kmemleak object which allocated with physical address. */
-static struct kmemleak_object *create_object_phys(unsigned long ptr, size_t size,
- int min_count, gfp_t gfp)
+static void create_object_phys(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp)
{
- return __create_object(ptr, size, min_count, gfp, true);
+ __create_object(ptr, size, min_count, gfp, true);
}
/*
@@ -1094,7 +1100,7 @@ void __ref kmemleak_update_trace(const void *ptr)
}
raw_spin_lock_irqsave(&object->lock, flags);
- object->trace_len = __save_stack_trace(object->trace);
+ object->trace_handle = set_track_prepare();
raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
@@ -1464,6 +1470,27 @@ static void scan_gray_list(void)
}
/*
+ * Conditionally call cond_resched() in an object iteration loop while making
+ * sure that the given object won't go away while the RCU read lock is dropped,
+ * by taking a reference with get_object() if !pinned.
+ *
+ * Return: false if cond_resched() could not be called because get_object()
+ * failed, true otherwise
+ */
+static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned)
+{
+ if (!pinned && !get_object(object))
+ return false;
+
+ rcu_read_unlock();
+ cond_resched();
+ rcu_read_lock();
+ if (!pinned)
+ put_object(object);
+ return true;
+}
+
+/*
* Scan data sections and all the referenced memory blocks allocated via the
* kernel's standard allocators. This function must be called with the
* scan_mutex held.
@@ -1474,7 +1501,7 @@ static void kmemleak_scan(void)
struct zone *zone;
int __maybe_unused i;
int new_leaks = 0;
- int loop1_cnt = 0;
+ int loop_cnt = 0;
jiffies_last_scan = jiffies;
@@ -1483,7 +1510,6 @@ static void kmemleak_scan(void)
list_for_each_entry_rcu(object, &object_list, object_list) {
bool obj_pinned = false;
- loop1_cnt++;
raw_spin_lock_irq(&object->lock);
#ifdef DEBUG
/*
@@ -1517,24 +1543,11 @@ static void kmemleak_scan(void)
raw_spin_unlock_irq(&object->lock);
/*
- * Do a cond_resched() to avoid soft lockup every 64k objects.
- * Make sure a reference has been taken so that the object
- * won't go away without RCU read lock.
+ * Do a cond_resched() every 64k objects to avoid soft lockup.
*/
- if (!(loop1_cnt & 0xffff)) {
- if (!obj_pinned && !get_object(object)) {
- /* Try the next object instead */
- loop1_cnt--;
- continue;
- }
-
- rcu_read_unlock();
- cond_resched();
- rcu_read_lock();
-
- if (!obj_pinned)
- put_object(object);
- }
+ if (!(++loop_cnt & 0xffff) &&
+ !kmemleak_cond_resched(object, obj_pinned))
+ loop_cnt--; /* Try again on next object */
}
rcu_read_unlock();
@@ -1601,8 +1614,16 @@ static void kmemleak_scan(void)
* scan and color them gray until the next scan.
*/
rcu_read_lock();
+ loop_cnt = 0;
list_for_each_entry_rcu(object, &object_list, object_list) {
/*
+ * Do a cond_resched() every 64k objects to avoid soft lockup.
+ */
+ if (!(++loop_cnt & 0xffff) &&
+ !kmemleak_cond_resched(object, false))
+ loop_cnt--; /* Try again on next object */
+
+ /*
* This is racy but we can save the overhead of lock/unlock
* calls. The missed objects, if any, should be caught in
* the next scan.
@@ -1635,8 +1656,16 @@ static void kmemleak_scan(void)
* Scanning result reporting.
*/
rcu_read_lock();
+ loop_cnt = 0;
list_for_each_entry_rcu(object, &object_list, object_list) {
/*
+ * Do a cond_resched() every 64k objects to avoid soft lockup.
+ */
+ if (!(++loop_cnt & 0xffff) &&
+ !kmemleak_cond_resched(object, false))
+ loop_cnt--; /* Try again on next object */
+
+ /*
* This is racy but we can save the overhead of lock/unlock
* calls. The missed objects, if any, should be caught in
* the next scan.
@@ -2064,6 +2093,7 @@ void __init kmemleak_init(void)
if (kmemleak_error)
return;
+ stack_depot_init();
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
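Illustrative sketch (not part of this diff): the kmemleak conversion above follows the usual stack depot round trip — capture a trace with stack_trace_save(), deduplicate it into a depot_stack_handle_t with stack_depot_save(), and fetch it back with stack_depot_fetch() when reporting. A minimal kernel-style sketch of that pattern, assuming stack_depot_init() has already run (as kmemleak_init() now does); the demo_* names and DEMO_STACK_DEPTH are made up for illustration.

#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/stackdepot.h>
#include <linux/stacktrace.h>

#define DEMO_STACK_DEPTH 16

/* Record the current call chain as a compact, deduplicated depot handle. */
static depot_stack_handle_t demo_save_trace(gfp_t gfp)
{
	unsigned long entries[DEMO_STACK_DEPTH];
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	return stack_depot_save(entries, nr_entries, gfp);
}

/* Print a previously recorded trace from its handle. */
static void demo_print_trace(depot_stack_handle_t handle)
{
	unsigned long *entries;
	unsigned int nr_entries;

	if (!handle)
		return;
	nr_entries = stack_depot_fetch(handle, &entries);
	stack_trace_print(entries, nr_entries, 0);
}

A zero handle means the trace could not be stored (e.g. allocation failure), which is why both kmemleak above and KMSAN below guard on the handle before using it.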
diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile
new file mode 100644
index 000000000000..98eab2856626
--- /dev/null
+++ b/mm/kmsan/Makefile
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for KernelMemorySanitizer (KMSAN).
+#
+#
+obj-y := core.o instrumentation.o init.o hooks.o report.o shadow.o
+
+KMSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+UBSAN_SANITIZE := n
+
+# Disable instrumentation of KMSAN runtime with other tools.
+CC_FLAGS_KMSAN_RUNTIME := -fno-stack-protector
+CC_FLAGS_KMSAN_RUNTIME += $(call cc-option,-fno-conserve-stack)
+CC_FLAGS_KMSAN_RUNTIME += -DDISABLE_BRANCH_PROFILING
+
+CFLAGS_REMOVE.o = $(CC_FLAGS_FTRACE)
+
+CFLAGS_core.o := $(CC_FLAGS_KMSAN_RUNTIME)
+CFLAGS_hooks.o := $(CC_FLAGS_KMSAN_RUNTIME)
+CFLAGS_init.o := $(CC_FLAGS_KMSAN_RUNTIME)
+CFLAGS_instrumentation.o := $(CC_FLAGS_KMSAN_RUNTIME)
+CFLAGS_report.o := $(CC_FLAGS_KMSAN_RUNTIME)
+CFLAGS_shadow.o := $(CC_FLAGS_KMSAN_RUNTIME)
+
+obj-$(CONFIG_KMSAN_KUNIT_TEST) += kmsan_test.o
+KMSAN_SANITIZE_kmsan_test.o := y
+CFLAGS_kmsan_test.o += $(call cc-disable-warning, uninitialized)
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
new file mode 100644
index 000000000000..112dce135c7f
--- /dev/null
+++ b/mm/kmsan/core.c
@@ -0,0 +1,450 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN runtime library.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include <asm/page.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+#include <linux/highmem.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/kmsan_types.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/mmzone.h>
+#include <linux/percpu-defs.h>
+#include <linux/preempt.h>
+#include <linux/slab.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include "../slab.h"
+#include "kmsan.h"
+
+bool kmsan_enabled __read_mostly;
+
+/*
+ * Per-CPU KMSAN context to be used in interrupts, where current->kmsan is
+ * unavailable.
+ */
+DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx);
+
+void kmsan_internal_task_create(struct task_struct *task)
+{
+ struct kmsan_ctx *ctx = &task->kmsan_ctx;
+ struct thread_info *info = current_thread_info();
+
+ __memset(ctx, 0, sizeof(*ctx));
+ ctx->allow_reporting = true;
+ kmsan_internal_unpoison_memory(info, sizeof(*info), false);
+}
+
+void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags,
+ unsigned int poison_flags)
+{
+ u32 extra_bits =
+ kmsan_extra_bits(/*depth*/ 0, poison_flags & KMSAN_POISON_FREE);
+ bool checked = poison_flags & KMSAN_POISON_CHECK;
+ depot_stack_handle_t handle;
+
+ handle = kmsan_save_stack_with_flags(flags, extra_bits);
+ kmsan_internal_set_shadow_origin(address, size, -1, handle, checked);
+}
+
+void kmsan_internal_unpoison_memory(void *address, size_t size, bool checked)
+{
+ kmsan_internal_set_shadow_origin(address, size, 0, 0, checked);
+}
+
+depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
+ unsigned int extra)
+{
+ unsigned long entries[KMSAN_STACK_DEPTH];
+ unsigned int nr_entries;
+
+ nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0);
+
+ /* Don't sleep (see might_sleep_if() in __alloc_pages_nodemask()). */
+ flags &= ~__GFP_DIRECT_RECLAIM;
+
+ return __stack_depot_save(entries, nr_entries, extra, flags, true);
+}
+
+/* Copy the metadata following the memmove() behavior. */
+void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n)
+{
+ depot_stack_handle_t old_origin = 0, new_origin = 0;
+ int src_slots, dst_slots, i, iter, step, skip_bits;
+ depot_stack_handle_t *origin_src, *origin_dst;
+ void *shadow_src, *shadow_dst;
+ u32 *align_shadow_src, shadow;
+ bool backwards;
+
+ shadow_dst = kmsan_get_metadata(dst, KMSAN_META_SHADOW);
+ if (!shadow_dst)
+ return;
+ KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(dst, n));
+
+ shadow_src = kmsan_get_metadata(src, KMSAN_META_SHADOW);
+ if (!shadow_src) {
+ /*
+ * @src is untracked: zero out destination shadow, ignore the
+ * origins, we're done.
+ */
+ __memset(shadow_dst, 0, n);
+ return;
+ }
+ KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(src, n));
+
+ __memmove(shadow_dst, shadow_src, n);
+
+ origin_dst = kmsan_get_metadata(dst, KMSAN_META_ORIGIN);
+ origin_src = kmsan_get_metadata(src, KMSAN_META_ORIGIN);
+ KMSAN_WARN_ON(!origin_dst || !origin_src);
+ src_slots = (ALIGN((u64)src + n, KMSAN_ORIGIN_SIZE) -
+ ALIGN_DOWN((u64)src, KMSAN_ORIGIN_SIZE)) /
+ KMSAN_ORIGIN_SIZE;
+ dst_slots = (ALIGN((u64)dst + n, KMSAN_ORIGIN_SIZE) -
+ ALIGN_DOWN((u64)dst, KMSAN_ORIGIN_SIZE)) /
+ KMSAN_ORIGIN_SIZE;
+ KMSAN_WARN_ON((src_slots < 1) || (dst_slots < 1));
+ KMSAN_WARN_ON((src_slots - dst_slots > 1) ||
+ (dst_slots - src_slots < -1));
+
+ backwards = dst > src;
+ i = backwards ? min(src_slots, dst_slots) - 1 : 0;
+ iter = backwards ? -1 : 1;
+
+ align_shadow_src =
+ (u32 *)ALIGN_DOWN((u64)shadow_src, KMSAN_ORIGIN_SIZE);
+ for (step = 0; step < min(src_slots, dst_slots); step++, i += iter) {
+ KMSAN_WARN_ON(i < 0);
+ shadow = align_shadow_src[i];
+ if (i == 0) {
+ /*
+ * If @src isn't aligned on KMSAN_ORIGIN_SIZE, don't
+ * look at the first @src % KMSAN_ORIGIN_SIZE bytes
+ * of the first shadow slot.
+ */
+ skip_bits = ((u64)src % KMSAN_ORIGIN_SIZE) * 8;
+ shadow = (shadow >> skip_bits) << skip_bits;
+ }
+ if (i == src_slots - 1) {
+ /*
+ * If @src + n isn't aligned on
+ * KMSAN_ORIGIN_SIZE, don't look at the last
+ * (@src + n) % KMSAN_ORIGIN_SIZE bytes of the
+ * last shadow slot.
+ */
+ skip_bits = (((u64)src + n) % KMSAN_ORIGIN_SIZE) * 8;
+ shadow = (shadow << skip_bits) >> skip_bits;
+ }
+ /*
+ * Overwrite the origin only if the corresponding
+ * shadow is nonempty.
+ */
+ if (origin_src[i] && (origin_src[i] != old_origin) && shadow) {
+ old_origin = origin_src[i];
+ new_origin = kmsan_internal_chain_origin(old_origin);
+ /*
+ * kmsan_internal_chain_origin() may return
+ * NULL, but we don't want to lose the previous
+ * origin value.
+ */
+ if (!new_origin)
+ new_origin = old_origin;
+ }
+ if (shadow)
+ origin_dst[i] = new_origin;
+ else
+ origin_dst[i] = 0;
+ }
+ /*
+ * If dst_slots is greater than src_slots (i.e.
+ * dst_slots == src_slots + 1), there is an extra origin slot at the
+ * beginning or end of the destination buffer, for which we take the
+ * origin from the previous slot.
+ * This is only done if the part of the source shadow corresponding to
+ * that slot is non-zero.
+ *
+ * E.g. if we copy 8 aligned bytes that are marked as uninitialized
+ * and have origins o111 and o222, to an unaligned buffer with offset 1,
+ * these two origins are copied to three origin slots, so one of them
+ * needs to be duplicated, depending on the copy direction (@backwards)
+ *
+ * src shadow: |uuuu|uuuu|....|
+ * src origin: |o111|o222|....|
+ *
+ * backwards = 0:
+ * dst shadow: |.uuu|uuuu|u...|
+ * dst origin: |....|o111|o222| - fill the empty slot with o111
+ * backwards = 1:
+ * dst shadow: |.uuu|uuuu|u...|
+ * dst origin: |o111|o222|....| - fill the empty slot with o222
+ */
+ if (src_slots < dst_slots) {
+ if (backwards) {
+ shadow = align_shadow_src[src_slots - 1];
+ skip_bits = (((u64)dst + n) % KMSAN_ORIGIN_SIZE) * 8;
+ shadow = (shadow << skip_bits) >> skip_bits;
+ if (shadow)
+ /* src_slots > 0, therefore dst_slots is at least 2 */
+ origin_dst[dst_slots - 1] =
+ origin_dst[dst_slots - 2];
+ } else {
+ shadow = align_shadow_src[0];
+ skip_bits = ((u64)dst % KMSAN_ORIGIN_SIZE) * 8;
+ shadow = (shadow >> skip_bits) << skip_bits;
+ if (shadow)
+ origin_dst[0] = origin_dst[1];
+ }
+ }
+}
+
+depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id)
+{
+ unsigned long entries[3];
+ u32 extra_bits;
+ int depth;
+ bool uaf;
+
+ if (!id)
+ return id;
+ /*
+ * Make sure we have enough spare bits in @id to hold the UAF bit and
+ * the chain depth.
+ */
+ BUILD_BUG_ON(
+ (1 << STACK_DEPOT_EXTRA_BITS) <= (KMSAN_MAX_ORIGIN_DEPTH << 1));
+
+ extra_bits = stack_depot_get_extra_bits(id);
+ depth = kmsan_depth_from_eb(extra_bits);
+ uaf = kmsan_uaf_from_eb(extra_bits);
+
+ /*
+ * Stop chaining origins once the depth reaches KMSAN_MAX_ORIGIN_DEPTH.
+ * This mostly happens when structures with uninitialized padding are
+ * copied around many times. Origin chains for such structures are
+ * usually periodic, and it does not make sense to fully store them.
+ */
+ if (depth == KMSAN_MAX_ORIGIN_DEPTH)
+ return id;
+
+ depth++;
+ extra_bits = kmsan_extra_bits(depth, uaf);
+
+ entries[0] = KMSAN_CHAIN_MAGIC_ORIGIN;
+ entries[1] = kmsan_save_stack_with_flags(GFP_ATOMIC, 0);
+ entries[2] = id;
+ /*
+ * @entries is a local var in non-instrumented code, so KMSAN does not
+ * know it is initialized. Explicitly unpoison it to avoid false
+ * positives when __stack_depot_save() passes it to instrumented code.
+ */
+ kmsan_internal_unpoison_memory(entries, sizeof(entries), false);
+ return __stack_depot_save(entries, ARRAY_SIZE(entries), extra_bits,
+ GFP_ATOMIC, true);
+}
+
+void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
+ u32 origin, bool checked)
+{
+ u64 address = (u64)addr;
+ void *shadow_start;
+ u32 *origin_start;
+ size_t pad = 0;
+
+ KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size));
+ shadow_start = kmsan_get_metadata(addr, KMSAN_META_SHADOW);
+ if (!shadow_start) {
+ /*
+ * kmsan_metadata_is_contiguous() is true, so either all shadow
+ * and origin pages are NULL, or all are non-NULL.
+ */
+ if (checked) {
+ pr_err("%s: not memsetting %ld bytes starting at %px, because the shadow is NULL\n",
+ __func__, size, addr);
+ KMSAN_WARN_ON(true);
+ }
+ return;
+ }
+ __memset(shadow_start, b, size);
+
+ if (!IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) {
+ pad = address % KMSAN_ORIGIN_SIZE;
+ address -= pad;
+ size += pad;
+ }
+ size = ALIGN(size, KMSAN_ORIGIN_SIZE);
+ origin_start =
+ (u32 *)kmsan_get_metadata((void *)address, KMSAN_META_ORIGIN);
+
+ for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++)
+ origin_start[i] = origin;
+}
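+
+/*
+ * For example, setting the metadata for 5 bytes at an address with offset 2
+ * within its 4-byte origin slot gives pad == 2, so the origin range is
+ * extended to 8 bytes starting at the slot boundary and two origin slots are
+ * written, while the shadow memset() still covers exactly the 5 bytes.
+ */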
+
+struct page *kmsan_vmalloc_to_page_or_null(void *vaddr)
+{
+ struct page *page;
+
+ if (!kmsan_internal_is_vmalloc_addr(vaddr) &&
+ !kmsan_internal_is_module_addr(vaddr))
+ return NULL;
+ page = vmalloc_to_page(vaddr);
+ if (pfn_valid(page_to_pfn(page)))
+ return page;
+ else
+ return NULL;
+}
+
+void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr,
+ int reason)
+{
+ depot_stack_handle_t cur_origin = 0, new_origin = 0;
+ unsigned long addr64 = (unsigned long)addr;
+ depot_stack_handle_t *origin = NULL;
+ unsigned char *shadow = NULL;
+ int cur_off_start = -1;
+ int chunk_size;
+ size_t pos = 0;
+
+ if (!size)
+ return;
+ KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size));
+ while (pos < size) {
+ chunk_size = min(size - pos,
+ PAGE_SIZE - ((addr64 + pos) % PAGE_SIZE));
+ shadow = kmsan_get_metadata((void *)(addr64 + pos),
+ KMSAN_META_SHADOW);
+ if (!shadow) {
+ /*
+ * This page is untracked. If there were uninitialized
+ * bytes before, report them.
+ */
+ if (cur_origin) {
+ kmsan_enter_runtime();
+ kmsan_report(cur_origin, addr, size,
+ cur_off_start, pos - 1, user_addr,
+ reason);
+ kmsan_leave_runtime();
+ }
+ cur_origin = 0;
+ cur_off_start = -1;
+ pos += chunk_size;
+ continue;
+ }
+ for (int i = 0; i < chunk_size; i++) {
+ if (!shadow[i]) {
+ /*
+ * This byte is unpoisoned. If there were
+ * poisoned bytes before, report them.
+ */
+ if (cur_origin) {
+ kmsan_enter_runtime();
+ kmsan_report(cur_origin, addr, size,
+ cur_off_start, pos + i - 1,
+ user_addr, reason);
+ kmsan_leave_runtime();
+ }
+ cur_origin = 0;
+ cur_off_start = -1;
+ continue;
+ }
+ origin = kmsan_get_metadata((void *)(addr64 + pos + i),
+ KMSAN_META_ORIGIN);
+ KMSAN_WARN_ON(!origin);
+ new_origin = *origin;
+ /*
+ * Encountered new origin - report the previous
+ * uninitialized range.
+ */
+ if (cur_origin != new_origin) {
+ if (cur_origin) {
+ kmsan_enter_runtime();
+ kmsan_report(cur_origin, addr, size,
+ cur_off_start, pos + i - 1,
+ user_addr, reason);
+ kmsan_leave_runtime();
+ }
+ cur_origin = new_origin;
+ cur_off_start = pos + i;
+ }
+ }
+ pos += chunk_size;
+ }
+ KMSAN_WARN_ON(pos != size);
+ if (cur_origin) {
+ kmsan_enter_runtime();
+ kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1,
+ user_addr, reason);
+ kmsan_leave_runtime();
+ }
+}
+
+bool kmsan_metadata_is_contiguous(void *addr, size_t size)
+{
+ char *cur_shadow = NULL, *next_shadow = NULL, *cur_origin = NULL,
+ *next_origin = NULL;
+ u64 cur_addr = (u64)addr, next_addr = cur_addr + PAGE_SIZE;
+ depot_stack_handle_t *origin_p;
+ bool all_untracked = false;
+
+ if (!size)
+ return true;
+
+ /* The whole range belongs to the same page. */
+ if (ALIGN_DOWN(cur_addr + size - 1, PAGE_SIZE) ==
+ ALIGN_DOWN(cur_addr, PAGE_SIZE))
+ return true;
+
+ cur_shadow = kmsan_get_metadata((void *)cur_addr, /*is_origin*/ false);
+ if (!cur_shadow)
+ all_untracked = true;
+ cur_origin = kmsan_get_metadata((void *)cur_addr, /*is_origin*/ true);
+ if (all_untracked && cur_origin)
+ goto report;
+
+ for (; next_addr < (u64)addr + size;
+ cur_addr = next_addr, cur_shadow = next_shadow,
+ cur_origin = next_origin, next_addr += PAGE_SIZE) {
+ next_shadow = kmsan_get_metadata((void *)next_addr, false);
+ next_origin = kmsan_get_metadata((void *)next_addr, true);
+ if (all_untracked) {
+ if (next_shadow || next_origin)
+ goto report;
+ if (!next_shadow && !next_origin)
+ continue;
+ }
+ if (((u64)cur_shadow == ((u64)next_shadow - PAGE_SIZE)) &&
+ ((u64)cur_origin == ((u64)next_origin - PAGE_SIZE)))
+ continue;
+ goto report;
+ }
+ return true;
+
+report:
+ pr_err("%s: attempting to access two shadow page ranges.\n", __func__);
+ pr_err("Access of size %ld at %px.\n", size, addr);
+ pr_err("Addresses belonging to different ranges: %px and %px\n",
+ (void *)cur_addr, (void *)next_addr);
+ pr_err("page[0].shadow: %px, page[1].shadow: %px\n", cur_shadow,
+ next_shadow);
+ pr_err("page[0].origin: %px, page[1].origin: %px\n", cur_origin,
+ next_origin);
+ origin_p = kmsan_get_metadata(addr, KMSAN_META_ORIGIN);
+ if (origin_p) {
+ pr_err("Origin: %08x\n", *origin_p);
+ kmsan_print_origin(*origin_p);
+ } else {
+ pr_err("Origin: unavailable\n");
+ }
+ return false;
+}
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
new file mode 100644
index 000000000000..3807502766a3
--- /dev/null
+++ b/mm/kmsan/hooks.c
@@ -0,0 +1,385 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN hooks for kernel subsystems.
+ *
+ * These functions handle creation of KMSAN metadata for memory allocations.
+ *
+ * Copyright (C) 2018-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include <linux/cacheflush.h>
+#include <linux/dma-direction.h>
+#include <linux/gfp.h>
+#include <linux/kmsan.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/usb.h>
+
+#include "../internal.h"
+#include "../slab.h"
+#include "kmsan.h"
+
+/*
+ * Instrumented functions shouldn't be called under
+ * kmsan_enter_runtime()/kmsan_leave_runtime(), because this will lead to
+ * skipping effects of functions like memset() inside instrumented code.
+ */
+
+void kmsan_task_create(struct task_struct *task)
+{
+ kmsan_enter_runtime();
+ kmsan_internal_task_create(task);
+ kmsan_leave_runtime();
+}
+
+void kmsan_task_exit(struct task_struct *task)
+{
+ struct kmsan_ctx *ctx = &task->kmsan_ctx;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ ctx->allow_reporting = false;
+}
+
+void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags)
+{
+ if (unlikely(object == NULL))
+ return;
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ /*
+ * There's a ctor or this is an RCU cache - do nothing. The memory
+ * status hasn't changed since last use.
+ */
+ if (s->ctor || (s->flags & SLAB_TYPESAFE_BY_RCU))
+ return;
+
+ kmsan_enter_runtime();
+ if (flags & __GFP_ZERO)
+ kmsan_internal_unpoison_memory(object, s->object_size,
+ KMSAN_POISON_CHECK);
+ else
+ kmsan_internal_poison_memory(object, s->object_size, flags,
+ KMSAN_POISON_CHECK);
+ kmsan_leave_runtime();
+}
+
+void kmsan_slab_free(struct kmem_cache *s, void *object)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ /* RCU slabs could be legally used after free within the RCU period */
+ if (unlikely(s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)))
+ return;
+ /*
+ * If there's a constructor, freed memory must remain in the same state
+ * until the next allocation. We cannot save its state to detect
+ * use-after-free bugs; instead, we just keep it unpoisoned.
+ */
+ if (s->ctor)
+ return;
+ kmsan_enter_runtime();
+ kmsan_internal_poison_memory(object, s->object_size, GFP_KERNEL,
+ KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
+ kmsan_leave_runtime();
+}
+
+void kmsan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
+{
+ if (unlikely(ptr == NULL))
+ return;
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ kmsan_enter_runtime();
+ if (flags & __GFP_ZERO)
+ kmsan_internal_unpoison_memory((void *)ptr, size,
+ /*checked*/ true);
+ else
+ kmsan_internal_poison_memory((void *)ptr, size, flags,
+ KMSAN_POISON_CHECK);
+ kmsan_leave_runtime();
+}
+
+void kmsan_kfree_large(const void *ptr)
+{
+ struct page *page;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ kmsan_enter_runtime();
+ page = virt_to_head_page((void *)ptr);
+ KMSAN_WARN_ON(ptr != page_address(page));
+ kmsan_internal_poison_memory((void *)ptr,
+ PAGE_SIZE << compound_order(page),
+ GFP_KERNEL,
+ KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
+ kmsan_leave_runtime();
+}
+
+static unsigned long vmalloc_shadow(unsigned long addr)
+{
+ return (unsigned long)kmsan_get_metadata((void *)addr,
+ KMSAN_META_SHADOW);
+}
+
+static unsigned long vmalloc_origin(unsigned long addr)
+{
+ return (unsigned long)kmsan_get_metadata((void *)addr,
+ KMSAN_META_ORIGIN);
+}
+
+void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end)
+{
+ __vunmap_range_noflush(vmalloc_shadow(start), vmalloc_shadow(end));
+ __vunmap_range_noflush(vmalloc_origin(start), vmalloc_origin(end));
+ flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end));
+ flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end));
+}
+
+/*
+ * This function creates new shadow/origin pages for the physical pages mapped
+ * into the virtual memory. If those physical pages already had shadow/origin,
+ * those are ignored.
+ */
+void kmsan_ioremap_page_range(unsigned long start, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int page_shift)
+{
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_ZERO;
+ struct page *shadow, *origin;
+ unsigned long off = 0;
+ int nr;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ nr = (end - start) / PAGE_SIZE;
+ kmsan_enter_runtime();
+ for (int i = 0; i < nr; i++, off += PAGE_SIZE) {
+ shadow = alloc_pages(gfp_mask, 1);
+ origin = alloc_pages(gfp_mask, 1);
+ __vmap_pages_range_noflush(
+ vmalloc_shadow(start + off),
+ vmalloc_shadow(start + off + PAGE_SIZE), prot, &shadow,
+ PAGE_SHIFT);
+ __vmap_pages_range_noflush(
+ vmalloc_origin(start + off),
+ vmalloc_origin(start + off + PAGE_SIZE), prot, &origin,
+ PAGE_SHIFT);
+ }
+ flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end));
+ flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end));
+ kmsan_leave_runtime();
+}
+
+void kmsan_iounmap_page_range(unsigned long start, unsigned long end)
+{
+ unsigned long v_shadow, v_origin;
+ struct page *shadow, *origin;
+ int nr;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ nr = (end - start) / PAGE_SIZE;
+ kmsan_enter_runtime();
+ v_shadow = (unsigned long)vmalloc_shadow(start);
+ v_origin = (unsigned long)vmalloc_origin(start);
+ for (int i = 0; i < nr;
+ i++, v_shadow += PAGE_SIZE, v_origin += PAGE_SIZE) {
+ shadow = kmsan_vmalloc_to_page_or_null((void *)v_shadow);
+ origin = kmsan_vmalloc_to_page_or_null((void *)v_origin);
+ __vunmap_range_noflush(v_shadow, vmalloc_shadow(end));
+ __vunmap_range_noflush(v_origin, vmalloc_origin(end));
+ if (shadow)
+ __free_pages(shadow, 1);
+ if (origin)
+ __free_pages(origin, 1);
+ }
+ flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end));
+ flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end));
+ kmsan_leave_runtime();
+}
+
+void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy,
+ size_t left)
+{
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ /*
+ * At this point we've copied the memory already. It's hard to check it
+ * before copying, as the size of the actually copied buffer is unknown.
+ */
+
+ /* copy_to_user() may copy zero bytes. No need to check. */
+ if (!to_copy)
+ return;
+ /* Or maybe copy_to_user() failed to copy anything. */
+ if (to_copy <= left)
+ return;
+
+ ua_flags = user_access_save();
+ if ((u64)to < TASK_SIZE) {
+ /* This is a user memory access, check it. */
+ kmsan_internal_check_memory((void *)from, to_copy - left, to,
+ REASON_COPY_TO_USER);
+ } else {
+ /*
+ * Otherwise this is a kernel memory access. This happens when a
+ * compat syscall passes an argument allocated on the kernel
+ * stack to a real syscall.
+ * Don't check anything, just copy the shadow of the copied
+ * bytes.
+ */
+ kmsan_internal_memmove_metadata((void *)to, (void *)from,
+ to_copy - left);
+ }
+ user_access_restore(ua_flags);
+}
+EXPORT_SYMBOL(kmsan_copy_to_user);
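+
+/*
+ * For example, if copy_to_user() was asked for 16 bytes and returned 4
+ * (@to_copy == 16, @left == 4), only the 12 bytes that actually reached
+ * userspace are checked (or have their metadata copied in the kernel-to-kernel
+ * case).
+ */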
+
+/* Helper function to check an URB. */
+void kmsan_handle_urb(const struct urb *urb, bool is_out)
+{
+ if (!urb)
+ return;
+ if (is_out)
+ kmsan_internal_check_memory(urb->transfer_buffer,
+ urb->transfer_buffer_length,
+ /*user_addr*/ 0, REASON_SUBMIT_URB);
+ else
+ kmsan_internal_unpoison_memory(urb->transfer_buffer,
+ urb->transfer_buffer_length,
+ /*checked*/ false);
+}
+EXPORT_SYMBOL_GPL(kmsan_handle_urb);
+
+static void kmsan_handle_dma_page(const void *addr, size_t size,
+ enum dma_data_direction dir)
+{
+ switch (dir) {
+ case DMA_BIDIRECTIONAL:
+ kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0,
+ REASON_ANY);
+ kmsan_internal_unpoison_memory((void *)addr, size,
+ /*checked*/ false);
+ break;
+ case DMA_TO_DEVICE:
+ kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0,
+ REASON_ANY);
+ break;
+ case DMA_FROM_DEVICE:
+ kmsan_internal_unpoison_memory((void *)addr, size,
+ /*checked*/ false);
+ break;
+ case DMA_NONE:
+ break;
+ }
+}
+
+/* Helper function to handle DMA data transfers. */
+void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
+ enum dma_data_direction dir)
+{
+ u64 page_offset, to_go, addr;
+
+ if (PageHighMem(page))
+ return;
+ addr = (u64)page_address(page) + offset;
+ /*
+ * The kernel may occasionally give us adjacent DMA pages not belonging
+ * to the same allocation. Process them separately to avoid triggering
+ * internal KMSAN checks.
+ */
+ while (size > 0) {
+ page_offset = addr % PAGE_SIZE;
+ to_go = min(PAGE_SIZE - page_offset, (u64)size);
+ kmsan_handle_dma_page((void *)addr, to_go, dir);
+ addr += to_go;
+ size -= to_go;
+ }
+}
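+
+/*
+ * For example (assuming 4 KiB pages), a 6 KiB DMA buffer starting 1 KiB into
+ * a page is processed as two chunks: 3 KiB up to the end of the first page,
+ * then the remaining 3 KiB from the start of the next one, so that each
+ * kmsan_handle_dma_page() call stays within a single page.
+ */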
+
+void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
+ enum dma_data_direction dir)
+{
+ struct scatterlist *item;
+ int i;
+
+ for_each_sg(sg, item, nents, i)
+ kmsan_handle_dma(sg_page(item), item->offset, item->length,
+ dir);
+}
+
+/* Functions from kmsan-checks.h follow. */
+void kmsan_poison_memory(const void *address, size_t size, gfp_t flags)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ kmsan_enter_runtime();
+ /* The users may want to poison/unpoison random memory. */
+ kmsan_internal_poison_memory((void *)address, size, flags,
+ KMSAN_POISON_NOCHECK);
+ kmsan_leave_runtime();
+}
+EXPORT_SYMBOL(kmsan_poison_memory);
+
+void kmsan_unpoison_memory(const void *address, size_t size)
+{
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ ua_flags = user_access_save();
+ kmsan_enter_runtime();
+ /* The users may want to poison/unpoison random memory. */
+ kmsan_internal_unpoison_memory((void *)address, size,
+ KMSAN_POISON_NOCHECK);
+ kmsan_leave_runtime();
+ user_access_restore(ua_flags);
+}
+EXPORT_SYMBOL(kmsan_unpoison_memory);
+
+/*
+ * Version of kmsan_unpoison_memory() that can be called from within the KMSAN
+ * runtime.
+ *
+ * Non-instrumented IRQ entry functions receive struct pt_regs from assembly
+ * code. Those regs need to be unpoisoned, otherwise using them will result in
+ * false positives.
+ * Using kmsan_unpoison_memory() is not an option in entry code, because the
+ * return value of in_task() is inconsistent - as a result, certain calls to
+ * kmsan_unpoison_memory() are ignored. kmsan_unpoison_entry_regs() ensures that
+ * the registers are unpoisoned even if kmsan_in_runtime() is true in the early
+ * entry code.
+ */
+void kmsan_unpoison_entry_regs(const struct pt_regs *regs)
+{
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled)
+ return;
+
+ ua_flags = user_access_save();
+ kmsan_internal_unpoison_memory((void *)regs, sizeof(*regs),
+ KMSAN_POISON_NOCHECK);
+ user_access_restore(ua_flags);
+}
+
+void kmsan_check_memory(const void *addr, size_t size)
+{
+ if (!kmsan_enabled)
+ return;
+ kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0,
+ REASON_ANY);
+}
+EXPORT_SYMBOL(kmsan_check_memory);
diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
new file mode 100644
index 000000000000..7fb794242fad
--- /dev/null
+++ b/mm/kmsan/init.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN initialization routines.
+ *
+ * Copyright (C) 2017-2021 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include "kmsan.h"
+
+#include <asm/sections.h>
+#include <linux/mm.h>
+#include <linux/memblock.h>
+
+#include "../internal.h"
+
+#define NUM_FUTURE_RANGES 128
+struct start_end_pair {
+ u64 start, end;
+};
+
+static struct start_end_pair start_end_pairs[NUM_FUTURE_RANGES] __initdata;
+static int future_index __initdata;
+
+/*
+ * Record a range of memory for which the metadata pages will be created once
+ * the page allocator becomes available.
+ */
+static void __init kmsan_record_future_shadow_range(void *start, void *end)
+{
+ u64 nstart = (u64)start, nend = (u64)end, cstart, cend;
+ bool merged = false;
+
+ KMSAN_WARN_ON(future_index == NUM_FUTURE_RANGES);
+ KMSAN_WARN_ON((nstart >= nend) || !nstart || !nend);
+ nstart = ALIGN_DOWN(nstart, PAGE_SIZE);
+ nend = ALIGN(nend, PAGE_SIZE);
+
+ /*
+ * Scan the existing ranges to see if any of them overlaps with
+ * [start, end). In that case, merge the two ranges instead of
+ * creating a new one.
+ * The number of ranges is less than 20, so there is no need to organize
+ * them into a more intelligent data structure.
+ */
+ for (int i = 0; i < future_index; i++) {
+ cstart = start_end_pairs[i].start;
+ cend = start_end_pairs[i].end;
+ if ((cstart < nstart && cend < nstart) ||
+ (cstart > nend && cend > nend))
+ /* ranges are disjoint - do not merge */
+ continue;
+ start_end_pairs[i].start = min(nstart, cstart);
+ start_end_pairs[i].end = max(nend, cend);
+ merged = true;
+ break;
+ }
+ if (merged)
+ return;
+ start_end_pairs[future_index].start = nstart;
+ start_end_pairs[future_index].end = nend;
+ future_index++;
+}
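+
+/*
+ * E.g. recording the (purely illustrative) ranges [0x1000, 0x3000) and then
+ * [0x2000, 0x5000) leaves a single merged entry [0x1000, 0x5000) in
+ * start_end_pairs[] rather than two overlapping ones.
+ */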
+
+/*
+ * Initialize the shadow for existing mappings during kernel initialization.
+ * These include kernel text/data sections, NODE_DATA and future ranges
+ * registered while creating other data (e.g. percpu).
+ *
+ * Allocations via memblock can be only done before slab is initialized.
+ */
+void __init kmsan_init_shadow(void)
+{
+ const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+ phys_addr_t p_start, p_end;
+ u64 loop;
+ int nid;
+
+ for_each_reserved_mem_range(loop, &p_start, &p_end)
+ kmsan_record_future_shadow_range(phys_to_virt(p_start),
+ phys_to_virt(p_end));
+ /* Allocate shadow for .data */
+ kmsan_record_future_shadow_range(_sdata, _edata);
+
+ for_each_online_node(nid)
+ kmsan_record_future_shadow_range(
+ NODE_DATA(nid), (char *)NODE_DATA(nid) + nd_size);
+
+ for (int i = 0; i < future_index; i++)
+ kmsan_init_alloc_meta_for_range(
+ (void *)start_end_pairs[i].start,
+ (void *)start_end_pairs[i].end);
+}
+
+struct metadata_page_pair {
+ struct page *shadow, *origin;
+};
+static struct metadata_page_pair held_back[MAX_ORDER] __initdata;
+
+/*
+ * Eager metadata allocation. When the memblock allocator is freeing pages to
+ * pagealloc, we use 2/3 of them as metadata for the remaining 1/3.
+ * We store the pointers to the returned blocks of pages in held_back[] grouped
+ * by their order: when kmsan_memblock_free_pages() is called for the first
+ * time with a certain order, it is reserved as a shadow block, for the second
+ * time - as an origin block. On the third time the incoming block receives its
+ * shadow and origin ranges from the previously saved shadow and origin blocks,
+ * after which held_back[order] can be used again.
+ *
+ * At the very end there may be leftover blocks in held_back[]. They are
+ * collected later by kmsan_memblock_discard().
+ */
+bool kmsan_memblock_free_pages(struct page *page, unsigned int order)
+{
+ struct page *shadow, *origin;
+
+ if (!held_back[order].shadow) {
+ held_back[order].shadow = page;
+ return false;
+ }
+ if (!held_back[order].origin) {
+ held_back[order].origin = page;
+ return false;
+ }
+ shadow = held_back[order].shadow;
+ origin = held_back[order].origin;
+ kmsan_setup_meta(page, shadow, origin, order);
+
+ held_back[order].shadow = NULL;
+ held_back[order].origin = NULL;
+ return true;
+}
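+
+/*
+ * For instance, the first order-10 block passed here is held back as a future
+ * shadow block (and false is returned so that it is not released to the page
+ * allocator), the second becomes the origin block, and only the third one is
+ * wired up via kmsan_setup_meta() and released (return true), after which
+ * held_back[10] is empty again and the cycle repeats.
+ */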
+
+#define MAX_BLOCKS 8
+struct smallstack {
+ struct page *items[MAX_BLOCKS];
+ int index;
+ int order;
+};
+
+static struct smallstack collect = {
+ .index = 0,
+ .order = MAX_ORDER,
+};
+
+static void smallstack_push(struct smallstack *stack, struct page *pages)
+{
+ KMSAN_WARN_ON(stack->index == MAX_BLOCKS);
+ stack->items[stack->index] = pages;
+ stack->index++;
+}
+#undef MAX_BLOCKS
+
+static struct page *smallstack_pop(struct smallstack *stack)
+{
+ struct page *ret;
+
+ KMSAN_WARN_ON(stack->index == 0);
+ stack->index--;
+ ret = stack->items[stack->index];
+ stack->items[stack->index] = NULL;
+ return ret;
+}
+
+static void do_collection(void)
+{
+ struct page *page, *shadow, *origin;
+
+ while (collect.index >= 3) {
+ page = smallstack_pop(&collect);
+ shadow = smallstack_pop(&collect);
+ origin = smallstack_pop(&collect);
+ kmsan_setup_meta(page, shadow, origin, collect.order);
+ __free_pages_core(page, collect.order);
+ }
+}
+
+static void collect_split(void)
+{
+ struct smallstack tmp = {
+ .order = collect.order - 1,
+ .index = 0,
+ };
+ struct page *page;
+
+ if (!collect.order)
+ return;
+ while (collect.index) {
+ page = smallstack_pop(&collect);
+ smallstack_push(&tmp, &page[0]);
+ smallstack_push(&tmp, &page[1 << tmp.order]);
+ }
+ __memcpy(&collect, &tmp, sizeof(tmp));
+}
+
+/*
+ * Memblock is about to go away. Split the page blocks left over in held_back[]
+ * and return 1/3 of that memory to the system.
+ */
+static void kmsan_memblock_discard(void)
+{
+ /*
+ * For each order=N:
+ * - push held_back[N].shadow and .origin to @collect;
+ * - while there are >= 3 elements in @collect, do garbage collection:
+ * - pop 3 ranges from @collect;
+ * - use two of them as shadow and origin for the third one;
+ * - repeat;
+ * - split each remaining element from @collect into 2 ranges of
+ * order=N-1,
+ * - repeat.
+ */
+ collect.order = MAX_ORDER - 1;
+ for (int i = MAX_ORDER - 1; i >= 0; i--) {
+ if (held_back[i].shadow)
+ smallstack_push(&collect, held_back[i].shadow);
+ if (held_back[i].origin)
+ smallstack_push(&collect, held_back[i].origin);
+ held_back[i].shadow = NULL;
+ held_back[i].origin = NULL;
+ do_collection();
+ collect_split();
+ }
+}
+
+void __init kmsan_init_runtime(void)
+{
+ /* Assuming current is init_task */
+ kmsan_internal_task_create(current);
+ kmsan_memblock_discard();
+ pr_info("Starting KernelMemorySanitizer\n");
+ pr_info("ATTENTION: KMSAN is a debugging tool! Do not use it on production machines!\n");
+ kmsan_enabled = true;
+}
diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c
new file mode 100644
index 000000000000..770fe02904f3
--- /dev/null
+++ b/mm/kmsan/instrumentation.c
@@ -0,0 +1,310 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN compiler API.
+ *
+ * This file implements __msan_XXX hooks that Clang inserts into the code
+ * compiled with -fsanitize=kernel-memory.
+ * See Documentation/dev-tools/kmsan.rst for more information on how KMSAN
+ * instrumentation works.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include "kmsan.h"
+#include <linux/gfp.h>
+#include <linux/kmsan_string.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+static inline bool is_bad_asm_addr(void *addr, uintptr_t size, bool is_store)
+{
+ if ((u64)addr < TASK_SIZE)
+ return true;
+ if (!kmsan_get_metadata(addr, KMSAN_META_SHADOW))
+ return true;
+ return false;
+}
+
+static inline struct shadow_origin_ptr
+get_shadow_origin_ptr(void *addr, u64 size, bool store)
+{
+ unsigned long ua_flags = user_access_save();
+ struct shadow_origin_ptr ret;
+
+ ret = kmsan_get_shadow_origin_ptr(addr, size, store);
+ user_access_restore(ua_flags);
+ return ret;
+}
+
+/* Get shadow and origin pointers for a memory load with non-standard size. */
+struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr,
+ uintptr_t size)
+{
+ return get_shadow_origin_ptr(addr, size, /*store*/ false);
+}
+EXPORT_SYMBOL(__msan_metadata_ptr_for_load_n);
+
+/* Get shadow and origin pointers for a memory store with non-standard size. */
+struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr,
+ uintptr_t size)
+{
+ return get_shadow_origin_ptr(addr, size, /*store*/ true);
+}
+EXPORT_SYMBOL(__msan_metadata_ptr_for_store_n);
+
+/*
+ * Declare functions that obtain shadow/origin pointers for loads and stores
+ * with fixed size.
+ */
+#define DECLARE_METADATA_PTR_GETTER(size) \
+ struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \
+ void *addr) \
+ { \
+ return get_shadow_origin_ptr(addr, size, /*store*/ false); \
+ } \
+ EXPORT_SYMBOL(__msan_metadata_ptr_for_load_##size); \
+ struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \
+ void *addr) \
+ { \
+ return get_shadow_origin_ptr(addr, size, /*store*/ true); \
+ } \
+ EXPORT_SYMBOL(__msan_metadata_ptr_for_store_##size)
+
+DECLARE_METADATA_PTR_GETTER(1);
+DECLARE_METADATA_PTR_GETTER(2);
+DECLARE_METADATA_PTR_GETTER(4);
+DECLARE_METADATA_PTR_GETTER(8);
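+
+/*
+ * E.g. DECLARE_METADATA_PTR_GETTER(4) above defines and exports
+ * __msan_metadata_ptr_for_load_4() and __msan_metadata_ptr_for_store_4(),
+ * which the compiler calls for every instrumented 4-byte load and store.
+ */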
+
+/*
+ * Handle a memory store performed by inline assembly. KMSAN conservatively
+ * attempts to unpoison the outputs of asm() directives to prevent false
+ * positives caused by missed stores.
+ *
+ * __msan_instrument_asm_store() may be called for inline assembly code when
+ * entering or leaving IRQ. We omit the check for kmsan_in_runtime() to ensure
+ * the memory written to in these cases is also marked as initialized.
+ */
+void __msan_instrument_asm_store(void *addr, uintptr_t size)
+{
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled)
+ return;
+
+ ua_flags = user_access_save();
+ /*
+ * Most of the accesses are below 32 bytes. The two exceptions so far
+ * are clwb() (64 bytes) and FPU state (512 bytes).
+ * It's unlikely that the assembly will touch more than 512 bytes.
+ */
+ if (size > 512) {
+ WARN_ONCE(1, "assembly store size too big: %ld\n", size);
+ size = 8;
+ }
+ if (is_bad_asm_addr(addr, size, /*is_store*/ true)) {
+ user_access_restore(ua_flags);
+ return;
+ }
+ /* Unpoison the memory on a best-effort basis. */
+ kmsan_internal_unpoison_memory(addr, size, /*checked*/ false);
+ user_access_restore(ua_flags);
+}
+EXPORT_SYMBOL(__msan_instrument_asm_store);
+
+/*
+ * KMSAN instrumentation pass replaces LLVM memcpy, memmove and memset
+ * intrinsics with calls to respective __msan_ functions. We use
+ * get_param0_metadata() and set_retval_metadata() to store the shadow/origin
+ * values for the destination argument of these functions and use them for the
+ * functions' return values.
+ */
+static inline void get_param0_metadata(u64 *shadow,
+ depot_stack_handle_t *origin)
+{
+ struct kmsan_ctx *ctx = kmsan_get_context();
+
+ *shadow = *(u64 *)(ctx->cstate.param_tls);
+ *origin = ctx->cstate.param_origin_tls[0];
+}
+
+static inline void set_retval_metadata(u64 shadow, depot_stack_handle_t origin)
+{
+ struct kmsan_ctx *ctx = kmsan_get_context();
+
+ *(u64 *)(ctx->cstate.retval_tls) = shadow;
+ ctx->cstate.retval_origin_tls = origin;
+}
+
+/* Handle llvm.memmove intrinsic. */
+void *__msan_memmove(void *dst, const void *src, uintptr_t n)
+{
+ depot_stack_handle_t origin;
+ void *result;
+ u64 shadow;
+
+ get_param0_metadata(&shadow, &origin);
+ result = __memmove(dst, src, n);
+ if (!n)
+ /* Some people call memmove() with zero length. */
+ return result;
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return result;
+
+ kmsan_enter_runtime();
+ kmsan_internal_memmove_metadata(dst, (void *)src, n);
+ kmsan_leave_runtime();
+
+ set_retval_metadata(shadow, origin);
+ return result;
+}
+EXPORT_SYMBOL(__msan_memmove);
+
+/* Handle llvm.memcpy intrinsic. */
+void *__msan_memcpy(void *dst, const void *src, uintptr_t n)
+{
+ depot_stack_handle_t origin;
+ void *result;
+ u64 shadow;
+
+ get_param0_metadata(&shadow, &origin);
+ result = __memcpy(dst, src, n);
+ if (!n)
+ /* Some people call memcpy() with zero length. */
+ return result;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return result;
+
+ kmsan_enter_runtime();
+ /* Using memmove instead of memcpy doesn't affect correctness. */
+ kmsan_internal_memmove_metadata(dst, (void *)src, n);
+ kmsan_leave_runtime();
+
+ set_retval_metadata(shadow, origin);
+ return result;
+}
+EXPORT_SYMBOL(__msan_memcpy);
+
+/* Handle llvm.memset intrinsic. */
+void *__msan_memset(void *dst, int c, uintptr_t n)
+{
+ depot_stack_handle_t origin;
+ void *result;
+ u64 shadow;
+
+ get_param0_metadata(&shadow, &origin);
+ result = __memset(dst, c, n);
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return result;
+
+ kmsan_enter_runtime();
+ /*
+ * Clang doesn't pass parameter metadata here, so it is impossible to
+ * use the shadow of @c to set up the shadow for @dst.
+ */
+ kmsan_internal_unpoison_memory(dst, n, /*checked*/ false);
+ kmsan_leave_runtime();
+
+ set_retval_metadata(shadow, origin);
+ return result;
+}
+EXPORT_SYMBOL(__msan_memset);
+
+/*
+ * Create a new origin from an old one. This is done when storing an
+ * uninitialized value to memory. When reporting an error, KMSAN unrolls and
+ * prints the whole chain of stores that preceded the use of this value.
+ */
+depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin)
+{
+ depot_stack_handle_t ret = 0;
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return ret;
+
+ ua_flags = user_access_save();
+
+ /* Creating new origins may allocate memory. */
+ kmsan_enter_runtime();
+ ret = kmsan_internal_chain_origin(origin);
+ kmsan_leave_runtime();
+ user_access_restore(ua_flags);
+ return ret;
+}
+EXPORT_SYMBOL(__msan_chain_origin);
+
+/* Poison a local variable when entering a function. */
+void __msan_poison_alloca(void *address, uintptr_t size, char *descr)
+{
+ depot_stack_handle_t handle;
+ unsigned long entries[4];
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ ua_flags = user_access_save();
+ entries[0] = KMSAN_ALLOCA_MAGIC_ORIGIN;
+ entries[1] = (u64)descr;
+ entries[2] = (u64)__builtin_return_address(0);
+ /*
+ * With frame pointers enabled, it is possible to quickly fetch the
+ * second frame of the caller stack without calling the unwinder.
+ * Without them, simply do not bother.
+ */
+ if (IS_ENABLED(CONFIG_UNWINDER_FRAME_POINTER))
+ entries[3] = (u64)__builtin_return_address(1);
+ else
+ entries[3] = 0;
+
+ /* stack_depot_save() may allocate memory. */
+ kmsan_enter_runtime();
+ handle = stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC);
+ kmsan_leave_runtime();
+
+ kmsan_internal_set_shadow_origin(address, size, -1, handle,
+ /*checked*/ true);
+ user_access_restore(ua_flags);
+}
+EXPORT_SYMBOL(__msan_poison_alloca);
+
+/* Unpoison a local variable. */
+void __msan_unpoison_alloca(void *address, uintptr_t size)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ kmsan_enter_runtime();
+ kmsan_internal_unpoison_memory(address, size, /*checked*/ true);
+ kmsan_leave_runtime();
+}
+EXPORT_SYMBOL(__msan_unpoison_alloca);
+
+/*
+ * Report that an uninitialized value with the given origin was used in a way
+ * that constituted undefined behavior.
+ */
+void __msan_warning(u32 origin)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ kmsan_enter_runtime();
+ kmsan_report(origin, /*address*/ 0, /*size*/ 0,
+ /*off_first*/ 0, /*off_last*/ 0, /*user_addr*/ 0,
+ REASON_ANY);
+ kmsan_leave_runtime();
+}
+EXPORT_SYMBOL(__msan_warning);
+
+/*
+ * At the beginning of an instrumented function, obtain the pointer to
+ * `struct kmsan_context_state` holding the metadata for function parameters.
+ */
+struct kmsan_context_state *__msan_get_context_state(void)
+{
+ return &kmsan_get_context()->cstate;
+}
+EXPORT_SYMBOL(__msan_get_context_state);
diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h
new file mode 100644
index 000000000000..a14744205435
--- /dev/null
+++ b/mm/kmsan/kmsan.h
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Functions used by the KMSAN runtime.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#ifndef __MM_KMSAN_KMSAN_H
+#define __MM_KMSAN_KMSAN_H
+
+#include <asm/pgtable_64_types.h>
+#include <linux/irqflags.h>
+#include <linux/sched.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+
+#define KMSAN_ALLOCA_MAGIC_ORIGIN 0xabcd0100
+#define KMSAN_CHAIN_MAGIC_ORIGIN 0xabcd0200
+
+#define KMSAN_POISON_NOCHECK 0x0
+#define KMSAN_POISON_CHECK 0x1
+#define KMSAN_POISON_FREE 0x2
+
+#define KMSAN_ORIGIN_SIZE 4
+#define KMSAN_MAX_ORIGIN_DEPTH 7
+
+#define KMSAN_STACK_DEPTH 64
+
+#define KMSAN_META_SHADOW (false)
+#define KMSAN_META_ORIGIN (true)
+
+extern bool kmsan_enabled;
+extern int panic_on_kmsan;
+
+/*
+ * KMSAN performs a lot of consistency checks that are currently enabled by
+ * default. BUG_ON is normally discouraged in the kernel, unless used for
+ * debugging, but KMSAN itself is a debugging tool, so it makes little sense to
+ * recover if something goes wrong.
+ */
+#define KMSAN_WARN_ON(cond) \
+ ({ \
+ const bool __cond = WARN_ON(cond); \
+ if (unlikely(__cond)) { \
+ WRITE_ONCE(kmsan_enabled, false); \
+ if (panic_on_kmsan) { \
+ /* Can't call panic() here because */ \
+ /* of uaccess checks. */ \
+ BUG(); \
+ } \
+ } \
+ __cond; \
+ })
+
+/*
+ * A pair of metadata pointers to be returned by the instrumentation functions.
+ */
+struct shadow_origin_ptr {
+ void *shadow, *origin;
+};
+
+struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *addr, u64 size,
+ bool store);
+void *kmsan_get_metadata(void *addr, bool is_origin);
+void __init kmsan_init_alloc_meta_for_range(void *start, void *end);
+
+enum kmsan_bug_reason {
+ REASON_ANY,
+ REASON_COPY_TO_USER,
+ REASON_SUBMIT_URB,
+};
+
+void kmsan_print_origin(depot_stack_handle_t origin);
+
+/**
+ * kmsan_report() - Report a use of uninitialized value.
+ * @origin: Stack ID of the uninitialized value.
+ * @address: Address at which the memory access happens.
+ * @size: Memory access size.
+ * @off_first: Offset (from @address) of the first byte to be reported.
+ * @off_last: Offset (from @address) of the last byte to be reported.
+ * @user_addr: When non-NULL, denotes the userspace address to which the kernel
+ * is leaking data.
+ * @reason: Error type from enum kmsan_bug_reason.
+ *
+ * kmsan_report() prints an error message for a consecutive group of bytes
+ * sharing the same origin. If an uninitialized value is used in a comparison,
+ * this function is called once without specifying the addresses. When checking
+ * a memory range, KMSAN may call kmsan_report() multiple times with the same
+ * @address, @size, @user_addr and @reason, but different @off_first and
+ * @off_last corresponding to different @origin values.
+ */
+void kmsan_report(depot_stack_handle_t origin, void *address, int size,
+ int off_first, int off_last, const void *user_addr,
+ enum kmsan_bug_reason reason);
+
+DECLARE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx);
+
+static __always_inline struct kmsan_ctx *kmsan_get_context(void)
+{
+ return in_task() ? &current->kmsan_ctx : raw_cpu_ptr(&kmsan_percpu_ctx);
+}
+
+/*
+ * When a compiler hook or KMSAN runtime function is invoked, it may make a
+ * call to instrumented code and eventually call itself recursively. To avoid
+ * that, we guard the runtime entry regions with
+ * kmsan_enter_runtime()/kmsan_leave_runtime() and exit the hook if
+ * kmsan_in_runtime() is true.
+ *
+ * Non-runtime code may occasionally get executed in nested IRQs from the
+ * runtime code (e.g. when called via smp_call_function_single()). Because some
+ * KMSAN routines may take locks (e.g. for memory allocation), we conservatively
+ * bail out instead of calling them. To minimize the effect of this (potentially
+ * missing initialization events) kmsan_in_runtime() is not checked in
+ * non-blocking runtime functions.
+ */
+static __always_inline bool kmsan_in_runtime(void)
+{
+ if ((hardirq_count() >> HARDIRQ_SHIFT) > 1)
+ return true;
+ if (in_nmi())
+ return true;
+ return kmsan_get_context()->kmsan_in_runtime;
+}
+
+static __always_inline void kmsan_enter_runtime(void)
+{
+ struct kmsan_ctx *ctx;
+
+ ctx = kmsan_get_context();
+ KMSAN_WARN_ON(ctx->kmsan_in_runtime++);
+}
+
+static __always_inline void kmsan_leave_runtime(void)
+{
+ struct kmsan_ctx *ctx = kmsan_get_context();
+
+ KMSAN_WARN_ON(--ctx->kmsan_in_runtime);
+}
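+
+/*
+ * A typical blocking hook therefore follows the pattern below (see e.g.
+ * kmsan_slab_alloc() and kmsan_poison_memory() in hooks.c):
+ *
+ *	if (!kmsan_enabled || kmsan_in_runtime())
+ *		return;
+ *	kmsan_enter_runtime();
+ *	... call kmsan_internal_*() helpers ...
+ *	kmsan_leave_runtime();
+ */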
+
+depot_stack_handle_t kmsan_save_stack(void);
+depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
+ unsigned int extra_bits);
+
+/*
+ * Pack and unpack the origin chain depth and UAF flag to/from the extra bits
+ * provided by the stack depot.
+ * The UAF flag is stored in the lowest bit, followed by the depth in the upper
+ * bits.
+ * kmsan_internal_chain_origin() is responsible for clamping the depth.
+ */
+static __always_inline unsigned int kmsan_extra_bits(unsigned int depth,
+ bool uaf)
+{
+ return (depth << 1) | uaf;
+}
+
+static __always_inline bool kmsan_uaf_from_eb(unsigned int extra_bits)
+{
+ return extra_bits & 1;
+}
+
+static __always_inline unsigned int kmsan_depth_from_eb(unsigned int extra_bits)
+{
+ return extra_bits >> 1;
+}
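+
+/*
+ * For example, kmsan_extra_bits(3, true) packs origin depth 3 plus the UAF
+ * flag as (3 << 1) | 1 == 7; kmsan_depth_from_eb(7) and kmsan_uaf_from_eb(7)
+ * recover 3 and true respectively.
+ */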
+
+/*
+ * kmsan_internal_ functions are supposed to be very simple and not require the
+ * kmsan_in_runtime() checks.
+ */
+void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n);
+void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags,
+ unsigned int poison_flags);
+void kmsan_internal_unpoison_memory(void *address, size_t size, bool checked);
+void kmsan_internal_set_shadow_origin(void *address, size_t size, int b,
+ u32 origin, bool checked);
+depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id);
+
+void kmsan_internal_task_create(struct task_struct *task);
+
+bool kmsan_metadata_is_contiguous(void *addr, size_t size);
+void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr,
+ int reason);
+
+struct page *kmsan_vmalloc_to_page_or_null(void *vaddr);
+void kmsan_setup_meta(struct page *page, struct page *shadow,
+ struct page *origin, int order);
+
+/*
+ * kmsan_internal_is_module_addr() and kmsan_internal_is_vmalloc_addr() are
+ * non-instrumented versions of is_module_address() and is_vmalloc_addr() that
+ * are safe to call from KMSAN runtime without recursion.
+ */
+static inline bool kmsan_internal_is_module_addr(void *vaddr)
+{
+ return ((u64)vaddr >= MODULES_VADDR) && ((u64)vaddr < MODULES_END);
+}
+
+static inline bool kmsan_internal_is_vmalloc_addr(void *addr)
+{
+ return ((u64)addr >= VMALLOC_START) && ((u64)addr < VMALLOC_END);
+}
+
+#endif /* __MM_KMSAN_KMSAN_H */
diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
new file mode 100644
index 000000000000..088e21a48dc4
--- /dev/null
+++ b/mm/kmsan/kmsan_test.c
@@ -0,0 +1,585 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test cases for KMSAN.
+ * Each test case checks the presence (or absence) of generated reports.
+ * Relies on the 'console' tracepoint to capture reports as they appear in the
+ * kernel log.
+ *
+ * Copyright (C) 2021-2022, Google LLC.
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include <kunit/test.h>
+#include "kmsan.h"
+
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/kmsan.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/tracepoint.h>
+#include <linux/vmalloc.h>
+#include <trace/events/printk.h>
+
+static DEFINE_PER_CPU(int, per_cpu_var);
+
+/* Report as observed from console. */
+static struct {
+ spinlock_t lock;
+ bool available;
+ bool ignore; /* Stop console output collection. */
+ char header[256];
+} observed = {
+ .lock = __SPIN_LOCK_UNLOCKED(observed.lock),
+};
+
+/* Probe for console output: obtains observed lines of interest. */
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+ unsigned long flags;
+
+ if (observed.ignore)
+ return;
+ spin_lock_irqsave(&observed.lock, flags);
+
+ if (strnstr(buf, "BUG: KMSAN: ", len)) {
+ /*
+ * This is a KMSAN report related to the test.
+ *
+ * The provided @buf is not NUL-terminated; copy no more than
+ * @len bytes and let strscpy() add the missing NUL-terminator.
+ */
+ strscpy(observed.header, buf,
+ min(len + 1, sizeof(observed.header)));
+ WRITE_ONCE(observed.available, true);
+ observed.ignore = true;
+ }
+ spin_unlock_irqrestore(&observed.lock, flags);
+}
+
+/* Check if a report related to the test exists. */
+static bool report_available(void)
+{
+ return READ_ONCE(observed.available);
+}
+
+/* Information we expect in a report. */
+struct expect_report {
+ const char *error_type; /* Error type. */
+ /*
+ * Kernel symbol from the error header, or NULL if no report is
+ * expected.
+ */
+ const char *symbol;
+};
+
+/* Check observed report matches information in @r. */
+static bool report_matches(const struct expect_report *r)
+{
+ typeof(observed.header) expected_header;
+ unsigned long flags;
+ bool ret = false;
+ const char *end;
+ char *cur;
+
+ /* Double-checked locking. */
+ if (!report_available() || !r->symbol)
+ return (!report_available() && !r->symbol);
+
+ /* Generate expected report contents. */
+
+ /* Title */
+ cur = expected_header;
+ end = &expected_header[sizeof(expected_header) - 1];
+
+ cur += scnprintf(cur, end - cur, "BUG: KMSAN: %s", r->error_type);
+
+ scnprintf(cur, end - cur, " in %s", r->symbol);
+ /* The exact offset won't match, remove it; also strip module name. */
+ cur = strchr(expected_header, '+');
+ if (cur)
+ *cur = '\0';
+
+ spin_lock_irqsave(&observed.lock, flags);
+ if (!report_available())
+ goto out; /* A new report is being captured. */
+
+ /* Finally match expected output to what we actually observed. */
+ ret = strstr(observed.header, expected_header);
+out:
+ spin_unlock_irqrestore(&observed.lock, flags);
+
+ return ret;
+}
+
+/* ===== Test cases ===== */
+
+/* Prevent replacing branch with select in LLVM. */
+static noinline void check_true(char *arg)
+{
+ pr_info("%s is true\n", arg);
+}
+
+static noinline void check_false(char *arg)
+{
+ pr_info("%s is false\n", arg);
+}
+
+#define USE(x) \
+ do { \
+ if (x) \
+ check_true(#x); \
+ else \
+ check_false(#x); \
+ } while (0)
+
+#define EXPECTATION_ETYPE_FN(e, reason, fn) \
+ struct expect_report e = { \
+ .error_type = reason, \
+ .symbol = fn, \
+ }
+
+#define EXPECTATION_NO_REPORT(e) EXPECTATION_ETYPE_FN(e, NULL, NULL)
+#define EXPECTATION_UNINIT_VALUE_FN(e, fn) \
+ EXPECTATION_ETYPE_FN(e, "uninit-value", fn)
+#define EXPECTATION_UNINIT_VALUE(e) EXPECTATION_UNINIT_VALUE_FN(e, __func__)
+#define EXPECTATION_USE_AFTER_FREE(e) \
+ EXPECTATION_ETYPE_FN(e, "use-after-free", __func__)
+
+/* Test case: ensure that kmalloc() returns uninitialized memory. */
+static void test_uninit_kmalloc(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE(expect);
+ int *ptr;
+
+ kunit_info(test, "uninitialized kmalloc test (UMR report)\n");
+ ptr = kmalloc(sizeof(*ptr), GFP_KERNEL);
+ USE(*ptr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that kmalloc'ed memory becomes initialized after memset().
+ */
+static void test_init_kmalloc(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ int *ptr;
+
+ kunit_info(test, "initialized kmalloc test (no reports)\n");
+ ptr = kmalloc(sizeof(*ptr), GFP_KERNEL);
+ memset(ptr, 0, sizeof(*ptr));
+ USE(*ptr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test case: ensure that kzalloc() returns initialized memory. */
+static void test_init_kzalloc(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ int *ptr;
+
+ kunit_info(test, "initialized kzalloc test (no reports)\n");
+ ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
+ USE(*ptr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test case: ensure that local variables are uninitialized by default. */
+static void test_uninit_stack_var(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE(expect);
+ volatile int cond;
+
+ kunit_info(test, "uninitialized stack variable (UMR report)\n");
+ USE(cond);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test case: ensure that local variables with initializers are initialized. */
+static void test_init_stack_var(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ volatile int cond = 1;
+
+ kunit_info(test, "initialized stack variable (no reports)\n");
+ USE(cond);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static noinline void two_param_fn_2(int arg1, int arg2)
+{
+ USE(arg1);
+ USE(arg2);
+}
+
+static noinline void one_param_fn(int arg)
+{
+ two_param_fn_2(arg, arg);
+ USE(arg);
+}
+
+static noinline void two_param_fn(int arg1, int arg2)
+{
+ int init = 0;
+
+ one_param_fn(init);
+ USE(arg1);
+ USE(arg2);
+}
+
+static void test_params(struct kunit *test)
+{
+#ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL
+ /*
+ * With eager param/retval checking enabled, KMSAN will report an error
+ * before the call to two_param_fn().
+ */
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_params");
+#else
+ EXPECTATION_UNINIT_VALUE_FN(expect, "two_param_fn");
+#endif
+ volatile int uninit, init = 1;
+
+ kunit_info(test,
+ "uninit passed through a function parameter (UMR report)\n");
+ two_param_fn(uninit, init);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static int signed_sum3(int a, int b, int c)
+{
+ return a + b + c;
+}
+
+/*
+ * Test case: ensure that uninitialized values are tracked through function
+ * arguments.
+ */
+static void test_uninit_multiple_params(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE(expect);
+ volatile char b = 3, c;
+ volatile int a;
+
+ kunit_info(test, "uninitialized local passed to fn (UMR report)\n");
+ USE(signed_sum3(a, b, c));
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Helper function to make an array uninitialized. */
+static noinline void do_uninit_local_array(char *array, int start, int stop)
+{
+ volatile char uninit;
+
+ for (int i = start; i < stop; i++)
+ array[i] = uninit;
+}
+
+/*
+ * Test case: ensure kmsan_check_memory() reports an error when checking
+ * uninitialized memory.
+ */
+static void test_uninit_kmsan_check_memory(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_uninit_kmsan_check_memory");
+ volatile char local_array[8];
+
+ kunit_info(
+ test,
+ "kmsan_check_memory() called on uninit local (UMR report)\n");
+ do_uninit_local_array((char *)local_array, 5, 7);
+
+ kmsan_check_memory((char *)local_array, 8);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: check that a virtual memory range created with vmap() from
+ * initialized pages is still considered as initialized.
+ */
+static void test_init_kmsan_vmap_vunmap(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ const int npages = 2;
+ struct page **pages;
+ void *vbuf;
+
+ kunit_info(test, "pages initialized via vmap (no reports)\n");
+
+ pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
+ for (int i = 0; i < npages; i++)
+ pages[i] = alloc_page(GFP_KERNEL);
+ vbuf = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
+ memset(vbuf, 0xfe, npages * PAGE_SIZE);
+ for (int i = 0; i < npages; i++)
+ kmsan_check_memory(page_address(pages[i]), PAGE_SIZE);
+
+ if (vbuf)
+ vunmap(vbuf);
+ for (int i = 0; i < npages; i++) {
+ if (pages[i])
+ __free_page(pages[i]);
+ }
+ kfree(pages);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that memset() can initialize a buffer allocated via
+ * vmalloc().
+ */
+static void test_init_vmalloc(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ int npages = 8;
+ char *buf;
+
+ kunit_info(test, "vmalloc buffer can be initialized (no reports)\n");
+ buf = vmalloc(PAGE_SIZE * npages);
+ buf[0] = 1;
+ memset(buf, 0xfe, PAGE_SIZE * npages);
+ USE(buf[0]);
+ for (int i = 0; i < npages; i++)
+ kmsan_check_memory(&buf[PAGE_SIZE * i], PAGE_SIZE);
+ vfree(buf);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test case: ensure that use-after-free reporting works. */
+static void test_uaf(struct kunit *test)
+{
+ EXPECTATION_USE_AFTER_FREE(expect);
+ volatile int value;
+ volatile int *var;
+
+ kunit_info(test, "use-after-free in kmalloc-ed buffer (UMR report)\n");
+ var = kmalloc(80, GFP_KERNEL);
+ var[3] = 0xfeedface;
+ kfree((int *)var);
+ /* Copy the invalid value before checking it. */
+ value = var[3];
+ USE(value);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that uninitialized values are propagated through per-CPU
+ * memory.
+ */
+static void test_percpu_propagate(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE(expect);
+ volatile int uninit, check;
+
+ kunit_info(test,
+ "uninit local stored to per_cpu memory (UMR report)\n");
+
+ this_cpu_write(per_cpu_var, uninit);
+ check = this_cpu_read(per_cpu_var);
+ USE(check);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that passing uninitialized values to printk() leads to an
+ * error report.
+ */
+static void test_printk(struct kunit *test)
+{
+#ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL
+ /*
+ * With eager param/retval checking enabled, KMSAN will report an error
+ * before the call to pr_info().
+ */
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_printk");
+#else
+ EXPECTATION_UNINIT_VALUE_FN(expect, "number");
+#endif
+ volatile int uninit;
+
+ kunit_info(test, "uninit local passed to pr_info() (UMR report)\n");
+ pr_info("%px contains %d\n", &uninit, uninit);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that memcpy() correctly copies uninitialized values between
+ * aligned `src` and `dst`.
+ */
+static void test_memcpy_aligned_to_aligned(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_aligned_to_aligned");
+ volatile int uninit_src;
+ volatile int dst = 0;
+
+ kunit_info(
+ test,
+ "memcpy()ing aligned uninit src to aligned dst (UMR report)\n");
+ OPTIMIZER_HIDE_VAR(uninit_src);
+ memcpy((void *)&dst, (void *)&uninit_src, sizeof(uninit_src));
+ kmsan_check_memory((void *)&dst, sizeof(dst));
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that memcpy() correctly copies uninitialized values between
+ * aligned `src` and unaligned `dst`.
+ *
+ * Copying an aligned 4-byte value to an unaligned one leads to touching two
+ * aligned 4-byte values. This test case checks that KMSAN correctly reports an
+ * error on the first of the two values.
+ */
+static void test_memcpy_aligned_to_unaligned(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_aligned_to_unaligned");
+ volatile int uninit_src;
+ volatile char dst[8] = { 0 };
+
+ kunit_info(
+ test,
+ "memcpy()ing aligned uninit src to unaligned dst (UMR report)\n");
+ OPTIMIZER_HIDE_VAR(uninit_src);
+ memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src));
+ kmsan_check_memory((void *)dst, 4);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that memcpy() correctly copies uninitialized values between
+ * aligned `src` and unaligned `dst`.
+ *
+ * Copying an aligned 4-byte value to an unaligned one leads to touching two
+ * aligned 4-byte values. This test case checks that KMSAN correctly reports an
+ * error on the second of the two values.
+ */
+static void test_memcpy_aligned_to_unaligned2(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE_FN(expect,
+ "test_memcpy_aligned_to_unaligned2");
+ volatile int uninit_src;
+ volatile char dst[8] = { 0 };
+
+ kunit_info(
+ test,
+ "memcpy()ing aligned uninit src to unaligned dst - part 2 (UMR report)\n");
+ OPTIMIZER_HIDE_VAR(uninit_src);
+ memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src));
+ kmsan_check_memory((void *)&dst[4], sizeof(uninit_src));
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static noinline void fibonacci(int *array, int size, int start)
+{
+ if (start < 2 || (start == size))
+ return;
+ array[start] = array[start - 1] + array[start - 2];
+ fibonacci(array, size, start + 1);
+}
+
+static void test_long_origin_chain(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE_FN(expect,
+ "test_long_origin_chain");
+ /* (KMSAN_MAX_ORIGIN_DEPTH * 2) recursive calls to fibonacci(). */
+ volatile int accum[KMSAN_MAX_ORIGIN_DEPTH * 2 + 2];
+ int last = ARRAY_SIZE(accum) - 1;
+
+ kunit_info(
+ test,
+ "origin chain exceeding KMSAN_MAX_ORIGIN_DEPTH (UMR report)\n");
+ /*
+ * We do not set accum[1] to 0, so the uninitializedness will be carried
+ * over to accum[2..last].
+ */
+ accum[0] = 1;
+ fibonacci((int *)accum, ARRAY_SIZE(accum), 2);
+ kmsan_check_memory((void *)&accum[last], sizeof(int));
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static struct kunit_case kmsan_test_cases[] = {
+ KUNIT_CASE(test_uninit_kmalloc),
+ KUNIT_CASE(test_init_kmalloc),
+ KUNIT_CASE(test_init_kzalloc),
+ KUNIT_CASE(test_uninit_stack_var),
+ KUNIT_CASE(test_init_stack_var),
+ KUNIT_CASE(test_params),
+ KUNIT_CASE(test_uninit_multiple_params),
+ KUNIT_CASE(test_uninit_kmsan_check_memory),
+ KUNIT_CASE(test_init_kmsan_vmap_vunmap),
+ KUNIT_CASE(test_init_vmalloc),
+ KUNIT_CASE(test_uaf),
+ KUNIT_CASE(test_percpu_propagate),
+ KUNIT_CASE(test_printk),
+ KUNIT_CASE(test_memcpy_aligned_to_aligned),
+ KUNIT_CASE(test_memcpy_aligned_to_unaligned),
+ KUNIT_CASE(test_memcpy_aligned_to_unaligned2),
+ KUNIT_CASE(test_long_origin_chain),
+ {},
+};
+
+/* ===== End test cases ===== */
+
+static int test_init(struct kunit *test)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&observed.lock, flags);
+ observed.header[0] = '\0';
+ observed.ignore = false;
+ observed.available = false;
+ spin_unlock_irqrestore(&observed.lock, flags);
+
+ return 0;
+}
+
+static void test_exit(struct kunit *test)
+{
+}
+
+static void register_tracepoints(struct tracepoint *tp, void *ignore)
+{
+ check_trace_callback_type_console(probe_console);
+ if (!strcmp(tp->name, "console"))
+ WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
+}
+
+static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
+{
+ if (!strcmp(tp->name, "console"))
+ tracepoint_probe_unregister(tp, probe_console, NULL);
+}
+
+static int kmsan_suite_init(struct kunit_suite *suite)
+{
+ /*
+ * Because we want to be able to build the test as a module, we need to
+ * iterate through all known tracepoints, since the static registration
+ * won't work here.
+ */
+ for_each_kernel_tracepoint(register_tracepoints, NULL);
+ return 0;
+}
+
+static void kmsan_suite_exit(struct kunit_suite *suite)
+{
+ for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+ tracepoint_synchronize_unregister();
+}
+
+static struct kunit_suite kmsan_test_suite = {
+ .name = "kmsan",
+ .test_cases = kmsan_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+ .suite_init = kmsan_suite_init,
+ .suite_exit = kmsan_suite_exit,
+};
+kunit_test_suites(&kmsan_test_suite);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Alexander Potapenko <glider@google.com>");
diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c
new file mode 100644
index 000000000000..02736ec757f2
--- /dev/null
+++ b/mm/kmsan/report.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN error reporting routines.
+ *
+ * Copyright (C) 2019-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include <linux/console.h>
+#include <linux/moduleparam.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/uaccess.h>
+
+#include "kmsan.h"
+
+static DEFINE_RAW_SPINLOCK(kmsan_report_lock);
+#define DESCR_SIZE 128
+/* Protected by kmsan_report_lock */
+static char report_local_descr[DESCR_SIZE];
+int panic_on_kmsan __read_mostly;
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "kmsan."
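+/*
+ * When the "kmsan.panic" boot parameter is set, the kernel panics after
+ * printing a report.
+ */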
+module_param_named(panic, panic_on_kmsan, int, 0);
+
+/*
+ * Skip internal KMSAN frames.
+ */
+static int get_stack_skipnr(const unsigned long stack_entries[],
+ int num_entries)
+{
+ int len, skip;
+ char buf[64];
+
+ for (skip = 0; skip < num_entries; ++skip) {
+ len = scnprintf(buf, sizeof(buf), "%ps",
+ (void *)stack_entries[skip]);
+
+ /* Never show __msan_* or kmsan_* functions. */
+ if ((strnstr(buf, "__msan_", len) == buf) ||
+ (strnstr(buf, "kmsan_", len) == buf))
+ continue;
+
+ /*
+ * No match for runtime functions -- @skip entries to skip to
+ * get to first frame of interest.
+ */
+ break;
+ }
+
+ return skip;
+}
+
+/*
+ * Currently the descriptions of locals generated by Clang look as follows:
+ * ----local_name@function_name
+ * We want to print only the name of the local, as other information in that
+ * description can be confusing.
+ * The meaningful part of the description is copied to a global buffer to avoid
+ * allocating memory.
+ */
+static char *pretty_descr(char *descr)
+{
+ int pos = 0, len = strlen(descr);
+
+ for (int i = 0; i < len; i++) {
+ if (descr[i] == '@')
+ break;
+ if (descr[i] == '-')
+ continue;
+ report_local_descr[pos] = descr[i];
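+ /* Stop one byte early so the trailing NUL below always fits. */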
+ if (pos + 1 == DESCR_SIZE)
+ break;
+ pos++;
+ }
+ report_local_descr[pos] = 0;
+ return report_local_descr;
+}
+
+void kmsan_print_origin(depot_stack_handle_t origin)
+{
+ unsigned long *entries = NULL, *chained_entries = NULL;
+ unsigned int nr_entries, chained_nr_entries, skipnr;
+ void *pc1 = NULL, *pc2 = NULL;
+ depot_stack_handle_t head;
+ unsigned long magic;
+ char *descr = NULL;
+ unsigned int depth;
+
+ if (!origin)
+ return;
+
+ while (true) {
+ nr_entries = stack_depot_fetch(origin, &entries);
+ depth = kmsan_depth_from_eb(stack_depot_get_extra_bits(origin));
+ magic = nr_entries ? entries[0] : 0;
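+ /*
+ * A 4-entry record starting with KMSAN_ALLOCA_MAGIC_ORIGIN describes a
+ * local variable: {magic, descr, pc1, pc2}.
+ */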
+ if ((nr_entries == 4) && (magic == KMSAN_ALLOCA_MAGIC_ORIGIN)) {
+ descr = (char *)entries[1];
+ pc1 = (void *)entries[2];
+ pc2 = (void *)entries[3];
+ pr_err("Local variable %s created at:\n",
+ pretty_descr(descr));
+ if (pc1)
+ pr_err(" %pSb\n", pc1);
+ if (pc2)
+ pr_err(" %pSb\n", pc2);
+ break;
+ }
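+ /*
+ * A 3-entry record starting with KMSAN_CHAIN_MAGIC_ORIGIN is a chain
+ * link: {magic, stack of the store, origin the value had before it}.
+ */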
+ if ((nr_entries == 3) && (magic == KMSAN_CHAIN_MAGIC_ORIGIN)) {
+ /*
+ * Origin chains deeper than KMSAN_MAX_ORIGIN_DEPTH are
+ * not stored, so the output may be incomplete.
+ */
+ if (depth == KMSAN_MAX_ORIGIN_DEPTH)
+ pr_err("<Zero or more stacks not recorded to save memory>\n\n");
+ head = entries[1];
+ origin = entries[2];
+ pr_err("Uninit was stored to memory at:\n");
+ chained_nr_entries =
+ stack_depot_fetch(head, &chained_entries);
+ kmsan_internal_unpoison_memory(
+ chained_entries,
+ chained_nr_entries * sizeof(*chained_entries),
+ /*checked*/ false);
+ skipnr = get_stack_skipnr(chained_entries,
+ chained_nr_entries);
+ stack_trace_print(chained_entries + skipnr,
+ chained_nr_entries - skipnr, 0);
+ pr_err("\n");
+ continue;
+ }
+ pr_err("Uninit was created at:\n");
+ if (nr_entries) {
+ skipnr = get_stack_skipnr(entries, nr_entries);
+ stack_trace_print(entries + skipnr, nr_entries - skipnr,
+ 0);
+ } else {
+ pr_err("(stack is not available)\n");
+ }
+ break;
+ }
+}
+
+void kmsan_report(depot_stack_handle_t origin, void *address, int size,
+ int off_first, int off_last, const void *user_addr,
+ enum kmsan_bug_reason reason)
+{
+ unsigned long stack_entries[KMSAN_STACK_DEPTH];
+ int num_stack_entries, skipnr;
+ char *bug_type = NULL;
+ unsigned long ua_flags;
+ bool is_uaf;
+
+ if (!kmsan_enabled)
+ return;
+ if (!current->kmsan_ctx.allow_reporting)
+ return;
+ if (!origin)
+ return;
+
+ current->kmsan_ctx.allow_reporting = false;
+ ua_flags = user_access_save();
+ raw_spin_lock(&kmsan_report_lock);
+ pr_err("=====================================================\n");
+ is_uaf = kmsan_uaf_from_eb(stack_depot_get_extra_bits(origin));
+ switch (reason) {
+ case REASON_ANY:
+ bug_type = is_uaf ? "use-after-free" : "uninit-value";
+ break;
+ case REASON_COPY_TO_USER:
+ bug_type = is_uaf ? "kernel-infoleak-after-free" :
+ "kernel-infoleak";
+ break;
+ case REASON_SUBMIT_URB:
+ bug_type = is_uaf ? "kernel-usb-infoleak-after-free" :
+ "kernel-usb-infoleak";
+ break;
+ }
+
+ num_stack_entries =
+ stack_trace_save(stack_entries, KMSAN_STACK_DEPTH, 1);
+ skipnr = get_stack_skipnr(stack_entries, num_stack_entries);
+
+ pr_err("BUG: KMSAN: %s in %pSb\n", bug_type,
+ (void *)stack_entries[skipnr]);
+ stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr,
+ 0);
+ pr_err("\n");
+
+ kmsan_print_origin(origin);
+
+ if (size) {
+ pr_err("\n");
+ if (off_first == off_last)
+ pr_err("Byte %d of %d is uninitialized\n", off_first,
+ size);
+ else
+ pr_err("Bytes %d-%d of %d are uninitialized\n",
+ off_first, off_last, size);
+ }
+ if (address)
+ pr_err("Memory access of size %d starts at %px\n", size,
+ address);
+ if (user_addr && reason == REASON_COPY_TO_USER)
+ pr_err("Data copied to user address %px\n", user_addr);
+ pr_err("\n");
+ dump_stack_print_info(KERN_ERR);
+ pr_err("=====================================================\n");
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+ raw_spin_unlock(&kmsan_report_lock);
+ if (panic_on_kmsan)
+ panic("kmsan.panic set ...\n");
+ user_access_restore(ua_flags);
+ current->kmsan_ctx.allow_reporting = true;
+}
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
new file mode 100644
index 000000000000..a787c04e9583
--- /dev/null
+++ b/mm/kmsan/shadow.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN shadow implementation.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include <asm/kmsan.h>
+#include <asm/tlbflush.h>
+#include <linux/cacheflush.h>
+#include <linux/memblock.h>
+#include <linux/mm_types.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+
+#include "../internal.h"
+#include "kmsan.h"
+
+#define shadow_page_for(page) ((page)->kmsan_shadow)
+
+#define origin_page_for(page) ((page)->kmsan_origin)
+
+static void *shadow_ptr_for(struct page *page)
+{
+ return page_address(shadow_page_for(page));
+}
+
+static void *origin_ptr_for(struct page *page)
+{
+ return page_address(origin_page_for(page));
+}
+
+static bool page_has_metadata(struct page *page)
+{
+ return shadow_page_for(page) && origin_page_for(page);
+}
+
+static void set_no_shadow_origin_page(struct page *page)
+{
+ shadow_page_for(page) = NULL;
+ origin_page_for(page) = NULL;
+}
+
+/*
+ * Dummy load and store pages to be used when the real metadata is unavailable.
+ * There are separate pages for loads and stores, so that every load returns a
+ * zero, and every store doesn't affect other loads.
+ */
+static char dummy_load_page[PAGE_SIZE] __aligned(PAGE_SIZE);
+static char dummy_store_page[PAGE_SIZE] __aligned(PAGE_SIZE);
+
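+/*
+ * Return the shadow or origin address corresponding to a vmalloc or module
+ * address, or 0 if @addr belongs to neither region.
+ */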
+static unsigned long vmalloc_meta(void *addr, bool is_origin)
+{
+ unsigned long addr64 = (unsigned long)addr, off;
+
+ KMSAN_WARN_ON(is_origin && !IS_ALIGNED(addr64, KMSAN_ORIGIN_SIZE));
+ if (kmsan_internal_is_vmalloc_addr(addr)) {
+ off = addr64 - VMALLOC_START;
+ return off + (is_origin ? KMSAN_VMALLOC_ORIGIN_START :
+ KMSAN_VMALLOC_SHADOW_START);
+ }
+ if (kmsan_internal_is_module_addr(addr)) {
+ off = addr64 - MODULES_VADDR;
+ return off + (is_origin ? KMSAN_MODULES_ORIGIN_START :
+ KMSAN_MODULES_SHADOW_START);
+ }
+ return 0;
+}
+
+static struct page *virt_to_page_or_null(void *vaddr)
+{
+ if (kmsan_virt_addr_valid(vaddr))
+ return virt_to_page(vaddr);
+ else
+ return NULL;
+}
+
+struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *address, u64 size,
+ bool store)
+{
+ struct shadow_origin_ptr ret;
+ void *shadow;
+
+ /*
+ * Even if we redirect this memory access to the dummy page, it will
+ * go out of bounds.
+ */
+ KMSAN_WARN_ON(size > PAGE_SIZE);
+
+ if (!kmsan_enabled)
+ goto return_dummy;
+
+ KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(address, size));
+ shadow = kmsan_get_metadata(address, KMSAN_META_SHADOW);
+ if (!shadow)
+ goto return_dummy;
+
+ ret.shadow = shadow;
+ ret.origin = kmsan_get_metadata(address, KMSAN_META_ORIGIN);
+ return ret;
+
+return_dummy:
+ if (store) {
+ /* Ignore this store. */
+ ret.shadow = dummy_store_page;
+ ret.origin = dummy_store_page;
+ } else {
+ /* This load will return zero. */
+ ret.shadow = dummy_load_page;
+ ret.origin = dummy_load_page;
+ }
+ return ret;
+}
+
+/*
+ * Obtain the shadow or origin pointer for the given address, or NULL if there's
+ * none. The caller must check the return value for being non-NULL if needed.
+ * The return value of this function should not depend on whether we're in the
+ * runtime or not.
+ */
+void *kmsan_get_metadata(void *address, bool is_origin)
+{
+ u64 addr = (u64)address, pad, off;
+ struct page *page;
+ void *ret;
+
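+ /* Origins are tracked at KMSAN_ORIGIN_SIZE granularity: round down. */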
+ if (is_origin && !IS_ALIGNED(addr, KMSAN_ORIGIN_SIZE)) {
+ pad = addr % KMSAN_ORIGIN_SIZE;
+ addr -= pad;
+ }
+ address = (void *)addr;
+ if (kmsan_internal_is_vmalloc_addr(address) ||
+ kmsan_internal_is_module_addr(address))
+ return (void *)vmalloc_meta(address, is_origin);
+
+ ret = arch_kmsan_get_meta_or_null(address, is_origin);
+ if (ret)
+ return ret;
+
+ page = virt_to_page_or_null(address);
+ if (!page)
+ return NULL;
+ if (!page_has_metadata(page))
+ return NULL;
+ off = addr % PAGE_SIZE;
+
+ return (is_origin ? origin_ptr_for(page) : shadow_ptr_for(page)) + off;
+}
+
+void kmsan_copy_page_meta(struct page *dst, struct page *src)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ if (!dst || !page_has_metadata(dst))
+ return;
+ if (!src || !page_has_metadata(src)) {
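+ /* The source has no metadata: treat the copied data as initialized. */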
+ kmsan_internal_unpoison_memory(page_address(dst), PAGE_SIZE,
+ /*checked*/ false);
+ return;
+ }
+
+ kmsan_enter_runtime();
+ __memcpy(shadow_ptr_for(dst), shadow_ptr_for(src), PAGE_SIZE);
+ __memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE);
+ kmsan_leave_runtime();
+}
+EXPORT_SYMBOL(kmsan_copy_page_meta);
+
+void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags)
+{
+ bool initialized = (flags & __GFP_ZERO) || !kmsan_enabled;
+ struct page *shadow, *origin;
+ depot_stack_handle_t handle;
+ int pages = 1 << order;
+
+ if (!page)
+ return;
+
+ shadow = shadow_page_for(page);
+ origin = origin_page_for(page);
+
+ if (initialized) {
+ __memset(page_address(shadow), 0, PAGE_SIZE * pages);
+ __memset(page_address(origin), 0, PAGE_SIZE * pages);
+ return;
+ }
+
+ /* Zero pages allocated by the runtime should also be initialized. */
+ if (kmsan_in_runtime())
+ return;
+
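+ /* Poison the shadow: every shadow bit set means "uninitialized". */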
+ __memset(page_address(shadow), -1, PAGE_SIZE * pages);
+ kmsan_enter_runtime();
+ handle = kmsan_save_stack_with_flags(flags, /*extra_bits*/ 0);
+ kmsan_leave_runtime();
+ /*
+ * Addresses are page-aligned, pages are contiguous, so it's ok
+ * to just fill the origin pages with @handle.
+ */
+ for (int i = 0; i < PAGE_SIZE * pages / sizeof(handle); i++)
+ ((depot_stack_handle_t *)page_address(origin))[i] = handle;
+}
+
+void kmsan_free_page(struct page *page, unsigned int order)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ kmsan_enter_runtime();
+ kmsan_internal_poison_memory(page_address(page),
+ PAGE_SIZE << compound_order(page),
+ GFP_KERNEL,
+ KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
+ kmsan_leave_runtime();
+}
+
+void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
+ pgprot_t prot, struct page **pages,
+ unsigned int page_shift)
+{
+ unsigned long shadow_start, origin_start, shadow_end, origin_end;
+ struct page **s_pages, **o_pages;
+ int nr, mapped;
+
+ if (!kmsan_enabled)
+ return;
+
+ shadow_start = vmalloc_meta((void *)start, KMSAN_META_SHADOW);
+ shadow_end = vmalloc_meta((void *)end, KMSAN_META_SHADOW);
+ if (!shadow_start)
+ return;
+
+ nr = (end - start) / PAGE_SIZE;
+ s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL);
+ o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL);
+ if (!s_pages || !o_pages)
+ goto ret;
+ for (int i = 0; i < nr; i++) {
+ s_pages[i] = shadow_page_for(pages[i]);
+ o_pages[i] = origin_page_for(pages[i]);
+ }
+ prot = __pgprot(pgprot_val(prot) | _PAGE_NX);
+ prot = PAGE_KERNEL;
+
+ origin_start = vmalloc_meta((void *)start, KMSAN_META_ORIGIN);
+ origin_end = vmalloc_meta((void *)end, KMSAN_META_ORIGIN);
+ kmsan_enter_runtime();
+ mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot,
+ s_pages, page_shift);
+ KMSAN_WARN_ON(mapped);
+ mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot,
+ o_pages, page_shift);
+ KMSAN_WARN_ON(mapped);
+ kmsan_leave_runtime();
+ flush_tlb_kernel_range(shadow_start, shadow_end);
+ flush_tlb_kernel_range(origin_start, origin_end);
+ flush_cache_vmap(shadow_start, shadow_end);
+ flush_cache_vmap(origin_start, origin_end);
+
+ret:
+ kfree(s_pages);
+ kfree(o_pages);
+}
+
+/* Allocate metadata for pages allocated at boot time. */
+void __init kmsan_init_alloc_meta_for_range(void *start, void *end)
+{
+ struct page *shadow_p, *origin_p;
+ void *shadow, *origin;
+ struct page *page;
+ u64 size;
+
+ start = (void *)ALIGN_DOWN((u64)start, PAGE_SIZE);
+ size = ALIGN((u64)end - (u64)start, PAGE_SIZE);
+ shadow = memblock_alloc(size, PAGE_SIZE);
+ origin = memblock_alloc(size, PAGE_SIZE);
+ for (u64 addr = 0; addr < size; addr += PAGE_SIZE) {
+ page = virt_to_page_or_null((char *)start + addr);
+ shadow_p = virt_to_page_or_null((char *)shadow + addr);
+ set_no_shadow_origin_page(shadow_p);
+ shadow_page_for(page) = shadow_p;
+ origin_p = virt_to_page_or_null((char *)origin + addr);
+ set_no_shadow_origin_page(origin_p);
+ origin_page_for(page) = origin_p;
+ }
+}
+
+void kmsan_setup_meta(struct page *page, struct page *shadow,
+ struct page *origin, int order)
+{
+ for (int i = 0; i < (1 << order); i++) {
+ set_no_shadow_origin_page(&shadow[i]);
+ set_no_shadow_origin_page(&origin[i]);
+ shadow_page_for(&page[i]) = &shadow[i];
+ origin_page_for(&page[i]) = &origin[i];
+ }
+}
diff --git a/mm/ksm.c b/mm/ksm.c
index 42ab153335a2..dd02780c387f 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -39,9 +39,11 @@
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/pagewalk.h>
#include <asm/tlbflush.h>
#include "internal.h"
+#include "mm_slot.h"
#ifdef CONFIG_NUMA
#define NUMA(x) (x)
@@ -82,7 +84,7 @@
* different KSM page copy of that content
*
* Internally, the regular nodes, "dups" and "chains" are represented
- * using the same struct stable_node structure.
+ * using the same struct ksm_stable_node structure.
*
* In addition to the stable tree, KSM uses a second data structure called the
* unstable tree: this tree holds pointers to pages which have been found to
@@ -112,17 +114,13 @@
*/
/**
- * struct mm_slot - ksm information per mm that is being scanned
- * @link: link to the mm_slots hash list
- * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
+ * struct ksm_mm_slot - ksm information per mm that is being scanned
+ * @slot: hash lookup from mm to mm_slot
* @rmap_list: head for this mm_slot's singly-linked list of rmap_items
- * @mm: the mm that this information is valid for
*/
-struct mm_slot {
- struct hlist_node link;
- struct list_head mm_list;
- struct rmap_item *rmap_list;
- struct mm_struct *mm;
+struct ksm_mm_slot {
+ struct mm_slot slot;
+ struct ksm_rmap_item *rmap_list;
};
/**
@@ -135,14 +133,14 @@ struct mm_slot {
* There is only the one ksm_scan instance of this cursor structure.
*/
struct ksm_scan {
- struct mm_slot *mm_slot;
+ struct ksm_mm_slot *mm_slot;
unsigned long address;
- struct rmap_item **rmap_list;
+ struct ksm_rmap_item **rmap_list;
unsigned long seqnr;
};
/**
- * struct stable_node - node of the stable rbtree
+ * struct ksm_stable_node - node of the stable rbtree
* @node: rb node of this ksm page in the stable tree
* @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
* @hlist_dup: linked into the stable_node->hlist with a stable_node chain
@@ -153,7 +151,7 @@ struct ksm_scan {
* @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
* @nid: NUMA node id of stable tree in which linked (may not match kpfn)
*/
-struct stable_node {
+struct ksm_stable_node {
union {
struct rb_node node; /* when node of stable tree */
struct { /* when listed for migration */
@@ -182,7 +180,7 @@ struct stable_node {
};
/**
- * struct rmap_item - reverse mapping item for virtual addresses
+ * struct ksm_rmap_item - reverse mapping item for virtual addresses
* @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
* @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
* @nid: NUMA node id of unstable tree in which linked (may not match page)
@@ -193,8 +191,8 @@ struct stable_node {
* @head: pointer to stable_node heading this list in the stable tree
* @hlist: link into hlist of rmap_items hanging off that stable_node
*/
-struct rmap_item {
- struct rmap_item *rmap_list;
+struct ksm_rmap_item {
+ struct ksm_rmap_item *rmap_list;
union {
struct anon_vma *anon_vma; /* when stable */
#ifdef CONFIG_NUMA
@@ -207,7 +205,7 @@ struct rmap_item {
union {
struct rb_node node; /* when node of unstable tree */
struct { /* when listed from stable tree */
- struct stable_node *head;
+ struct ksm_stable_node *head;
struct hlist_node hlist;
};
};
@@ -230,8 +228,8 @@ static LIST_HEAD(migrate_nodes);
#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
-static struct mm_slot ksm_mm_head = {
- .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
+static struct ksm_mm_slot ksm_mm_head = {
+ .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node),
};
static struct ksm_scan ksm_scan = {
.mm_slot = &ksm_mm_head,
@@ -298,21 +296,21 @@ static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);
-#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
+#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\
sizeof(struct __struct), __alignof__(struct __struct),\
(__flags), NULL)
static int __init ksm_slab_init(void)
{
- rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
+ rmap_item_cache = KSM_KMEM_CACHE(ksm_rmap_item, 0);
if (!rmap_item_cache)
goto out;
- stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
+ stable_node_cache = KSM_KMEM_CACHE(ksm_stable_node, 0);
if (!stable_node_cache)
goto out_free1;
- mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
+ mm_slot_cache = KSM_KMEM_CACHE(ksm_mm_slot, 0);
if (!mm_slot_cache)
goto out_free2;
@@ -334,18 +332,18 @@ static void __init ksm_slab_free(void)
mm_slot_cache = NULL;
}
-static __always_inline bool is_stable_node_chain(struct stable_node *chain)
+static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain)
{
return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
}
-static __always_inline bool is_stable_node_dup(struct stable_node *dup)
+static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup)
{
return dup->head == STABLE_NODE_DUP_HEAD;
}
-static inline void stable_node_chain_add_dup(struct stable_node *dup,
- struct stable_node *chain)
+static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup,
+ struct ksm_stable_node *chain)
{
VM_BUG_ON(is_stable_node_dup(dup));
dup->head = STABLE_NODE_DUP_HEAD;
@@ -354,14 +352,14 @@ static inline void stable_node_chain_add_dup(struct stable_node *dup,
ksm_stable_node_dups++;
}
-static inline void __stable_node_dup_del(struct stable_node *dup)
+static inline void __stable_node_dup_del(struct ksm_stable_node *dup)
{
VM_BUG_ON(!is_stable_node_dup(dup));
hlist_del(&dup->hlist_dup);
ksm_stable_node_dups--;
}
-static inline void stable_node_dup_del(struct stable_node *dup)
+static inline void stable_node_dup_del(struct ksm_stable_node *dup)
{
VM_BUG_ON(is_stable_node_chain(dup));
if (is_stable_node_dup(dup))
@@ -373,9 +371,9 @@ static inline void stable_node_dup_del(struct stable_node *dup)
#endif
}
-static inline struct rmap_item *alloc_rmap_item(void)
+static inline struct ksm_rmap_item *alloc_rmap_item(void)
{
- struct rmap_item *rmap_item;
+ struct ksm_rmap_item *rmap_item;
rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
__GFP_NORETRY | __GFP_NOWARN);
@@ -384,14 +382,15 @@ static inline struct rmap_item *alloc_rmap_item(void)
return rmap_item;
}
-static inline void free_rmap_item(struct rmap_item *rmap_item)
+static inline void free_rmap_item(struct ksm_rmap_item *rmap_item)
{
ksm_rmap_items--;
+ rmap_item->mm->ksm_rmap_items--;
rmap_item->mm = NULL; /* debug safety */
kmem_cache_free(rmap_item_cache, rmap_item);
}
-static inline struct stable_node *alloc_stable_node(void)
+static inline struct ksm_stable_node *alloc_stable_node(void)
{
/*
* The allocation can take too long with GFP_KERNEL when memory is under
@@ -401,43 +400,13 @@ static inline struct stable_node *alloc_stable_node(void)
return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
}
-static inline void free_stable_node(struct stable_node *stable_node)
+static inline void free_stable_node(struct ksm_stable_node *stable_node)
{
VM_BUG_ON(stable_node->rmap_hlist_len &&
!is_stable_node_chain(stable_node));
kmem_cache_free(stable_node_cache, stable_node);
}
-static inline struct mm_slot *alloc_mm_slot(void)
-{
- if (!mm_slot_cache) /* initialization failed */
- return NULL;
- return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
-}
-
-static inline void free_mm_slot(struct mm_slot *mm_slot)
-{
- kmem_cache_free(mm_slot_cache, mm_slot);
-}
-
-static struct mm_slot *get_mm_slot(struct mm_struct *mm)
-{
- struct mm_slot *slot;
-
- hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
- if (slot->mm == mm)
- return slot;
-
- return NULL;
-}
-
-static void insert_to_mm_slots_hash(struct mm_struct *mm,
- struct mm_slot *mm_slot)
-{
- mm_slot->mm = mm;
- hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
-}
-
/*
* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
* page tables after it has passed through ksm_exit() - which, if necessary,
@@ -451,47 +420,74 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
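+/*
+ * Page-walk callback for break_ksm(): return 1 if the PTE at @addr maps
+ * (or holds a migration entry for) a KSM page, 0 otherwise.
+ */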
+static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
+ struct mm_walk *walk)
+{
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ pte_t *pte;
+ int ret;
+
+ if (pmd_leaf(*pmd) || !pmd_present(*pmd))
+ return 0;
+
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (pte_present(*pte)) {
+ page = vm_normal_page(walk->vma, addr, *pte);
+ } else if (!pte_none(*pte)) {
+ swp_entry_t entry = pte_to_swp_entry(*pte);
+
+ /*
+ * As KSM pages remain KSM pages until freed, no need to wait
+ * here for migration to end.
+ */
+ if (is_migration_entry(entry))
+ page = pfn_swap_entry_to_page(entry);
+ }
+ ret = page && PageKsm(page);
+ pte_unmap_unlock(pte, ptl);
+ return ret;
+}
+
+static const struct mm_walk_ops break_ksm_ops = {
+ .pmd_entry = break_ksm_pmd_entry,
+};
+
/*
- * We use break_ksm to break COW on a ksm page: it's a stripped down
- *
- * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1)
- * put_page(page);
+ * We use break_ksm to break COW on a ksm page by triggering unsharing,
+ * such that the ksm page will get replaced by an exclusive anonymous page.
*
- * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
+ * We take great care only to touch a ksm page, in a VM_MERGEABLE vma,
* in case the application has unmapped and remapped mm,addr meanwhile.
* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
* mmap of /dev/mem, where we would not want to touch it.
*
- * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
+ * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context
* of the process that owns 'vma'. We also do not want to enforce
* protection keys here anyway.
*/
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
- struct page *page;
vm_fault_t ret = 0;
do {
+ int ksm_page;
+
cond_resched();
- page = follow_page(vma, addr,
- FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
- if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
- break;
- if (PageKsm(page))
- ret = handle_mm_fault(vma, addr,
- FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
- NULL);
- else
- ret = VM_FAULT_WRITE;
- put_page(page);
- } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
+ ksm_page = walk_page_range_vma(vma, addr, addr + 1,
+ &break_ksm_ops, NULL);
+ if (WARN_ON_ONCE(ksm_page < 0))
+ return ksm_page;
+ if (!ksm_page)
+ return 0;
+ ret = handle_mm_fault(vma, addr,
+ FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
+ NULL);
+ } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
/*
- * We must loop because handle_mm_fault() may back out if there's
- * any difficulty e.g. if pte accessed bit gets updated concurrently.
- *
- * VM_FAULT_WRITE is what we have been hoping for: it indicates that
- * COW has been broken, even if the vma does not permit VM_WRITE;
- * but note that a concurrent fault might break PageKsm for us.
+ * We must loop until we no longer find a KSM page because
+ * handle_mm_fault() may back out if there's any difficulty e.g. if
+ * pte accessed bit gets updated concurrently.
*
* VM_FAULT_SIGBUS could occur if we race with truncation of the
* backing file, which also invalidates anonymous pages: that's
@@ -528,7 +524,7 @@ static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
return vma;
}
-static void break_cow(struct rmap_item *rmap_item)
+static void break_cow(struct ksm_rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
unsigned long addr = rmap_item->address;
@@ -547,7 +543,7 @@ static void break_cow(struct rmap_item *rmap_item)
mmap_read_unlock(mm);
}
-static struct page *get_mergeable_page(struct rmap_item *rmap_item)
+static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
unsigned long addr = rmap_item->address;
@@ -560,12 +556,15 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
goto out;
page = follow_page(vma, addr, FOLL_GET);
- if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
+ if (IS_ERR_OR_NULL(page))
goto out;
+ if (is_zone_device_page(page))
+ goto out_putpage;
if (PageAnon(page)) {
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
} else {
+out_putpage:
put_page(page);
out:
page = NULL;
@@ -585,10 +584,10 @@ static inline int get_kpfn_nid(unsigned long kpfn)
return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}
-static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
+static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup,
struct rb_root *root)
{
- struct stable_node *chain = alloc_stable_node();
+ struct ksm_stable_node *chain = alloc_stable_node();
VM_BUG_ON(is_stable_node_chain(dup));
if (likely(chain)) {
INIT_HLIST_HEAD(&chain->hlist);
@@ -618,7 +617,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
return chain;
}
-static inline void free_stable_node_chain(struct stable_node *chain,
+static inline void free_stable_node_chain(struct ksm_stable_node *chain,
struct rb_root *root)
{
rb_erase(&chain->node, root);
@@ -626,9 +625,9 @@ static inline void free_stable_node_chain(struct stable_node *chain,
ksm_stable_node_chains--;
}
-static void remove_node_from_stable_tree(struct stable_node *stable_node)
+static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node)
{
- struct rmap_item *rmap_item;
+ struct ksm_rmap_item *rmap_item;
/* check it's not STABLE_NODE_CHAIN or negative */
BUG_ON(stable_node->rmap_hlist_len < 0);
@@ -690,7 +689,7 @@ enum get_ksm_page_flags {
* a page to put something that might look like our key in page->mapping.
* is on its way to being freed; but it is an anomaly to bear in mind.
*/
-static struct page *get_ksm_page(struct stable_node *stable_node,
+static struct page *get_ksm_page(struct ksm_stable_node *stable_node,
enum get_ksm_page_flags flags)
{
struct page *page;
@@ -769,10 +768,10 @@ stale:
* Removing rmap_item from stable or unstable tree.
* This function will clean the information from the stable/unstable tree.
*/
-static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
+static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item)
{
if (rmap_item->address & STABLE_FLAG) {
- struct stable_node *stable_node;
+ struct ksm_stable_node *stable_node;
struct page *page;
stable_node = rmap_item->head;
@@ -819,10 +818,10 @@ out:
cond_resched(); /* we're called from many long loops */
}
-static void remove_trailing_rmap_items(struct rmap_item **rmap_list)
+static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
{
while (*rmap_list) {
- struct rmap_item *rmap_item = *rmap_list;
+ struct ksm_rmap_item *rmap_item = *rmap_list;
*rmap_list = rmap_item->rmap_list;
remove_rmap_item_from_tree(rmap_item);
free_rmap_item(rmap_item);
@@ -859,18 +858,18 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
return err;
}
-static inline struct stable_node *folio_stable_node(struct folio *folio)
+static inline struct ksm_stable_node *folio_stable_node(struct folio *folio)
{
return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
}
-static inline struct stable_node *page_stable_node(struct page *page)
+static inline struct ksm_stable_node *page_stable_node(struct page *page)
{
return folio_stable_node(page_folio(page));
}
static inline void set_page_stable_node(struct page *page,
- struct stable_node *stable_node)
+ struct ksm_stable_node *stable_node)
{
VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page);
page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
@@ -880,7 +879,7 @@ static inline void set_page_stable_node(struct page *page,
/*
* Only called through the sysfs control interface:
*/
-static int remove_stable_node(struct stable_node *stable_node)
+static int remove_stable_node(struct ksm_stable_node *stable_node)
{
struct page *page;
int err;
@@ -918,10 +917,10 @@ static int remove_stable_node(struct stable_node *stable_node)
return err;
}
-static int remove_stable_node_chain(struct stable_node *stable_node,
+static int remove_stable_node_chain(struct ksm_stable_node *stable_node,
struct rb_root *root)
{
- struct stable_node *dup;
+ struct ksm_stable_node *dup;
struct hlist_node *hlist_safe;
if (!is_stable_node_chain(stable_node)) {
@@ -945,14 +944,14 @@ static int remove_stable_node_chain(struct stable_node *stable_node,
static int remove_all_stable_nodes(void)
{
- struct stable_node *stable_node, *next;
+ struct ksm_stable_node *stable_node, *next;
int nid;
int err = 0;
for (nid = 0; nid < ksm_nr_node_ids; nid++) {
while (root_stable_tree[nid].rb_node) {
stable_node = rb_entry(root_stable_tree[nid].rb_node,
- struct stable_node, node);
+ struct ksm_stable_node, node);
if (remove_stable_node_chain(stable_node,
root_stable_tree + nid)) {
err = -EBUSY;
@@ -971,21 +970,25 @@ static int remove_all_stable_nodes(void)
static int unmerge_and_remove_all_rmap_items(void)
{
- struct mm_slot *mm_slot;
+ struct ksm_mm_slot *mm_slot;
+ struct mm_slot *slot;
struct mm_struct *mm;
struct vm_area_struct *vma;
int err = 0;
spin_lock(&ksm_mmlist_lock);
- ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
- struct mm_slot, mm_list);
+ slot = list_entry(ksm_mm_head.slot.mm_node.next,
+ struct mm_slot, mm_node);
+ ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
spin_unlock(&ksm_mmlist_lock);
- for (mm_slot = ksm_scan.mm_slot;
- mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
- mm = mm_slot->mm;
+ for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head;
+ mm_slot = ksm_scan.mm_slot) {
+ VMA_ITERATOR(vmi, mm_slot->slot.mm, 0);
+
+ mm = mm_slot->slot.mm;
mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (ksm_test_exit(mm))
break;
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
@@ -1000,14 +1003,15 @@ static int unmerge_and_remove_all_rmap_items(void)
mmap_read_unlock(mm);
spin_lock(&ksm_mmlist_lock);
- ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
- struct mm_slot, mm_list);
+ slot = list_entry(mm_slot->slot.mm_node.next,
+ struct mm_slot, mm_node);
+ ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
if (ksm_test_exit(mm)) {
- hash_del(&mm_slot->link);
- list_del(&mm_slot->mm_list);
+ hash_del(&mm_slot->slot.hash);
+ list_del(&mm_slot->slot.mm_node);
spin_unlock(&ksm_mmlist_lock);
- free_mm_slot(mm_slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
mmdrop(mm);
} else
@@ -1065,7 +1069,6 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
anon_exclusive = PageAnonExclusive(page);
if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
- (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
anon_exclusive || mm_tlb_flush_pending(mm)) {
pte_t entry;
@@ -1095,6 +1098,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
goto out_unlock;
}
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive && page_try_share_anon_rmap(page)) {
set_pte_at(mm, pvmw.address, pvmw.pte, entry);
goto out_unlock;
@@ -1102,11 +1106,11 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
if (pte_dirty(entry))
set_page_dirty(page);
+ entry = pte_mkclean(entry);
+
+ if (pte_write(entry))
+ entry = pte_wrprotect(entry);
- if (pte_protnone(entry))
- entry = pte_mkclean(pte_clear_savedwrite(entry));
- else
- entry = pte_mkclean(pte_wrprotect(entry));
set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
}
*orig_pte = *pvmw.pte;
@@ -1133,7 +1137,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
struct page *kpage, pte_t orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
+ struct folio *folio;
pmd_t *pmd;
+ pmd_t pmde;
pte_t *ptep;
pte_t newpte;
spinlock_t *ptl;
@@ -1148,6 +1154,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
pmd = mm_find_pmd(mm, addr);
if (!pmd)
goto out;
+ /*
+ * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
+ * without holding anon_vma lock for write. So when looking for a
+ * genuine pmde (in which to find pte), test present and !THP together.
+ */
+ pmde = *pmd;
+ barrier();
+ if (!pmd_present(pmde) || pmd_trans_huge(pmde))
+ goto out;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
addr + PAGE_SIZE);
@@ -1191,10 +1206,11 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
ptep_clear_flush(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, newpte);
+ folio = page_folio(page);
page_remove_rmap(page, vma, false);
- if (!page_mapped(page))
- try_to_free_swap(page);
- put_page(page);
+ if (!folio_mapped(folio))
+ folio_free_swap(folio);
+ folio_put(folio);
pte_unmap_unlock(ptep, ptl);
err = 0;
@@ -1278,7 +1294,7 @@ out:
*
* This function returns 0 if the pages were merged, -EFAULT otherwise.
*/
-static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
+static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item,
struct page *page, struct page *kpage)
{
struct mm_struct *mm = rmap_item->mm;
@@ -1315,9 +1331,9 @@ out:
* Note that this function upgrades page to ksm page: if one of the pages
* is already a ksm page, try_to_merge_with_ksm_page should be used.
*/
-static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
+static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item,
struct page *page,
- struct rmap_item *tree_rmap_item,
+ struct ksm_rmap_item *tree_rmap_item,
struct page *tree_page)
{
int err;
@@ -1337,7 +1353,7 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
}
static __always_inline
-bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
+bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset)
{
VM_BUG_ON(stable_node->rmap_hlist_len < 0);
/*
@@ -1351,17 +1367,17 @@ bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
}
static __always_inline
-bool is_page_sharing_candidate(struct stable_node *stable_node)
+bool is_page_sharing_candidate(struct ksm_stable_node *stable_node)
{
return __is_page_sharing_candidate(stable_node, 0);
}
-static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
- struct stable_node **_stable_node,
+static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup,
+ struct ksm_stable_node **_stable_node,
struct rb_root *root,
bool prune_stale_stable_nodes)
{
- struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
+ struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node;
struct hlist_node *hlist_safe;
struct page *_tree_page, *tree_page = NULL;
int nr = 0;
@@ -1475,7 +1491,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
return tree_page;
}
-static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
+static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node,
struct rb_root *root)
{
if (!is_stable_node_chain(stable_node))
@@ -1502,12 +1518,12 @@ static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
* function and will be overwritten in all cases, the caller doesn't
* need to initialize it.
*/
-static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
- struct stable_node **_stable_node,
+static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_dup,
+ struct ksm_stable_node **_stable_node,
struct rb_root *root,
bool prune_stale_stable_nodes)
{
- struct stable_node *stable_node = *_stable_node;
+ struct ksm_stable_node *stable_node = *_stable_node;
if (!is_stable_node_chain(stable_node)) {
if (is_page_sharing_candidate(stable_node)) {
*_stable_node_dup = stable_node;
@@ -1524,18 +1540,18 @@ static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
prune_stale_stable_nodes);
}
-static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
- struct stable_node **s_n,
+static __always_inline struct page *chain_prune(struct ksm_stable_node **s_n_d,
+ struct ksm_stable_node **s_n,
struct rb_root *root)
{
return __stable_node_chain(s_n_d, s_n, root, true);
}
-static __always_inline struct page *chain(struct stable_node **s_n_d,
- struct stable_node *s_n,
+static __always_inline struct page *chain(struct ksm_stable_node **s_n_d,
+ struct ksm_stable_node *s_n,
struct rb_root *root)
{
- struct stable_node *old_stable_node = s_n;
+ struct ksm_stable_node *old_stable_node = s_n;
struct page *tree_page;
tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
@@ -1559,8 +1575,8 @@ static struct page *stable_tree_search(struct page *page)
struct rb_root *root;
struct rb_node **new;
struct rb_node *parent;
- struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
- struct stable_node *page_node;
+ struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
+ struct ksm_stable_node *page_node;
page_node = page_stable_node(page);
if (page_node && page_node->head != &migrate_nodes) {
@@ -1580,7 +1596,7 @@ again:
int ret;
cond_resched();
- stable_node = rb_entry(*new, struct stable_node, node);
+ stable_node = rb_entry(*new, struct ksm_stable_node, node);
stable_node_any = NULL;
tree_page = chain_prune(&stable_node_dup, &stable_node, root);
/*
@@ -1803,14 +1819,14 @@ chain_append:
* This function returns the stable tree node just allocated on success,
* NULL otherwise.
*/
-static struct stable_node *stable_tree_insert(struct page *kpage)
+static struct ksm_stable_node *stable_tree_insert(struct page *kpage)
{
int nid;
unsigned long kpfn;
struct rb_root *root;
struct rb_node **new;
struct rb_node *parent;
- struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
+ struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
bool need_chain = false;
kpfn = page_to_pfn(kpage);
@@ -1825,7 +1841,7 @@ again:
int ret;
cond_resched();
- stable_node = rb_entry(*new, struct stable_node, node);
+ stable_node = rb_entry(*new, struct ksm_stable_node, node);
stable_node_any = NULL;
tree_page = chain(&stable_node_dup, stable_node, root);
if (!stable_node_dup) {
@@ -1894,7 +1910,7 @@ again:
rb_insert_color(&stable_node_dup->node, root);
} else {
if (!is_stable_node_chain(stable_node)) {
- struct stable_node *orig = stable_node;
+ struct ksm_stable_node *orig = stable_node;
/* chain is missing so create it */
stable_node = alloc_stable_node_chain(orig, root);
if (!stable_node) {
@@ -1923,7 +1939,7 @@ again:
* the same walking algorithm in an rbtree.
*/
static
-struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
+struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item,
struct page *page,
struct page **tree_pagep)
{
@@ -1937,12 +1953,12 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
new = &root->rb_node;
while (*new) {
- struct rmap_item *tree_rmap_item;
+ struct ksm_rmap_item *tree_rmap_item;
struct page *tree_page;
int ret;
cond_resched();
- tree_rmap_item = rb_entry(*new, struct rmap_item, node);
+ tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node);
tree_page = get_mergeable_page(tree_rmap_item);
if (!tree_page)
return NULL;
@@ -1994,8 +2010,8 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
* rmap_items hanging off a given node of the stable tree, all sharing
* the same ksm page.
*/
-static void stable_tree_append(struct rmap_item *rmap_item,
- struct stable_node *stable_node,
+static void stable_tree_append(struct ksm_rmap_item *rmap_item,
+ struct ksm_stable_node *stable_node,
bool max_page_sharing_bypass)
{
/*
@@ -2037,12 +2053,12 @@ static void stable_tree_append(struct rmap_item *rmap_item,
* @page: the page that we are searching identical page to.
* @rmap_item: the reverse mapping into the virtual address of this page
*/
-static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
+static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
- struct rmap_item *tree_rmap_item;
+ struct ksm_rmap_item *tree_rmap_item;
struct page *tree_page = NULL;
- struct stable_node *stable_node;
+ struct ksm_stable_node *stable_node;
struct page *kpage;
unsigned int checksum;
int err;
@@ -2198,11 +2214,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
}
}
-static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
- struct rmap_item **rmap_list,
+static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot,
+ struct ksm_rmap_item **rmap_list,
unsigned long addr)
{
- struct rmap_item *rmap_item;
+ struct ksm_rmap_item *rmap_item;
while (*rmap_list) {
rmap_item = *rmap_list;
@@ -2218,7 +2234,8 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
rmap_item = alloc_rmap_item();
if (rmap_item) {
/* It has already been zeroed */
- rmap_item->mm = mm_slot->mm;
+ rmap_item->mm = mm_slot->slot.mm;
+ rmap_item->mm->ksm_rmap_items++;
rmap_item->address = addr;
rmap_item->rmap_list = *rmap_list;
*rmap_list = rmap_item;
@@ -2226,19 +2243,21 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
return rmap_item;
}
-static struct rmap_item *scan_get_next_rmap_item(struct page **page)
+static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
+ struct ksm_mm_slot *mm_slot;
struct mm_slot *slot;
struct vm_area_struct *vma;
- struct rmap_item *rmap_item;
+ struct ksm_rmap_item *rmap_item;
+ struct vma_iterator vmi;
int nid;
- if (list_empty(&ksm_mm_head.mm_list))
+ if (list_empty(&ksm_mm_head.slot.mm_node))
return NULL;
- slot = ksm_scan.mm_slot;
- if (slot == &ksm_mm_head) {
+ mm_slot = ksm_scan.mm_slot;
+ if (mm_slot == &ksm_mm_head) {
/*
* A number of pages can hang around indefinitely on per-cpu
* pagevecs, raised page count preventing write_protect_page
@@ -2258,7 +2277,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
* so prune them once before each full scan.
*/
if (!ksm_merge_across_nodes) {
- struct stable_node *stable_node, *next;
+ struct ksm_stable_node *stable_node, *next;
struct page *page;
list_for_each_entry_safe(stable_node, next,
@@ -2275,28 +2294,31 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
root_unstable_tree[nid] = RB_ROOT;
spin_lock(&ksm_mmlist_lock);
- slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
- ksm_scan.mm_slot = slot;
+ slot = list_entry(mm_slot->slot.mm_node.next,
+ struct mm_slot, mm_node);
+ mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
+ ksm_scan.mm_slot = mm_slot;
spin_unlock(&ksm_mmlist_lock);
/*
* Although we tested list_empty() above, a racing __ksm_exit
* of the last mm on the list may have removed it since then.
*/
- if (slot == &ksm_mm_head)
+ if (mm_slot == &ksm_mm_head)
return NULL;
next_mm:
ksm_scan.address = 0;
- ksm_scan.rmap_list = &slot->rmap_list;
+ ksm_scan.rmap_list = &mm_slot->rmap_list;
}
+ slot = &mm_slot->slot;
mm = slot->mm;
+ vma_iter_init(&vmi, mm, ksm_scan.address);
+
mmap_read_lock(mm);
if (ksm_test_exit(mm))
- vma = NULL;
- else
- vma = find_vma(mm, ksm_scan.address);
+ goto no_vmas;
- for (; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!(vma->vm_flags & VM_MERGEABLE))
continue;
if (ksm_scan.address < vma->vm_start)
@@ -2308,15 +2330,17 @@ next_mm:
if (ksm_test_exit(mm))
break;
*page = follow_page(vma, ksm_scan.address, FOLL_GET);
- if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) {
+ if (IS_ERR_OR_NULL(*page)) {
ksm_scan.address += PAGE_SIZE;
cond_resched();
continue;
}
+ if (is_zone_device_page(*page))
+ goto next_page;
if (PageAnon(*page)) {
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
- rmap_item = get_next_rmap_item(slot,
+ rmap_item = get_next_rmap_item(mm_slot,
ksm_scan.rmap_list, ksm_scan.address);
if (rmap_item) {
ksm_scan.rmap_list =
@@ -2327,6 +2351,7 @@ next_mm:
mmap_read_unlock(mm);
return rmap_item;
}
+next_page:
put_page(*page);
ksm_scan.address += PAGE_SIZE;
cond_resched();
@@ -2334,8 +2359,9 @@ next_mm:
}
if (ksm_test_exit(mm)) {
+no_vmas:
ksm_scan.address = 0;
- ksm_scan.rmap_list = &slot->rmap_list;
+ ksm_scan.rmap_list = &mm_slot->rmap_list;
}
/*
* Nuke all the rmap_items that are above this current rmap:
@@ -2344,8 +2370,9 @@ next_mm:
remove_trailing_rmap_items(ksm_scan.rmap_list);
spin_lock(&ksm_mmlist_lock);
- ksm_scan.mm_slot = list_entry(slot->mm_list.next,
- struct mm_slot, mm_list);
+ slot = list_entry(mm_slot->slot.mm_node.next,
+ struct mm_slot, mm_node);
+ ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
if (ksm_scan.address == 0) {
/*
* We've completed a full scan of all vmas, holding mmap_lock
@@ -2356,11 +2383,11 @@ next_mm:
* or when all VM_MERGEABLE areas have been unmapped (and
* mmap_lock then protects against race with MADV_MERGEABLE).
*/
- hash_del(&slot->link);
- list_del(&slot->mm_list);
+ hash_del(&mm_slot->slot.hash);
+ list_del(&mm_slot->slot.mm_node);
spin_unlock(&ksm_mmlist_lock);
- free_mm_slot(slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
mmap_read_unlock(mm);
mmdrop(mm);
@@ -2377,8 +2404,8 @@ next_mm:
}
/* Repeat until we've completed scanning the whole list */
- slot = ksm_scan.mm_slot;
- if (slot != &ksm_mm_head)
+ mm_slot = ksm_scan.mm_slot;
+ if (mm_slot != &ksm_mm_head)
goto next_mm;
ksm_scan.seqnr++;
@@ -2391,7 +2418,7 @@ next_mm:
*/
static void ksm_do_scan(unsigned int scan_npages)
{
- struct rmap_item *rmap_item;
+ struct ksm_rmap_item *rmap_item;
struct page *page;
while (scan_npages-- && likely(!freezing(current))) {
@@ -2406,7 +2433,7 @@ static void ksm_do_scan(unsigned int scan_npages)
static int ksmd_should_run(void)
{
- return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
+ return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node);
}
static int ksm_scan_thread(void *nothing)
@@ -2495,18 +2522,21 @@ EXPORT_SYMBOL_GPL(ksm_madvise);
int __ksm_enter(struct mm_struct *mm)
{
- struct mm_slot *mm_slot;
+ struct ksm_mm_slot *mm_slot;
+ struct mm_slot *slot;
int needs_wakeup;
- mm_slot = alloc_mm_slot();
+ mm_slot = mm_slot_alloc(mm_slot_cache);
if (!mm_slot)
return -ENOMEM;
+ slot = &mm_slot->slot;
+
/* Check ksm_run too? Would need tighter locking */
- needs_wakeup = list_empty(&ksm_mm_head.mm_list);
+ needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node);
spin_lock(&ksm_mmlist_lock);
- insert_to_mm_slots_hash(mm, mm_slot);
+ mm_slot_insert(mm_slots_hash, mm, slot);
/*
* When KSM_RUN_MERGE (or KSM_RUN_STOP),
* insert just behind the scanning cursor, to let the area settle
@@ -2518,9 +2548,9 @@ int __ksm_enter(struct mm_struct *mm)
* missed: then we might as well insert at the end of the list.
*/
if (ksm_run & KSM_RUN_UNMERGE)
- list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
+ list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node);
else
- list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
+ list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
spin_unlock(&ksm_mmlist_lock);
set_bit(MMF_VM_MERGEABLE, &mm->flags);
@@ -2534,7 +2564,8 @@ int __ksm_enter(struct mm_struct *mm)
void __ksm_exit(struct mm_struct *mm)
{
- struct mm_slot *mm_slot;
+ struct ksm_mm_slot *mm_slot;
+ struct mm_slot *slot;
int easy_to_free = 0;
/*
@@ -2547,21 +2578,22 @@ void __ksm_exit(struct mm_struct *mm)
*/
spin_lock(&ksm_mmlist_lock);
- mm_slot = get_mm_slot(mm);
+ slot = mm_slot_lookup(mm_slots_hash, mm);
+ mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
if (mm_slot && ksm_scan.mm_slot != mm_slot) {
if (!mm_slot->rmap_list) {
- hash_del(&mm_slot->link);
- list_del(&mm_slot->mm_list);
+ hash_del(&slot->hash);
+ list_del(&slot->mm_node);
easy_to_free = 1;
} else {
- list_move(&mm_slot->mm_list,
- &ksm_scan.mm_slot->mm_list);
+ list_move(&slot->mm_node,
+ &ksm_scan.mm_slot->slot.mm_node);
}
}
spin_unlock(&ksm_mmlist_lock);
if (easy_to_free) {
- free_mm_slot(mm_slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
mmdrop(mm);
} else if (mm_slot) {
@@ -2612,8 +2644,8 @@ struct page *ksm_might_need_to_copy(struct page *page,
void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
{
- struct stable_node *stable_node;
- struct rmap_item *rmap_item;
+ struct ksm_stable_node *stable_node;
+ struct ksm_rmap_item *rmap_item;
int search_new_forks = 0;
VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);
@@ -2683,7 +2715,7 @@ again:
#ifdef CONFIG_MIGRATION
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
{
- struct stable_node *stable_node;
+ struct ksm_stable_node *stable_node;
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
@@ -2716,7 +2748,7 @@ static void wait_while_offlining(void)
}
}
-static bool stable_node_dup_remove_range(struct stable_node *stable_node,
+static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node,
unsigned long start_pfn,
unsigned long end_pfn)
{
@@ -2732,12 +2764,12 @@ static bool stable_node_dup_remove_range(struct stable_node *stable_node,
return false;
}
-static bool stable_node_chain_remove_range(struct stable_node *stable_node,
+static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node,
unsigned long start_pfn,
unsigned long end_pfn,
struct rb_root *root)
{
- struct stable_node *dup;
+ struct ksm_stable_node *dup;
struct hlist_node *hlist_safe;
if (!is_stable_node_chain(stable_node)) {
@@ -2761,14 +2793,14 @@ static bool stable_node_chain_remove_range(struct stable_node *stable_node,
static void ksm_check_stable_tree(unsigned long start_pfn,
unsigned long end_pfn)
{
- struct stable_node *stable_node, *next;
+ struct ksm_stable_node *stable_node, *next;
struct rb_node *node;
int nid;
for (nid = 0; nid < ksm_nr_node_ids; nid++) {
node = rb_first(root_stable_tree + nid);
while (node) {
- stable_node = rb_entry(node, struct stable_node, node);
+ stable_node = rb_entry(node, struct ksm_stable_node, node);
if (stable_node_chain_remove_range(stable_node,
start_pfn, end_pfn,
root_stable_tree +
@@ -3206,7 +3238,7 @@ static int __init ksm_init(void)
#ifdef CONFIG_MEMORY_HOTREMOVE
/* There is no significance to this priority 100 */
- hotplug_memory_notifier(ksm_memory_callback, 100);
+ hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI);
#endif
return 0;
diff --git a/mm/maccess.c b/mm/maccess.c
index 5f4d240f67ec..074f6b086671 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -97,7 +97,7 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
return src - unsafe_addr;
Efault:
pagefault_enable();
- dst[-1] = '\0';
+ dst[0] = '\0';
return -EFAULT;
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 5f0f0948a50e..a56a6d17e201 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -59,6 +59,7 @@ static int madvise_need_mmap_write(int behavior)
case MADV_FREE:
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
+ case MADV_COLLAPSE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -94,9 +95,6 @@ struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
mmap_assert_locked(vma->vm_mm);
- if (vma->vm_file)
- return NULL;
-
return vma->anon_name;
}
@@ -182,7 +180,7 @@ success:
* vm_flags is protected by the mmap_lock held in write mode.
*/
vma->vm_flags = new_flags;
- if (!vma->vm_file) {
+ if (!vma->vm_file || vma_is_anon_shmem(vma)) {
error = replace_anon_vma_name(vma, anon_name);
if (error)
return error;
@@ -225,6 +223,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
put_page(page);
}
swap_read_unplug(splug);
+ cond_resched();
return 0;
}
@@ -320,6 +319,21 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}
+static inline bool can_do_file_pageout(struct vm_area_struct *vma)
+{
+ if (!vma->vm_file)
+ return false;
+ /*
+ * paging out pagecache only for non-anonymous mappings that correspond
+ * to the files the calling process could (if tried) open for writing;
+ * otherwise we'd be including shared non-exclusive mappings, which
+ * opens a side channel.
+ */
+ return inode_owner_or_capable(&init_user_ns,
+ file_inode(vma->vm_file)) ||
+ file_permission(vma->vm_file, MAY_WRITE) == 0;
+}
+
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -333,10 +347,14 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
spinlock_t *ptl;
struct page *page = NULL;
LIST_HEAD(page_list);
+ bool pageout_anon_only_filter;
if (fatal_signal_pending(current))
return -EINTR;
+ pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
+ !can_do_file_pageout(vma);
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pmd_trans_huge(*pmd)) {
pmd_t orig_pmd;
@@ -363,6 +381,9 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (page_mapcount(page) != 1)
goto huge_unlock;
+ if (pageout_anon_only_filter && !PageAnon(page))
+ goto huge_unlock;
+
if (next - addr != HPAGE_PMD_SIZE) {
int err;
@@ -431,6 +452,8 @@ regular_page:
if (PageTransCompound(page)) {
if (page_mapcount(page) != 1)
break;
+ if (pageout_anon_only_filter && !PageAnon(page))
+ break;
get_page(page);
if (!trylock_page(page)) {
put_page(page);
@@ -451,8 +474,14 @@ regular_page:
continue;
}
- /* Do not interfere with other mappings of this page */
- if (page_mapcount(page) != 1)
+ /*
+ * Do not interfere with other mappings of this page, and
+ * skip pages that are not on the LRU.
+ */
+ if (!PageLRU(page) || page_mapcount(page) != 1)
+ continue;
+
+ if (pageout_anon_only_filter && !PageAnon(page))
continue;
VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -549,23 +578,6 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb,
tlb_end_vma(tlb, vma);
}
-static inline bool can_do_pageout(struct vm_area_struct *vma)
-{
- if (vma_is_anonymous(vma))
- return true;
- if (!vma->vm_file)
- return false;
- /*
- * paging out pagecache only for non-anonymous mappings that correspond
- * to the files the calling process could (if tried) open for writing;
- * otherwise we'd be including shared non-exclusive mappings, which
- * opens a side channel.
- */
- return inode_owner_or_capable(&init_user_ns,
- file_inode(vma->vm_file)) ||
- file_permission(vma->vm_file, MAY_WRITE) == 0;
-}
-
static long madvise_pageout(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start_addr, unsigned long end_addr)
@@ -577,7 +589,14 @@ static long madvise_pageout(struct vm_area_struct *vma,
if (!can_madv_lru_vma(vma))
return -EINVAL;
- if (!can_do_pageout(vma))
+ /*
+ * If the VMA belongs to a private file mapping, it can hold private
+ * dirty (anon) pages that may be paged out even when this process is
+ * neither the owner of the backing file nor write-capable on it, so
+ * private file mappings are still allowed to page out their dirty
+ * anon pages.
+ */
+ if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
+ (vma->vm_flags & VM_MAYSHARE)))
return 0;
lru_add_drain();
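The hunk above changes MADV_PAGEOUT policy for private file mappings: CoW-dirtied anon pages may now be reclaimed even when the caller cannot write the backing file, while shared pagecache pages still need the ownership/permission check. A minimal userspace sketch of the resulting behaviour follows (hypothetical test program, not part of the patch; the file path, its non-empty content, and the MADV_PAGEOUT fallback define are illustrative assumptions):

        /* Hypothetical illustration, not part of the patch. */
        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/mman.h>
        #include <unistd.h>

        #ifndef MADV_PAGEOUT
        #define MADV_PAGEOUT 21         /* UAPI value since Linux 5.4 */
        #endif

        int main(void)
        {
                int fd = open("/etc/hostname", O_RDONLY);   /* a file we cannot write */
                size_t len = 4096;
                char *p;

                if (fd < 0)
                        return 1;
                p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
                if (p == MAP_FAILED)
                        return 1;

                p[0] = 'x';     /* CoW creates a private dirty (anon) page */

                /*
                 * With this patch the anon copy above is eligible for pageout
                 * even though the caller is neither owner of nor write-capable
                 * on the file; the shared pagecache pages are still skipped.
                 */
                if (madvise(p, len, MADV_PAGEOUT))
                        perror("madvise(MADV_PAGEOUT)");

                munmap(p, len);
                close(fd);
                return 0;
        }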
@@ -597,6 +616,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
pte_t *orig_pte, *pte, ptent;
+ struct folio *folio;
struct page *page;
int nr_swap = 0;
unsigned long next;
@@ -641,56 +661,56 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
page = vm_normal_page(vma, addr, ptent);
if (!page || is_zone_device_page(page))
continue;
+ folio = page_folio(page);
/*
- * If pmd isn't transhuge but the page is THP and
+ * If pmd isn't transhuge but the folio is large and
* is owned by only this process, split it and
* deactivate all pages.
*/
- if (PageTransCompound(page)) {
- if (page_mapcount(page) != 1)
+ if (folio_test_large(folio)) {
+ if (folio_mapcount(folio) != 1)
goto out;
- get_page(page);
- if (!trylock_page(page)) {
- put_page(page);
+ folio_get(folio);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
goto out;
}
pte_unmap_unlock(orig_pte, ptl);
- if (split_huge_page(page)) {
- unlock_page(page);
- put_page(page);
+ if (split_folio(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
goto out;
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte--;
addr -= PAGE_SIZE;
continue;
}
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
-
- if (PageSwapCache(page) || PageDirty(page)) {
- if (!trylock_page(page))
+ if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
+ if (!folio_trylock(folio))
continue;
/*
- * If page is shared with others, we couldn't clear
- * PG_dirty of the page.
+ * If folio is shared with others, we mustn't clear
+ * the folio's dirty flag.
*/
- if (page_mapcount(page) != 1) {
- unlock_page(page);
+ if (folio_mapcount(folio) != 1) {
+ folio_unlock(folio);
continue;
}
- if (PageSwapCache(page) && !try_to_free_swap(page)) {
- unlock_page(page);
+ if (folio_test_swapcache(folio) &&
+ !folio_free_swap(folio)) {
+ folio_unlock(folio);
continue;
}
- ClearPageDirty(page);
- unlock_page(page);
+ folio_clear_dirty(folio);
+ folio_unlock(folio);
}
if (pte_young(ptent) || pte_dirty(ptent)) {
@@ -708,7 +728,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
set_pte_at(mm, addr, pte, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
}
- mark_page_lazyfree(page);
+ mark_page_lazyfree(&folio->page);
}
out:
if (nr_swap) {
@@ -767,8 +787,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
* data it wants to keep. Be sure to free swap resources too. The
- * zap_page_range call sets things up for shrink_active_list to actually free
- * these pages later if no one else has touched them in the meantime,
+ * zap_page_range_single call sets things up for shrink_active_list to actually
+ * free these pages later if no one else has touched them in the meantime,
* although we could add these pages to a global reuse list for
* shrink_active_list to pick up before reclaiming other pages.
*
@@ -785,7 +805,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- zap_page_range(vma, start, end - start);
+ zap_page_range_single(vma, start, end - start, NULL);
return 0;
}
@@ -808,7 +828,14 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
if (start & ~huge_page_mask(hstate_vma(vma)))
return false;
- *end = ALIGN(*end, huge_page_size(hstate_vma(vma)));
+ /*
+ * Madvise callers expect the length to be rounded up to PAGE_SIZE
+ * boundaries, and may be unaware that this VMA uses huge pages.
+ * Avoid unexpected data loss by rounding down the number of
+ * huge pages freed.
+ */
+ *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));
+
return true;
}
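The rounding change above matters when a caller passes a length that is PAGE_SIZE-aligned but not hugepage-aligned: rounding the end up would discard a whole extra hugepage of data. A standalone sketch of the arithmetic (hypothetical, with the usual kernel-style ALIGN/ALIGN_DOWN macros redefined locally for a 2 MiB huge page size):

        /* Hypothetical illustration, not part of the patch. */
        #include <stdio.h>

        #define ALIGN(x, a)       (((x) + (a) - 1) & ~((a) - 1))
        #define ALIGN_DOWN(x, a)  ((x) & ~((a) - 1))

        int main(void)
        {
                unsigned long huge = 2UL << 20;                 /* 2 MiB huge pages */
                unsigned long end  = (4UL << 20) + 8192;        /* PAGE_SIZE-rounded by the caller */

                /* Old behaviour: rounding up would also discard the 4..6 MiB hugepage. */
                printf("ALIGN:      %#lx\n", ALIGN(end, huge));      /* 0x600000 */
                /* New behaviour: round down so only fully covered hugepages are freed. */
                printf("ALIGN_DOWN: %#lx\n", ALIGN_DOWN(end, huge)); /* 0x400000 */
                return 0;
        }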
@@ -823,6 +850,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
return -EINVAL;
+ if (start == end)
+ return 0;
+
if (!userfaultfd_remove(vma, start, end)) {
*prev = NULL; /* mmap_lock has been dropped, prev is stale */
@@ -1057,6 +1087,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
if (error)
goto out;
break;
+ case MADV_COLLAPSE:
+ return madvise_collapse(vma, prev, start, end);
}
anon_name = anon_vma_name(vma);
@@ -1150,6 +1182,7 @@ madvise_behavior_valid(int behavior)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
+ case MADV_COLLAPSE:
#endif
case MADV_DONTDUMP:
case MADV_DODUMP:
@@ -1166,13 +1199,13 @@ madvise_behavior_valid(int behavior)
}
}
-static bool
-process_madvise_behavior_valid(int behavior)
+static bool process_madvise_behavior_valid(int behavior)
{
switch (behavior) {
case MADV_COLD:
case MADV_PAGEOUT:
case MADV_WILLNEED:
+ case MADV_COLLAPSE:
return true;
default:
return false;
@@ -1238,7 +1271,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
if (start >= end)
break;
if (prev)
- vma = prev->vm_next;
+ vma = find_vma(mm, prev->vm_end);
else /* madvise_remove dropped mmap_lock */
vma = find_vma(mm, start);
}
@@ -1255,7 +1288,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
int error;
/* Only anonymous mappings can be named */
- if (vma->vm_file)
+ if (vma->vm_file && !vma_is_anon_shmem(vma))
return -EBADF;
error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
@@ -1339,6 +1372,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
* MADV_NOHUGEPAGE - mark the given range as not worth being backed by
* transparent huge pages so the existing pages will not be
* coalesced into THP and new pages will not be allocated as THP.
+ * MADV_COLLAPSE - synchronously coalesce pages into new THP.
* MADV_DONTDUMP - the application wants to prevent pages in the given range
* from being included in its core dump.
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
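MADV_COLLAPSE, wired up throughout this file, lets a task request synchronous collapse of a range into THPs instead of waiting for khugepaged. A hypothetical userspace usage sketch (the MADV_COLLAPSE fallback define, the 4 MiB anonymous region, and leaving alignment to the allocator are assumptions made for illustration only):

        /* Hypothetical illustration, not part of the patch. */
        #include <stdio.h>
        #include <string.h>
        #include <sys/mman.h>

        #ifndef MADV_COLLAPSE
        #define MADV_COLLAPSE 25        /* UAPI value added in this series */
        #endif

        int main(void)
        {
                size_t len = 4UL << 20;         /* two 2 MiB PMD ranges on x86-64 */
                char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

                if (p == MAP_FAILED)
                        return 1;

                memset(p, 0xaa, len);   /* fault the range in as small pages */

                /*
                 * Ask for the range to be backed by THPs now, synchronously,
                 * instead of waiting for the khugepaged background scan.  A
                 * real caller may want to align the mapping to the huge page
                 * size first.
                 */
                if (madvise(p, len, MADV_COLLAPSE))
                        perror("madvise(MADV_COLLAPSE)");

                munmap(p, len);
                return 0;
        }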
@@ -1440,7 +1474,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto out;
}
- ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
if (ret < 0)
goto out;
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index 1b0ab8fcfd8b..175e424b9ab1 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -126,7 +126,7 @@ static int clean_record_pte(pte_t *pte, unsigned long addr,
static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- pmd_t pmdval = pmd_read_atomic(pmd);
+ pmd_t pmdval = pmdp_get_lockless(pmd);
if (!pmd_trans_unstable(&pmdval))
return 0;
diff --git a/mm/memblock.c b/mm/memblock.c
index b5d3026979fc..511d4783dcf1 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2000,7 +2000,7 @@ static void __init free_unused_memmap(void)
* presume that there are no holes in the memory map inside
* a pageblock
*/
- start = round_down(start, pageblock_nr_pages);
+ start = pageblock_start_pfn(start);
/*
* If we had a previous bank, and there is a space
@@ -2014,12 +2014,12 @@ static void __init free_unused_memmap(void)
* presume that there are no holes in the memory map inside
* a pageblock
*/
- prev_end = ALIGN(end, pageblock_nr_pages);
+ prev_end = pageblock_align(end);
}
#ifdef CONFIG_SPARSEMEM
if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) {
- prev_end = ALIGN(end, pageblock_nr_pages);
+ prev_end = pageblock_align(end);
free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION));
}
#endif
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b69979c9ced5..ab457f0394ab 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -63,6 +63,7 @@
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
+#include <linux/parser.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -88,13 +89,6 @@ static bool cgroup_memory_nosocket __ro_after_init;
/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem __ro_after_init;
-/* Whether the swap controller is active */
-#ifdef CONFIG_MEMCG_SWAP
-static bool cgroup_memory_noswap __ro_after_init;
-#else
-#define cgroup_memory_noswap 1
-#endif
-
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
@@ -102,7 +96,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
- return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
}
#define THRESHOLDS_EVENTS_TARGET 128
@@ -597,25 +591,18 @@ static u64 flush_next_time;
*/
static void memcg_stats_lock(void)
{
-#ifdef CONFIG_PREEMPT_RT
- preempt_disable();
-#else
- VM_BUG_ON(!irqs_disabled());
-#endif
+ preempt_disable_nested();
+ VM_WARN_ON_IRQS_ENABLED();
}
static void __memcg_stats_lock(void)
{
-#ifdef CONFIG_PREEMPT_RT
- preempt_disable();
-#endif
+ preempt_disable_nested();
}
static void memcg_stats_unlock(void)
{
-#ifdef CONFIG_PREEMPT_RT
- preempt_enable();
-#endif
+ preempt_enable_nested();
}
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
@@ -669,6 +656,83 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}
+/* Subset of vm_event_item to report for memcg event stats */
+static const unsigned int memcg_vm_event_stat[] = {
+ PGPGIN,
+ PGPGOUT,
+ PGSCAN_KSWAPD,
+ PGSCAN_DIRECT,
+ PGSCAN_KHUGEPAGED,
+ PGSTEAL_KSWAPD,
+ PGSTEAL_DIRECT,
+ PGSTEAL_KHUGEPAGED,
+ PGFAULT,
+ PGMAJFAULT,
+ PGREFILL,
+ PGACTIVATE,
+ PGDEACTIVATE,
+ PGLAZYFREE,
+ PGLAZYFREED,
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ ZSWPIN,
+ ZSWPOUT,
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ THP_FAULT_ALLOC,
+ THP_COLLAPSE_ALLOC,
+#endif
+};
+
+#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
+static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
+
+static void init_memcg_events(void)
+{
+ int i;
+
+ for (i = 0; i < NR_MEMCG_EVENTS; ++i)
+ mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
+}
+
+static inline int memcg_events_index(enum vm_event_item idx)
+{
+ return mem_cgroup_events_index[idx] - 1;
+}
+
+struct memcg_vmstats_percpu {
+ /* Local (CPU and cgroup) page state & events */
+ long state[MEMCG_NR_STAT];
+ unsigned long events[NR_MEMCG_EVENTS];
+
+ /* Delta calculation for lockless upward propagation */
+ long state_prev[MEMCG_NR_STAT];
+ unsigned long events_prev[NR_MEMCG_EVENTS];
+
+ /* Cgroup1: threshold notifications & softlimit tree updates */
+ unsigned long nr_page_events;
+ unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+struct memcg_vmstats {
+ /* Aggregated (CPU and subtree) page state & events */
+ long state[MEMCG_NR_STAT];
+ unsigned long events[NR_MEMCG_EVENTS];
+
+ /* Pending child counts during tree propagation */
+ long state_pending[MEMCG_NR_STAT];
+ unsigned long events_pending[NR_MEMCG_EVENTS];
+};
+
+unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+ long x = READ_ONCE(memcg->vmstats->state[idx]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
+
/**
* __mod_memcg_state - update cgroup memory statistics
* @memcg: the memory cgroup
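The init_memcg_events()/memcg_events_index() pair above maps the sparse vm_event_item space onto a dense per-memcg array, storing index + 1 so that zero-initialised entries read back as -1 ("not tracked"). A tiny standalone model of that trick (hypothetical names, not kernel code):

        /* Hypothetical, simplified model of the index mapping above. */
        #include <stdio.h>

        enum vm_event { PGPGIN, PGPGOUT, PGFAULT, PGMAJFAULT, NR_VM_EVENT_ITEMS };

        static const unsigned int tracked[] = { PGPGIN, PGFAULT };     /* sparse subset */
        #define NR_TRACKED (sizeof(tracked) / sizeof(tracked[0]))

        /* 0 means "not tracked"; tracked events store their dense index + 1. */
        static int event_index[NR_VM_EVENT_ITEMS];

        static void init_events(void)
        {
                for (unsigned int i = 0; i < NR_TRACKED; i++)
                        event_index[tracked[i]] = i + 1;
        }

        static int events_index(enum vm_event idx)
        {
                return event_index[idx] - 1;    /* -1 for untracked events */
        }

        int main(void)
        {
                init_events();
                printf("PGFAULT -> %d\n", events_index(PGFAULT));   /* 1 */
                printf("PGPGOUT -> %d\n", events_index(PGPGOUT));   /* -1, skipped */
                return 0;
        }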
@@ -715,7 +779,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
* interrupt context while other caller need to have disabled interrupt.
*/
__memcg_stats_lock();
- if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ if (IS_ENABLED(CONFIG_DEBUG_VM)) {
switch (idx) {
case NR_ANON_MAPPED:
case NR_FILE_MAPPED:
@@ -725,7 +789,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
WARN_ON_ONCE(!in_task());
break;
default:
- WARN_ON_ONCE(!irqs_disabled());
+ VM_WARN_ON_IRQS_ENABLED();
}
}
@@ -816,27 +880,37 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
unsigned long count)
{
- if (mem_cgroup_disabled())
+ int index = memcg_events_index(idx);
+
+ if (mem_cgroup_disabled() || index < 0)
return;
memcg_stats_lock();
- __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
+ __this_cpu_add(memcg->vmstats_percpu->events[index], count);
memcg_rstat_updated(memcg, count);
memcg_stats_unlock();
}
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
- return READ_ONCE(memcg->vmstats.events[event]);
+ int index = memcg_events_index(event);
+
+ if (index < 0)
+ return 0;
+ return READ_ONCE(memcg->vmstats->events[index]);
}
static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
long x = 0;
int cpu;
+ int index = memcg_events_index(event);
+
+ if (index < 0)
+ return 0;
for_each_possible_cpu(cpu)
- x += per_cpu(memcg->vmstats_percpu->events[event], cpu);
+ x += per_cpu(memcg->vmstats_percpu->events[index], cpu);
return x;
}
@@ -1143,12 +1217,12 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
} while ((memcg = parent_mem_cgroup(memcg)));
/*
- * When cgruop1 non-hierarchy mode is used,
+ * When cgroup1 non-hierarchy mode is used,
* parent_mem_cgroup() does not walk all the way up to the
* cgroup root (root_mem_cgroup). So we have to handle
* dead_memcg from cgroup root separately.
*/
- if (last != root_mem_cgroup)
+ if (!mem_cgroup_is_root(last))
__invalidate_reclaim_iterators(root_mem_cgroup,
dead_memcg);
}
@@ -1172,7 +1246,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
struct mem_cgroup *iter;
int ret = 0;
- BUG_ON(memcg == root_mem_cgroup);
+ BUG_ON(mem_cgroup_is_root(memcg));
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1201,7 +1275,7 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
memcg = folio_memcg(folio);
if (!memcg)
- VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio);
+ VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
else
VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
}
@@ -1401,6 +1475,7 @@ static const struct memory_stat memory_stats[] = {
{ "kernel", MEMCG_KMEM },
{ "kernel_stack", NR_KERNEL_STACK_KB },
{ "pagetables", NR_PAGETABLE },
+ { "sec_pagetables", NR_SECONDARY_PAGETABLE },
{ "percpu", MEMCG_PERCPU_B },
{ "sock", MEMCG_SOCK },
{ "vmalloc", MEMCG_VMALLOC },
@@ -1467,29 +1542,6 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
}
-/* Subset of vm_event_item to report for memcg event stats */
-static const unsigned int memcg_vm_event_stat[] = {
- PGSCAN_KSWAPD,
- PGSCAN_DIRECT,
- PGSTEAL_KSWAPD,
- PGSTEAL_DIRECT,
- PGFAULT,
- PGMAJFAULT,
- PGREFILL,
- PGACTIVATE,
- PGDEACTIVATE,
- PGLAZYFREE,
- PGLAZYFREED,
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
- ZSWPIN,
- ZSWPOUT,
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- THP_FAULT_ALLOC,
- THP_COLLAPSE_ALLOC,
-#endif
-};
-
static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
{
struct seq_buf s;
@@ -1525,15 +1577,22 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
/* Accumulated memory events */
seq_buf_printf(&s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
- memcg_events(memcg, PGSCAN_DIRECT));
+ memcg_events(memcg, PGSCAN_DIRECT) +
+ memcg_events(memcg, PGSCAN_KHUGEPAGED));
seq_buf_printf(&s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
- memcg_events(memcg, PGSTEAL_DIRECT));
+ memcg_events(memcg, PGSTEAL_DIRECT) +
+ memcg_events(memcg, PGSTEAL_KHUGEPAGED));
+
+ for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
+ if (memcg_vm_event_stat[i] == PGPGIN ||
+ memcg_vm_event_stat[i] == PGPGOUT)
+ continue;
- for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++)
seq_buf_printf(&s, "%s %lu\n",
vm_event_name(memcg_vm_event_stat[i]),
memcg_events(memcg, memcg_vm_event_stat[i]));
+ }
/* The above should easily fit into one page */
WARN_ON_ONCE(seq_buf_has_overflowed(&s));
@@ -1607,17 +1666,17 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
unsigned long max = READ_ONCE(memcg->memory.max);
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
- if (mem_cgroup_swappiness(memcg))
- max += min(READ_ONCE(memcg->swap.max),
- (unsigned long)total_swap_pages);
- } else { /* v1 */
+ if (do_memsw_account()) {
if (mem_cgroup_swappiness(memcg)) {
/* Calculate swap excess capacity from memsw limit */
unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
max += min(swap, (unsigned long)total_swap_pages);
}
+ } else {
+ if (mem_cgroup_swappiness(memcg))
+ max += min(READ_ONCE(memcg->swap.max),
+ (unsigned long)total_swap_pages);
}
return max;
}
@@ -1982,7 +2041,7 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
rcu_read_lock();
memcg = mem_cgroup_from_task(victim);
- if (memcg == root_mem_cgroup)
+ if (mem_cgroup_is_root(memcg))
goto out;
/*
@@ -2334,7 +2393,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
gfp_mask,
- MEMCG_RECLAIM_MAY_SWAP);
+ MEMCG_RECLAIM_MAY_SWAP,
+ NULL);
psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
@@ -2625,7 +2685,8 @@ retry:
psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
- gfp_mask, reclaim_options);
+ gfp_mask, reclaim_options,
+ NULL);
psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -2789,6 +2850,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*/
folio->memcg_data = (unsigned long)memcg;
}
@@ -2940,7 +3002,7 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
{
struct obj_cgroup *objcg = NULL;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
objcg = rcu_dereference(memcg->objcg);
if (objcg && obj_cgroup_tryget(objcg))
break;
@@ -2971,7 +3033,7 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
{
struct obj_cgroup *objcg;
- if (!memcg_kmem_enabled() || memcg_kmem_bypass())
+ if (!memcg_kmem_enabled())
return NULL;
if (PageMemcgKmem(page)) {
@@ -3362,7 +3424,7 @@ void split_page_memcg(struct page *head, unsigned int nr)
css_get_many(&memcg->css, nr - 1);
}
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_SWAP
/**
* mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
* @entry: swap entry to be moved
@@ -3444,7 +3506,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
}
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
+ memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP,
+ NULL)) {
ret = -EBUSY;
break;
}
@@ -3555,7 +3618,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return -EINTR;
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- MEMCG_RECLAIM_MAY_SWAP))
+ MEMCG_RECLAIM_MAY_SWAP,
+ NULL))
nr_retries--;
}
@@ -3975,6 +4039,8 @@ static const unsigned int memcg1_stats[] = {
NR_FILE_MAPPED,
NR_FILE_DIRTY,
NR_WRITEBACK,
+ WORKINGSET_REFAULT_ANON,
+ WORKINGSET_REFAULT_FILE,
MEMCG_SWAP,
};
@@ -3988,6 +4054,8 @@ static const char *const memcg1_stat_names[] = {
"mapped_file",
"dirty",
"writeback",
+ "workingset_refault_anon",
+ "workingset_refault_file",
"swap",
};
@@ -4016,7 +4084,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state_local(memcg, memcg1_stats[i]);
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
+ nr * memcg_page_state_unit(memcg1_stats[i]));
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
@@ -4047,7 +4116,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
continue;
nr = memcg_page_state(memcg, memcg1_stats[i]);
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
- (u64)nr * PAGE_SIZE);
+ (u64)nr * memcg_page_state_unit(memcg1_stats[i]));
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
@@ -4772,6 +4841,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
unsigned int efd, cfd;
struct fd efile;
struct fd cfile;
+ struct dentry *cdentry;
const char *name;
char *endp;
int ret;
@@ -4826,6 +4896,16 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
goto out_put_cfile;
/*
+ * The control file must be a regular cgroup1 file. As a regular cgroup
+ * file can't be renamed, it's safe to access its name afterwards.
+ */
+ cdentry = cfile.file->f_path.dentry;
+ if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
+ ret = -EINVAL;
+ goto out_put_cfile;
+ }
+
+ /*
* Determine the event callbacks and set them in @event. This used
* to be done via struct cftype but cgroup core no longer knows
* about these events. The following is crude but the whole thing
@@ -4833,7 +4913,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
*
* DO NOT ADD NEW FILES.
*/
- name = cfile.file->f_path.dentry->d_name.name;
+ name = cdentry->d_name.name;
if (!strcmp(name, "memory.usage_in_bytes")) {
event->register_event = mem_cgroup_usage_register_event;
@@ -4857,7 +4937,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
* automatically removed on cgroup destruction but the removal is
* asynchronous, so take an extra ref on @css.
*/
- cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
+ cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
&memory_cgrp_subsys);
ret = -EINVAL;
if (IS_ERR(cfile_css))
@@ -5110,8 +5190,8 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
struct mem_cgroup *memcg;
cgrp = cgroup_get_from_id(ino);
- if (!cgrp)
- return ERR_PTR(-ENOENT);
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
if (css)
@@ -5164,12 +5244,14 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
+ kfree(memcg->vmstats);
free_percpu(memcg->vmstats_percpu);
kfree(memcg);
}
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
+ lru_gen_exit_memcg(memcg);
memcg_wb_domain_exit(memcg);
__mem_cgroup_free(memcg);
}
@@ -5192,6 +5274,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
goto fail;
}
+ memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL);
+ if (!memcg->vmstats)
+ goto fail;
+
memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
GFP_KERNEL_ACCOUNT);
if (!memcg->vmstats_percpu)
@@ -5228,6 +5314,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memcg->deferred_split_queue.split_queue_len = 0;
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+ lru_gen_init_memcg(memcg);
return memcg;
fail:
mem_cgroup_id_remove(memcg);
@@ -5262,6 +5349,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_init(&memcg->kmem, &parent->kmem);
page_counter_init(&memcg->tcpmem, &parent->tcpmem);
} else {
+ init_memcg_events();
page_counter_init(&memcg->memory, NULL);
page_counter_init(&memcg->swap, NULL);
page_counter_init(&memcg->kmem, NULL);
@@ -5410,9 +5498,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
* below us. We're in a per-cpu loop here and this is
* a global counter, so the first cycle will get them.
*/
- delta = memcg->vmstats.state_pending[i];
+ delta = memcg->vmstats->state_pending[i];
if (delta)
- memcg->vmstats.state_pending[i] = 0;
+ memcg->vmstats->state_pending[i] = 0;
/* Add CPU changes on this level since the last flush */
v = READ_ONCE(statc->state[i]);
@@ -5425,15 +5513,15 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
continue;
/* Aggregate counts on this level and propagate upwards */
- memcg->vmstats.state[i] += delta;
+ memcg->vmstats->state[i] += delta;
if (parent)
- parent->vmstats.state_pending[i] += delta;
+ parent->vmstats->state_pending[i] += delta;
}
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
- delta = memcg->vmstats.events_pending[i];
+ for (i = 0; i < NR_MEMCG_EVENTS; i++) {
+ delta = memcg->vmstats->events_pending[i];
if (delta)
- memcg->vmstats.events_pending[i] = 0;
+ memcg->vmstats->events_pending[i] = 0;
v = READ_ONCE(statc->events[i]);
if (v != statc->events_prev[i]) {
@@ -5444,9 +5532,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (!delta)
continue;
- memcg->vmstats.events[i] += delta;
+ memcg->vmstats->events[i] += delta;
if (parent)
- parent->vmstats.events_pending[i] += delta;
+ parent->vmstats->events_pending[i] += delta;
}
for_each_node_state(nid, N_MEMORY) {
@@ -5561,7 +5649,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
return NULL;
/*
- * Because lookup_swap_cache() updates some statistics counter,
+ * Because swap_cache_get_folio() updates some statistics counter,
* we call find_get_page() with swapper_space directly.
*/
page = find_get_page(swap_address_space(ent), swp_offset(ent));
@@ -5580,15 +5668,21 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
{
+ unsigned long index;
+ struct folio *folio;
+
if (!vma->vm_file) /* anonymous vma */
return NULL;
if (!(mc.flags & MOVE_FILE))
return NULL;
- /* page is moved even if it's not RSS of this task(page-faulted). */
+ /* folio is moved even if it's not RSS of this task(page-faulted). */
/* shmem/tmpfs may report page out on swap: account for that too. */
- return find_get_incore_page(vma->vm_file->f_mapping,
- linear_page_index(vma, addr));
+ index = linear_page_index(vma, addr);
+ folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
+ if (!folio)
+ return NULL;
+ return folio_file_page(folio, index);
}
/**
@@ -5673,6 +5767,12 @@ static int mem_cgroup_move_account(struct page *page,
}
}
+#ifdef CONFIG_SWAP
+ if (folio_test_swapcache(folio)) {
+ __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages);
+ }
+#endif
if (folio_test_writeback(folio)) {
__mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
@@ -5871,7 +5971,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
unsigned long precharge;
mmap_read_lock(mm);
- walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
+ walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL);
mmap_read_unlock(mm);
precharge = mc.precharge;
@@ -6169,9 +6269,7 @@ retry:
* When we have consumed all precharges and failed in doing
* additional charge, the page walk just aborts.
*/
- walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
- NULL);
-
+ walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL);
mmap_read_unlock(mc.mm);
atomic_dec(&mc.from->moving_account);
}
@@ -6196,6 +6294,30 @@ static void mem_cgroup_move_task(void)
}
#endif
+#ifdef CONFIG_LRU_GEN
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+
+ /* find the first leader if there is any */
+ cgroup_taskset_for_each_leader(task, css, tset)
+ break;
+
+ if (!task)
+ return;
+
+ task_lock(task);
+ if (task->mm && READ_ONCE(task->mm->owner) == task)
+ lru_gen_migrate_mm(task->mm);
+ task_unlock(task);
+}
+#else
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
{
if (value == PAGE_COUNTER_MAX)
@@ -6307,7 +6429,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
}
reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
+ NULL);
if (!reclaimed && !nr_retries--)
break;
@@ -6356,7 +6479,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
+ NULL))
nr_reclaims--;
continue;
}
@@ -6479,21 +6603,54 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
return nbytes;
}
+enum {
+ MEMORY_RECLAIM_NODES = 0,
+ MEMORY_RECLAIM_NULL,
+};
+
+static const match_table_t if_tokens = {
+ { MEMORY_RECLAIM_NODES, "nodes=%s" },
+ { MEMORY_RECLAIM_NULL, NULL },
+};
+
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_retries = MAX_RECLAIM_RETRIES;
unsigned long nr_to_reclaim, nr_reclaimed = 0;
- unsigned int reclaim_options;
- int err;
+ unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+ MEMCG_RECLAIM_PROACTIVE;
+ char *old_buf, *start;
+ substring_t args[MAX_OPT_ARGS];
+ int token;
+ char value[256];
+ nodemask_t nodemask = NODE_MASK_ALL;
buf = strstrip(buf);
- err = page_counter_memparse(buf, "", &nr_to_reclaim);
- if (err)
- return err;
- reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
+ old_buf = buf;
+ nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
+ if (buf == old_buf)
+ return -EINVAL;
+
+ buf = strstrip(buf);
+
+ while ((start = strsep(&buf, " ")) != NULL) {
+ if (!strlen(start))
+ continue;
+ token = match_token(start, if_tokens, args);
+ match_strlcpy(value, args, sizeof(value));
+ switch (token) {
+ case MEMORY_RECLAIM_NODES:
+ if (nodelist_parse(value, nodemask) < 0)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
while (nr_reclaimed < nr_to_reclaim) {
unsigned long reclaimed;
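The parsing added above accepts an optional "nodes=" argument after the byte count written to memory.reclaim, restricting proactive reclaim to the given nodemask (defaulting to NODE_MASK_ALL). A hypothetical userspace sketch of the new interface (the cgroup path is an assumption; adjust it to a real memcg on the test system):

        /* Hypothetical illustration, not part of the patch. */
        #include <stdio.h>

        int main(void)
        {
                const char *path = "/sys/fs/cgroup/test/memory.reclaim";
                FILE *f = fopen(path, "w");

                if (!f) {
                        perror(path);
                        return 1;
                }

                /*
                 * Ask for 1 MiB of proactive reclaim, restricted to node 0.
                 * Without "nodes=" the nodemask defaults to all nodes.
                 */
                if (fprintf(f, "1M nodes=0") < 0)
                        perror("write");
                if (fclose(f))
                        perror(path);

                return 0;
        }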
@@ -6510,7 +6667,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
reclaimed = try_to_free_mem_cgroup_pages(memcg,
nr_to_reclaim - nr_reclaimed,
- GFP_KERNEL, reclaim_options);
+ GFP_KERNEL, reclaim_options,
+ &nodemask);
if (!reclaimed && !nr_retries--)
return -EAGAIN;
@@ -6601,6 +6759,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
.css_reset = mem_cgroup_css_reset,
.css_rstat_flush = mem_cgroup_css_rstat_flush,
.can_attach = mem_cgroup_can_attach,
+ .attach = mem_cgroup_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.post_attach = mem_cgroup_move_task,
.dfl_cftypes = memory_files,
@@ -6813,21 +6972,20 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
}
/**
- * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin
- * @page: page to charge
+ * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
+ * @folio: folio to charge.
* @mm: mm context of the victim
* @gfp: reclaim mode
- * @entry: swap entry for which the page is allocated
+ * @entry: swap entry for which the folio is allocated
*
- * This function charges a page allocated for swapin. Please call this before
- * adding the page to the swapcache.
+ * This function charges a folio allocated for swapin. Please call this before
+ * adding the folio to the swapcache.
*
* Returns 0 on success. Otherwise, an error code is returned.
*/
-int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
+int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
gfp_t gfp, swp_entry_t entry)
{
- struct folio *folio = page_folio(page);
struct mem_cgroup *memcg;
unsigned short id;
int ret;
@@ -7073,7 +7231,7 @@ void mem_cgroup_sk_alloc(struct sock *sk)
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
- if (memcg == root_mem_cgroup)
+ if (mem_cgroup_is_root(memcg))
goto out;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
goto out;
@@ -7200,7 +7358,7 @@ static int __init mem_cgroup_init(void)
}
subsys_initcall(mem_cgroup_init);
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_SWAP
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
while (!refcount_inc_not_zero(&memcg->id.ref)) {
@@ -7208,7 +7366,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
* The root cgroup cannot be destroyed, so it's refcount must
* always be >= 1.
*/
- if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
+ if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
VM_BUG_ON(1);
break;
}
@@ -7238,7 +7396,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
if (mem_cgroup_disabled())
return;
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (!do_memsw_account())
return;
memcg = folio_memcg(folio);
@@ -7267,7 +7425,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, nr_entries);
- if (!cgroup_memory_noswap && memcg != swap_memcg) {
+ if (memcg != swap_memcg) {
if (!mem_cgroup_is_root(swap_memcg))
page_counter_charge(&swap_memcg->memsw, nr_entries);
page_counter_uncharge(&memcg->memsw, nr_entries);
@@ -7303,7 +7461,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
struct mem_cgroup *memcg;
unsigned short oldid;
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (do_memsw_account())
return 0;
memcg = folio_memcg(folio);
@@ -7319,7 +7477,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
memcg = mem_cgroup_id_get_online(memcg);
- if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
+ if (!mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
memcg_memory_event(memcg, MEMCG_SWAP_MAX);
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
@@ -7347,15 +7505,18 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
struct mem_cgroup *memcg;
unsigned short id;
+ if (mem_cgroup_disabled())
+ return;
+
id = swap_cgroup_record(entry, 0, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
- if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->swap, nr_pages);
- else
+ if (!mem_cgroup_is_root(memcg)) {
+ if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
+ else
+ page_counter_uncharge(&memcg->swap, nr_pages);
}
mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
mem_cgroup_id_put_many(memcg, nr_pages);
@@ -7367,31 +7528,31 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
long nr_swap_pages = get_nr_swap_pages();
- if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (mem_cgroup_disabled() || do_memsw_account())
return nr_swap_pages;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
nr_swap_pages = min_t(long, nr_swap_pages,
READ_ONCE(memcg->swap.max) -
page_counter_read(&memcg->swap));
return nr_swap_pages;
}
-bool mem_cgroup_swap_full(struct page *page)
+bool mem_cgroup_swap_full(struct folio *folio)
{
struct mem_cgroup *memcg;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (vm_swap_full())
return true;
- if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (do_memsw_account())
return false;
- memcg = page_memcg(page);
+ memcg = folio_memcg(folio);
if (!memcg)
return false;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
unsigned long usage = page_counter_read(&memcg->swap);
if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
@@ -7404,10 +7565,9 @@ bool mem_cgroup_swap_full(struct page *page)
static int __init setup_swap_account(char *s)
{
- if (!strcmp(s, "1"))
- cgroup_memory_noswap = false;
- else if (!strcmp(s, "0"))
- cgroup_memory_noswap = true;
+ pr_warn_once("The swapaccount= commandline option is deprecated. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
return 1;
}
__setup("swapaccount=", setup_swap_account);
@@ -7556,7 +7716,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
return true;
original_memcg = get_mem_cgroup_from_objcg(objcg);
- for (memcg = original_memcg; memcg != root_mem_cgroup;
+ for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
memcg = parent_mem_cgroup(memcg)) {
unsigned long max = READ_ONCE(memcg->zswap_max);
unsigned long pages;
@@ -7676,20 +7836,9 @@ static struct cftype zswap_files[] = {
};
#endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */
-/*
- * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
- * instead of a core_initcall(), this could mean cgroup_memory_noswap still
- * remains set to false even when memcg is disabled via "cgroup_disable=memory"
- * boot parameter. This may result in premature OOPS inside
- * mem_cgroup_get_nr_swap_pages() function in corner cases.
- */
static int __init mem_cgroup_swap_init(void)
{
- /* No memory control -> no swap control */
if (mem_cgroup_disabled())
- cgroup_memory_noswap = true;
-
- if (cgroup_memory_noswap)
return 0;
WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
@@ -7699,6 +7848,6 @@ static int __init mem_cgroup_swap_init(void)
#endif
return 0;
}
-core_initcall(mem_cgroup_swap_init);
+subsys_initcall(mem_cgroup_swap_init);
-#endif /* CONFIG_MEMCG_SWAP */
+#endif /* CONFIG_SWAP */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 14439806b5ef..c77a9e37e27e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -74,6 +74,19 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
static bool hw_memory_failure __read_mostly = false;
+inline void num_poisoned_pages_inc(unsigned long pfn)
+{
+ atomic_long_inc(&num_poisoned_pages);
+ memblk_nr_poison_inc(pfn);
+}
+
+inline void num_poisoned_pages_sub(unsigned long pfn, long i)
+{
+ atomic_long_sub(i, &num_poisoned_pages);
+ if (pfn != -1UL)
+ memblk_nr_poison_sub(pfn, i);
+}
+
/*
* Return values:
* 1: the page is dissolved (if needed) and taken off from buddy,
@@ -115,7 +128,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
if (release)
put_page(page);
page_ref_inc(page);
- num_poisoned_pages_inc();
+ num_poisoned_pages_inc(page_to_pfn(page));
return true;
}
@@ -277,7 +290,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
* to SIG_IGN, but hopefully no one will do that?
*/
ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
- addr_lsb, t); /* synchronous? */
+ addr_lsb, t);
if (ret < 0)
pr_info("Error sending signal to %s:%d: %d\n",
t->comm, t->pid, ret);
@@ -345,13 +358,17 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
* not much we can do. We just print a message and ignore otherwise.
*/
+#define FSDAX_INVALID_PGOFF ULONG_MAX
+
/*
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
*
- * Notice: @fsdax_pgoff is used only when @p is a fsdax page.
- * In other cases, such as anonymous and file-backend page, the address to be
- * killed can be caculated by @p itself.
+ * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
+ * filesystem with a memory failure handler has claimed the
+ * memory_failure event. In all other cases, page->index and
+ * page->mapping are sufficient for mapping the page back to its
+ * corresponding user virtual address.
*/
static void add_to_kill(struct task_struct *tsk, struct page *p,
pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
@@ -367,11 +384,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
tk->addr = page_address_in_vma(p, vma);
if (is_zone_device_page(p)) {
- /*
- * Since page->mapping is not used for fsdax, we need
- * calculate the address based on the vma.
- */
- if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
+ if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
} else
@@ -413,7 +426,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
{
struct to_kill *tk, *next;
- list_for_each_entry_safe (tk, next, to_kill, nd) {
+ list_for_each_entry_safe(tk, next, to_kill, nd) {
if (forcekill) {
/*
* In case something went wrong with munmapping
@@ -437,6 +450,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
}
+ list_del(&tk->nd);
put_task_struct(tk->tsk);
kfree(tk);
}
@@ -520,14 +534,15 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
anon_vma_interval_tree_foreach(vmac, &av->rb_root,
pgoff, pgoff) {
vma = vmac->vma;
+ if (vma->vm_mm != t->mm)
+ continue;
if (!page_mapped_in_vma(page, vma))
continue;
- if (vma->vm_mm == t->mm)
- add_to_kill(t, page, 0, vma, to_kill);
+ add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
- page_unlock_anon_vma_read(av);
+ anon_vma_unlock_read(av);
}
/*
@@ -559,7 +574,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* to be informed of all such data corruptions.
*/
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, 0, vma, to_kill);
+ add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma,
+ to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -632,7 +648,7 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
swp_entry_t swp = pte_to_swp_entry(pte);
if (is_hwpoison_entry(swp))
- pfn = hwpoison_entry_to_pfn(swp);
+ pfn = swp_offset_pfn(swp);
}
if (!pfn || pfn != poisoned_pfn)
@@ -743,6 +759,9 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
};
priv.tk.tsk = p;
+ if (!p->mm)
+ return -EFAULT;
+
mmap_read_lock(p->mm);
ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
(void *)&priv);
@@ -821,12 +840,13 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
int ret = MF_FAILED;
if (mapping->a_ops->error_remove_page) {
+ struct folio *folio = page_folio(p);
int err = mapping->a_ops->error_remove_page(mapping, p);
if (err != 0) {
pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
- } else if (page_has_private(p) &&
- !try_to_release_page(p, GFP_NOIO)) {
+ } else if (folio_has_private(folio) &&
+ !filemap_release_folio(folio, GFP_NOIO)) {
pr_info("%#lx: failed to release buffers\n", pfn);
} else {
ret = MF_RECOVERED;
@@ -1074,6 +1094,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
int res;
struct page *hpage = compound_head(p);
struct address_space *mapping;
+ bool extra_pins = false;
if (!PageHuge(hpage))
return MF_DELAYED;
@@ -1081,6 +1102,8 @@ static int me_huge_page(struct page_state *ps, struct page *p)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, page_to_pfn(p), mapping);
+ /* The page is kept in page cache. */
+ extra_pins = true;
unlock_page(hpage);
} else {
unlock_page(hpage);
@@ -1098,7 +1121,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
}
}
- if (has_extra_refcount(ps, p, false))
+ if (has_extra_refcount(ps, p, extra_pins))
res = MF_FAILED;
return res;
@@ -1173,14 +1196,16 @@ static struct page_state error_states[] = {
* "Dirty/Clean" indication is not 100% accurate due to the possibility of
* setting PG_dirty outside page lock. See also comment above set_page_dirty().
*/
-static void action_result(unsigned long pfn, enum mf_action_page_type type,
- enum mf_result result)
+static int action_result(unsigned long pfn, enum mf_action_page_type type,
+ enum mf_result result)
{
trace_memory_failure_event(pfn, type, result);
- num_poisoned_pages_inc();
+ num_poisoned_pages_inc(pfn);
pr_err("%#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
+
+ return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}
static int page_action(struct page_state *ps, struct page *p,
@@ -1191,14 +1216,12 @@ static int page_action(struct page_state *ps, struct page *p,
/* page p should be unlocked after returning from ps->action(). */
result = ps->action(ps, p);
- action_result(pfn, ps->type, result);
-
/* Could do more checks here if page looks ok */
/*
* Could adjust zone counters here to correct for the missing page.
*/
- return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
+ return action_result(pfn, ps->type, result);
}
static inline bool PageHWPoisonTakenOff(struct page *page)
@@ -1238,14 +1261,14 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags)
int ret = 0;
bool hugetlb = false;
- ret = get_hwpoison_huge_page(head, &hugetlb);
+ ret = get_hwpoison_huge_page(head, &hugetlb, false);
if (hugetlb)
return ret;
/*
- * This check prevents from calling get_hwpoison_unless_zero()
- * for any unsupported type of page in order to reduce the risk of
- * unexpected races caused by taking a page refcount.
+ * This check prevents from calling get_page_unless_zero() for any
+ * unsupported type of page in order to reduce the risk of unexpected
+ * races caused by taking a page refcount.
*/
if (!HWPoisonHandlable(head, flags))
return -EBUSY;
@@ -1328,7 +1351,7 @@ static int __get_unpoison_page(struct page *page)
int ret = 0;
bool hugetlb = false;
- ret = get_hwpoison_huge_page(head, &hugetlb);
+ ret = get_hwpoison_huge_page(head, &hugetlb, true);
if (hugetlb)
return ret;
@@ -1396,14 +1419,14 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success;
- int kill = 1, forcekill;
+ int forcekill;
bool mlocked = PageMlocked(hpage);
/*
* Here we are interested only in user-mapped pages, so skip any
* other types of pages.
*/
- if (PageReserved(p) || PageSlab(p))
+ if (PageReserved(p) || PageSlab(p) || PageTable(p))
return true;
if (!(PageLRU(hpage) || PageHuge(p)))
return true;
@@ -1437,7 +1460,6 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (page_mkclean(hpage)) {
SetPageDirty(hpage);
} else {
- kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
pfn);
@@ -1448,12 +1470,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
* because ttu takes the rmap data structures down.
- *
- * Error handling: We ignore errors here because
- * there's nothing that can be done.
*/
- if (kill)
- collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
+ collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
if (PageHuge(hpage) && !PageAnon(hpage)) {
/*
@@ -1495,7 +1513,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
+ forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) ||
+ !unmap_success;
kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
return unmap_success;
@@ -1524,20 +1543,18 @@ static int identify_page_state(unsigned long pfn, struct page *p,
return page_action(ps, p, pfn);
}
-static int try_to_split_thp_page(struct page *page, const char *msg)
+static int try_to_split_thp_page(struct page *page)
{
+ int ret;
+
lock_page(page);
- if (unlikely(split_huge_page(page))) {
- unsigned long pfn = page_to_pfn(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
- unlock_page(page);
- pr_info("%s: %#lx: thp split failed\n", msg, pfn);
+ if (unlikely(ret))
put_page(page);
- return -EBUSY;
- }
- unlock_page(page);
- return 0;
+ return ret;
}
static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
@@ -1671,8 +1688,7 @@ EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
#ifdef CONFIG_HUGETLB_PAGE
/*
* Struct raw_hwp_page represents information about "raw error page",
- * constructing singly linked list originated from ->private field of
- * SUBPAGE_INDEX_HWPOISON-th tail page.
+ * constructing singly linked list from ->_hugetlb_hwpoison field of folio.
*/
struct raw_hwp_page {
struct llist_node node;
@@ -1681,7 +1697,7 @@ struct raw_hwp_page {
static inline struct llist_head *raw_hwp_list_head(struct page *hpage)
{
- return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON);
+ return (struct llist_head *)&page_folio(hpage)->_hugetlb_hwpoison;
}
static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
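The hunk above moves the head of the raw_hwp_page list into folio->_hugetlb_hwpoison; __free_raw_hwp_pages() then drains that singly linked list, optionally re-marking each recorded subpage as hwpoisoned. A simplified userspace model of the record/drain pattern (hypothetical; plain pointers stand in for llist and a pfn stands in for the struct page pointer):

        /* Hypothetical, simplified model of the raw_hwp list handling above. */
        #include <stdio.h>
        #include <stdlib.h>

        struct raw_hwp_page {
                struct raw_hwp_page *next;
                unsigned long pfn;      /* stands in for the struct page pointer */
        };

        /* Stand-in for the list head stored in folio->_hugetlb_hwpoison. */
        static struct raw_hwp_page *raw_hwp_head;

        static void record_raw_hwp(unsigned long pfn)
        {
                struct raw_hwp_page *p = malloc(sizeof(*p));

                if (!p)
                        return;
                p->pfn = pfn;
                p->next = raw_hwp_head;        /* push, like llist_add() */
                raw_hwp_head = p;
        }

        static unsigned long free_raw_hwp_pages(int move_flag)
        {
                unsigned long count = 0;
                struct raw_hwp_page *p = raw_hwp_head;

                raw_hwp_head = NULL;           /* like llist_del_all() */
                while (p) {
                        struct raw_hwp_page *next = p->next;

                        if (move_flag)         /* kernel: SetPageHWPoison(p->page) */
                                printf("re-mark pfn %#lx\n", p->pfn);
                        free(p);
                        count++;
                        p = next;
                }
                return count;
        }

        int main(void)
        {
                record_raw_hwp(0x1000);
                record_raw_hwp(0x1001);
                printf("freed %lu entries\n", free_raw_hwp_pages(1));
                return 0;
        }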
@@ -1696,6 +1712,8 @@ static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
if (move_flag)
SetPageHWPoison(p->page);
+ else
+ num_poisoned_pages_sub(page_to_pfn(p->page), 1);
kfree(p);
count++;
}
@@ -1731,7 +1749,7 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
llist_add(&raw_hwp->node, head);
/* the first error event will be counted in action_result(). */
if (ret)
- num_poisoned_pages_inc();
+ num_poisoned_pages_inc(page_to_pfn(page));
} else {
/*
* Failed to save raw error info. We no longer trace all
@@ -1785,7 +1803,8 @@ void hugetlb_clear_page_hwpoison(struct page *hpage)
* -EBUSY - the hugepage is busy (try to retry)
* -EHWPOISON - the hugepage is already hwpoisoned
*/
-int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared)
{
struct page *page = pfn_to_page(pfn);
struct page *head = compound_head(page);
@@ -1815,6 +1834,15 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
goto out;
}
+ /*
+ * Clear HPageMigratable for hwpoisoned hugepages to prevent them
+ * from being migrated by memory hotremove.
+ */
+ if (count_increased && HPageMigratable(head)) {
+ ClearHPageMigratable(head);
+ *migratable_cleared = true;
+ }
+
return ret;
out:
if (count_increased)
@@ -1834,10 +1862,11 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
struct page *p = pfn_to_page(pfn);
struct page *head;
unsigned long page_flags;
+ bool migratable_cleared = false;
*hugetlb = 1;
retry:
- res = get_huge_page_for_hwpoison(pfn, flags);
+ res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
if (res == 2) { /* fallback to normal page handling */
*hugetlb = 0;
return 0;
@@ -1853,8 +1882,7 @@ retry:
flags |= MF_NO_RETRY;
goto retry;
}
- action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
- return res;
+ return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
}
head = compound_head(p);
@@ -1862,8 +1890,12 @@ retry:
if (hwpoison_filter(p)) {
hugetlb_clear_page_hwpoison(head);
- res = -EOPNOTSUPP;
- goto out;
+ if (migratable_cleared)
+ SetHPageMigratable(head);
+ unlock_page(head);
+ if (res == 1)
+ put_page(head);
+ return -EOPNOTSUPP;
}
/*
@@ -1878,22 +1910,17 @@ retry:
} else {
res = MF_FAILED;
}
- action_result(pfn, MF_MSG_FREE_HUGE, res);
- return res == MF_RECOVERED ? 0 : -EBUSY;
+ return action_result(pfn, MF_MSG_FREE_HUGE, res);
}
page_flags = head->flags;
if (!hwpoison_user_mappings(p, pfn, flags, head)) {
- action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
- res = -EBUSY;
- goto out;
+ unlock_page(head);
+ return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
}
return identify_page_state(pfn, p, page_flags);
-out:
- unlock_page(head);
- return res;
}
#else
@@ -1908,17 +1935,25 @@ static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag)
}
#endif /* CONFIG_HUGETLB_PAGE */
+/* Drop the extra refcount in case we come from madvise() */
+static void put_ref_page(unsigned long pfn, int flags)
+{
+ struct page *page;
+
+ if (!(flags & MF_COUNT_INCREASED))
+ return;
+
+ page = pfn_to_page(pfn);
+ if (page)
+ put_page(page);
+}
+
static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
struct dev_pagemap *pgmap)
{
- struct page *page = pfn_to_page(pfn);
int rc = -ENXIO;
- if (flags & MF_COUNT_INCREASED)
- /*
- * Drop the extra refcount in case we come from madvise().
- */
- put_page(page);
+ put_ref_page(pfn, flags);
/* device metadata space is not recoverable */
if (!pgmap_pfn_valid(pgmap, pfn))
@@ -1928,7 +1963,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* Call driver's implementation to handle the memory failure, otherwise
* fall back to generic handler.
*/
- if (pgmap->ops->memory_failure) {
+ if (pgmap_has_memory_failure(pgmap)) {
rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
/*
* Fall back to generic handler too if operation is not
@@ -2026,7 +2061,7 @@ try_again:
/*
* We need/can do nothing about count=0 pages.
* 1) it's a free page, and therefore in safe hand:
- * prep_new_page() will be the gate keeper.
+ * check_new_page() will be the gate keeper.
* 2) it's part of a non-compound high order page.
* Implies some kernel user: cannot stop them from
* R/W the page; let's pray that the page has been
@@ -2050,16 +2085,13 @@ try_again:
}
res = MF_FAILED;
}
- action_result(pfn, MF_MSG_BUDDY, res);
- res = res == MF_RECOVERED ? 0 : -EBUSY;
+ res = action_result(pfn, MF_MSG_BUDDY, res);
} else {
- action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
}
goto unlock_mutex;
} else if (res < 0) {
- action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
goto unlock_mutex;
}
}
@@ -2079,9 +2111,8 @@ try_again:
* page is a valid handlable page.
*/
SetPageHasHWPoisoned(hpage);
- if (try_to_split_thp_page(p, "Memory Failure") < 0) {
- action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
- res = -EBUSY;
+ if (try_to_split_thp_page(p) < 0) {
+ res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
goto unlock_mutex;
}
VM_BUG_ON_PAGE(!page_count(p), p);
@@ -2114,8 +2145,7 @@ try_again:
retry = false;
goto try_again;
}
- action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
goto unlock_page;
}
@@ -2129,7 +2159,7 @@ try_again:
page_flags = p->flags;
if (hwpoison_filter(p)) {
- TestClearPageHWPoison(p);
+ ClearPageHWPoison(p);
unlock_page(p);
put_page(p);
res = -EOPNOTSUPP;
@@ -2155,8 +2185,7 @@ try_again:
* Abort on fail: __filemap_remove_folio() assumes unmapped page.
*/
if (!hwpoison_user_mappings(p, pfn, flags, p)) {
- action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
goto unlock_page;
}
@@ -2164,8 +2193,7 @@ try_again:
* Torn down by someone else?
*/
if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
- action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
goto unlock_page;
}
@@ -2310,8 +2338,8 @@ int unpoison_memory(unsigned long pfn)
struct page *page;
struct page *p;
int ret = -EBUSY;
- int freeit = 0;
unsigned long count = 1;
+ bool huge = false;
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -2354,12 +2382,13 @@ int unpoison_memory(unsigned long pfn)
goto unlock_mutex;
}
- if (PageSlab(page) || PageTable(page))
+ if (PageSlab(page) || PageTable(page) || PageReserved(page))
goto unlock_mutex;
ret = get_hwpoison_page(p, MF_UNPOISON);
if (!ret) {
if (PageHuge(p)) {
+ huge = true;
count = free_raw_hwp_pages(page, false);
if (count == 0) {
ret = -EBUSY;
@@ -2375,16 +2404,17 @@ int unpoison_memory(unsigned long pfn)
pfn, &unpoison_rs);
} else {
if (PageHuge(p)) {
+ huge = true;
count = free_raw_hwp_pages(page, false);
if (count == 0) {
ret = -EBUSY;
+ put_page(page);
goto unlock_mutex;
}
}
- freeit = !!TestClearPageHWPoison(p);
put_page(page);
- if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) {
+ if (TestClearPageHWPoison(p)) {
put_page(page);
ret = 0;
}
@@ -2392,8 +2422,9 @@ int unpoison_memory(unsigned long pfn)
unlock_mutex:
mutex_unlock(&mf_mutex);
- if (!ret || freeit) {
- num_poisoned_pages_sub(count);
+ if (!ret) {
+ if (!huge)
+ num_poisoned_pages_sub(pfn, 1);
unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
page_to_pfn(p), &unpoison_rs);
}
@@ -2404,24 +2435,26 @@ EXPORT_SYMBOL(unpoison_memory);
static bool isolate_page(struct page *page, struct list_head *pagelist)
{
bool isolated = false;
- bool lru = PageLRU(page);
if (PageHuge(page)) {
isolated = !isolate_hugetlb(page, pagelist);
} else {
+ bool lru = !__PageMovable(page);
+
if (lru)
isolated = !isolate_lru_page(page);
else
- isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE);
+ isolated = !isolate_movable_page(page,
+ ISOLATE_UNEVICTABLE);
- if (isolated)
+ if (isolated) {
list_add(&page->lru, pagelist);
+ if (lru)
+ inc_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_lru(page));
+ }
}
- if (isolated && lru)
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_lru(page));
-
/*
* If we succeed to isolate the page, we grabbed another refcount on
* the page, so we can safely drop the one we got from get_any_pages().
@@ -2434,11 +2467,11 @@ static bool isolate_page(struct page *page, struct list_head *pagelist)
}
/*
- * __soft_offline_page handles hugetlb-pages and non-hugetlb pages.
+ * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages.
* If the page is a non-dirty unmapped page-cache page, it simply invalidates.
* If the page is mapped, it migrates the contents over.
*/
-static int __soft_offline_page(struct page *page)
+static int soft_offline_in_use_page(struct page *page)
{
long ret = 0;
unsigned long pfn = page_to_pfn(page);
@@ -2451,6 +2484,14 @@ static int __soft_offline_page(struct page *page)
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
};
+ if (!huge && PageTransHuge(hpage)) {
+ if (try_to_split_thp_page(page)) {
+ pr_info("soft offline: %#lx: thp split failed\n", pfn);
+ return -EBUSY;
+ }
+ hpage = page;
+ }
+
lock_page(page);
if (!PageHuge(page))
wait_on_page_writeback(page);
@@ -2500,32 +2541,6 @@ static int __soft_offline_page(struct page *page)
return ret;
}
-static int soft_offline_in_use_page(struct page *page)
-{
- struct page *hpage = compound_head(page);
-
- if (!PageHuge(page) && PageTransHuge(hpage))
- if (try_to_split_thp_page(page, "soft offline") < 0)
- return -EBUSY;
- return __soft_offline_page(page);
-}
-
-static int soft_offline_free_page(struct page *page)
-{
- int rc = 0;
-
- if (!page_handle_poison(page, true, false))
- rc = -EBUSY;
-
- return rc;
-}
-
-static void put_ref_page(struct page *page)
-{
- if (page)
- put_page(page);
-}
-
/**
* soft_offline_page - Soft offline a page.
* @pfn: pfn to soft-offline
@@ -2554,19 +2569,17 @@ int soft_offline_page(unsigned long pfn, int flags)
{
int ret;
bool try_again = true;
- struct page *page, *ref_page = NULL;
-
- WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED));
+ struct page *page;
- if (!pfn_valid(pfn))
+ if (!pfn_valid(pfn)) {
+ WARN_ON_ONCE(flags & MF_COUNT_INCREASED);
return -ENXIO;
- if (flags & MF_COUNT_INCREASED)
- ref_page = pfn_to_page(pfn);
+ }
/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
page = pfn_to_online_page(pfn);
if (!page) {
- put_ref_page(ref_page);
+ put_ref_page(pfn, flags);
return -EIO;
}
@@ -2574,7 +2587,7 @@ int soft_offline_page(unsigned long pfn, int flags)
if (PageHWPoison(page)) {
pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
- put_ref_page(ref_page);
+ put_ref_page(pfn, flags);
mutex_unlock(&mf_mutex);
return 0;
}
@@ -2587,8 +2600,6 @@ retry:
if (hwpoison_filter(page)) {
if (ret > 0)
put_page(page);
- else
- put_ref_page(ref_page);
mutex_unlock(&mf_mutex);
return -EOPNOTSUPP;
@@ -2597,7 +2608,7 @@ retry:
if (ret > 0) {
ret = soft_offline_in_use_page(page);
} else if (ret == 0) {
- if (soft_offline_free_page(page) && try_again) {
+ if (!page_handle_poison(page, true, false) && try_again) {
try_again = false;
flags &= ~MF_COUNT_INCREASED;
goto retry;
@@ -2608,24 +2619,3 @@ retry:
return ret;
}
-
-void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
-{
- int i;
-
- /*
- * A further optimization is to have per section refcounted
- * num_poisoned_pages. But that would need more space per memmap, so
- * for now just do a quick global check to speed up this routine in the
- * absence of bad pages.
- */
- if (atomic_long_read(&num_poisoned_pages) == 0)
- return;
-
- for (i = 0; i < nr_pages; i++) {
- if (PageHWPoison(&memmap[i])) {
- num_poisoned_pages_dec();
- ClearPageHWPoison(&memmap[i]);
- }
- }
-}
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
new file mode 100644
index 000000000000..c734658c6242
--- /dev/null
+++ b/mm/memory-tiers.c
@@ -0,0 +1,732 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/slab.h>
+#include <linux/lockdep.h>
+#include <linux/sysfs.h>
+#include <linux/kobject.h>
+#include <linux/memory.h>
+#include <linux/memory-tiers.h>
+
+#include "internal.h"
+
+struct memory_tier {
+ /* hierarchy of memory tiers */
+ struct list_head list;
+ /* list of all memory types part of this tier */
+ struct list_head memory_types;
+ /*
+ * Start value of the abstract distance. A memory tier maps
+ * an abstract distance range:
+ * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
+ */
+ int adistance_start;
+ struct device dev;
+ /* All the nodes that are part of all the lower memory tiers. */
+ nodemask_t lower_tier_mask;
+};
+
+struct demotion_nodes {
+ nodemask_t preferred;
+};
+
+struct node_memory_type_map {
+ struct memory_dev_type *memtype;
+ int map_count;
+};
+
+static DEFINE_MUTEX(memory_tier_lock);
+static LIST_HEAD(memory_tiers);
+static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
+static struct memory_dev_type *default_dram_type;
+
+static struct bus_type memory_tier_subsys = {
+ .name = "memory_tiering",
+ .dev_name = "memory_tier",
+};
+
+#ifdef CONFIG_MIGRATION
+static int top_tier_adistance;
+/*
+ * node_demotion[] examples:
+ *
+ * Example 1:
+ *
+ * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
+ *
+ * node distances:
+ * node 0 1 2 3
+ * 0 10 20 30 40
+ * 1 20 10 40 30
+ * 2 30 40 10 40
+ * 3 40 30 40 10
+ *
+ * memory_tiers0 = 0-1
+ * memory_tiers1 = 2-3
+ *
+ * node_demotion[0].preferred = 2
+ * node_demotion[1].preferred = 3
+ * node_demotion[2].preferred = <empty>
+ * node_demotion[3].preferred = <empty>
+ *
+ * Example 2:
+ *
+ * Node 0 & 1 are CPU + DRAM nodes, node 2 is a memory-only DRAM node.
+ *
+ * node distances:
+ * node 0 1 2
+ * 0 10 20 30
+ * 1 20 10 30
+ * 2 30 30 10
+ *
+ * memory_tiers0 = 0-2
+ *
+ * node_demotion[0].preferred = <empty>
+ * node_demotion[1].preferred = <empty>
+ * node_demotion[2].preferred = <empty>
+ *
+ * Example 3:
+ *
+ * Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
+ *
+ * node distances:
+ * node 0 1 2
+ * 0 10 20 30
+ * 1 20 10 40
+ * 2 30 40 10
+ *
+ * memory_tiers0 = 1
+ * memory_tiers1 = 0
+ * memory_tiers2 = 2
+ *
+ * node_demotion[0].preferred = 2
+ * node_demotion[1].preferred = 0
+ * node_demotion[2].preferred = <empty>
+ *
+ */
+static struct demotion_nodes *node_demotion __read_mostly;
+#endif /* CONFIG_MIGRATION */
+
+static inline struct memory_tier *to_memory_tier(struct device *device)
+{
+ return container_of(device, struct memory_tier, dev);
+}
+
+static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
+{
+ nodemask_t nodes = NODE_MASK_NONE;
+ struct memory_dev_type *memtype;
+
+ list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
+ nodes_or(nodes, nodes, memtype->nodes);
+
+ return nodes;
+}
+
+static void memory_tier_device_release(struct device *dev)
+{
+ struct memory_tier *tier = to_memory_tier(dev);
+ /*
+ * synchronize_rcu in clear_node_memory_tier makes sure
+ * we don't have rcu access to this memory tier.
+ */
+ kfree(tier);
+}
+
+static ssize_t nodelist_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int ret;
+ nodemask_t nmask;
+
+ mutex_lock(&memory_tier_lock);
+ nmask = get_memtier_nodemask(to_memory_tier(dev));
+ ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
+ mutex_unlock(&memory_tier_lock);
+ return ret;
+}
+static DEVICE_ATTR_RO(nodelist);
+
+static struct attribute *memtier_dev_attrs[] = {
+ &dev_attr_nodelist.attr,
+ NULL
+};
+
+static const struct attribute_group memtier_dev_group = {
+ .attrs = memtier_dev_attrs,
+};
+
+static const struct attribute_group *memtier_dev_groups[] = {
+ &memtier_dev_group,
+ NULL
+};
+
+static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
+{
+ int ret;
+ bool found_slot = false;
+ struct memory_tier *memtier, *new_memtier;
+ int adistance = memtype->adistance;
+ unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;
+
+ lockdep_assert_held_once(&memory_tier_lock);
+
+ adistance = round_down(adistance, memtier_adistance_chunk_size);
+ /*
+ * If the memtype is already part of a memory tier,
+ * just return that.
+ */
+ if (!list_empty(&memtype->tier_sibiling)) {
+ list_for_each_entry(memtier, &memory_tiers, list) {
+ if (adistance == memtier->adistance_start)
+ return memtier;
+ }
+ WARN_ON(1);
+ return ERR_PTR(-EINVAL);
+ }
+
+ list_for_each_entry(memtier, &memory_tiers, list) {
+ if (adistance == memtier->adistance_start) {
+ goto link_memtype;
+ } else if (adistance < memtier->adistance_start) {
+ found_slot = true;
+ break;
+ }
+ }
+
+ new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
+ if (!new_memtier)
+ return ERR_PTR(-ENOMEM);
+
+ new_memtier->adistance_start = adistance;
+ INIT_LIST_HEAD(&new_memtier->list);
+ INIT_LIST_HEAD(&new_memtier->memory_types);
+ if (found_slot)
+ list_add_tail(&new_memtier->list, &memtier->list);
+ else
+ list_add_tail(&new_memtier->list, &memory_tiers);
+
+ new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
+ new_memtier->dev.bus = &memory_tier_subsys;
+ new_memtier->dev.release = memory_tier_device_release;
+ new_memtier->dev.groups = memtier_dev_groups;
+
+ ret = device_register(&new_memtier->dev);
+ if (ret) {
+ list_del(&new_memtier->list);
+ put_device(&new_memtier->dev);
+ return ERR_PTR(ret);
+ }
+ memtier = new_memtier;
+
+link_memtype:
+ list_add(&memtype->tier_sibiling, &memtier->memory_types);
+ return memtier;
+}
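
The tier bucketing above is plain integer arithmetic: the abstract distance is rounded down to the start of a MEMTIER_CHUNK_SIZE-sized range, and that chunk index becomes the tier's device id. The userspace-only sketch below is not part of the patch; the chunk constants are illustrative placeholders, not necessarily the kernel's values in include/linux/memory-tiers.h.

#include <stdio.h>

/* Illustrative values only; the real constants live in include/linux/memory-tiers.h. */
#define MEMTIER_CHUNK_BITS	7
#define MEMTIER_CHUNK_SIZE	(1 << MEMTIER_CHUNK_BITS)

int main(void)
{
	/* hypothetical devices: something faster than DRAM, DRAM, something slower */
	int adistances[] = { 3 * MEMTIER_CHUNK_SIZE + 10,
			     4 * MEMTIER_CHUNK_SIZE + 64,
			     6 * MEMTIER_CHUNK_SIZE + 5 };

	for (unsigned int i = 0; i < sizeof(adistances) / sizeof(adistances[0]); i++) {
		int adistance = adistances[i];
		/* round down to the chunk start, as find_create_memory_tier() does */
		int start = adistance & ~(MEMTIER_CHUNK_SIZE - 1);
		/* the tier device id is simply the chunk index */
		int dev_id = start >> MEMTIER_CHUNK_BITS;

		printf("adistance %4d -> range [%d..%d] -> memory_tier%d\n",
		       adistance, start, start + MEMTIER_CHUNK_SIZE - 1, dev_id);
	}
	return 0;
}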
+
+static struct memory_tier *__node_get_memory_tier(int node)
+{
+ pg_data_t *pgdat;
+
+ pgdat = NODE_DATA(node);
+ if (!pgdat)
+ return NULL;
+ /*
+ * Since we hold memory_tier_lock, we can avoid
+ * RCU read locks when accessing the details. No
+ * parallel updates are possible here.
+ */
+ return rcu_dereference_check(pgdat->memtier,
+ lockdep_is_held(&memory_tier_lock));
+}
+
+#ifdef CONFIG_MIGRATION
+bool node_is_toptier(int node)
+{
+ bool toptier;
+ pg_data_t *pgdat;
+ struct memory_tier *memtier;
+
+ pgdat = NODE_DATA(node);
+ if (!pgdat)
+ return false;
+
+ rcu_read_lock();
+ memtier = rcu_dereference(pgdat->memtier);
+ if (!memtier) {
+ toptier = true;
+ goto out;
+ }
+ if (memtier->adistance_start <= top_tier_adistance)
+ toptier = true;
+ else
+ toptier = false;
+out:
+ rcu_read_unlock();
+ return toptier;
+}
+
+void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
+{
+ struct memory_tier *memtier;
+
+ /*
+ * pg_data_t.memtier updates include a synchronize_rcu()
+ * which ensures that we either find NULL or a valid memtier
+ * in NODE_DATA. Protect the access via rcu_read_lock().
+ */
+ rcu_read_lock();
+ memtier = rcu_dereference(pgdat->memtier);
+ if (memtier)
+ *targets = memtier->lower_tier_mask;
+ else
+ *targets = NODE_MASK_NONE;
+ rcu_read_unlock();
+}
+
+/**
+ * next_demotion_node() - Get the next node in the demotion path
+ * @node: The starting node to lookup the next node
+ *
+ * Return: node id for next memory node in the demotion path hierarchy
+ * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
+ * @node online or guarantee that it *continues* to be the next demotion
+ * target.
+ */
+int next_demotion_node(int node)
+{
+ struct demotion_nodes *nd;
+ int target;
+
+ if (!node_demotion)
+ return NUMA_NO_NODE;
+
+ nd = &node_demotion[node];
+
+ /*
+ * node_demotion[] is updated without excluding this
+ * function from running.
+ *
+ * Make sure to use RCU over entire code blocks if
+ * node_demotion[] reads need to be consistent.
+ */
+ rcu_read_lock();
+ /*
+ * If there are multiple target nodes, just select one
+ * target node randomly.
+ *
+ * We could instead select the target node round-robin, but that
+ * would require an extra field in node_demotion[] to record the
+ * last selected target node, and updating it could cause cache
+ * ping-pong. Per-CPU state would avoid that, but seems more
+ * complicated. So random selection looks better for now.
+ */
+ target = node_random(&nd->preferred);
+ rcu_read_unlock();
+
+ return target;
+}
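
A quick userspace model of the random selection above may help: nd->preferred is approximated by a plain bitmask and node_random() by "pick one random set bit". The node layout follows Example 1 from the node_demotion[] comment; this is only a sketch of the selection logic, not kernel code.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NUMA_NO_NODE	(-1)
#define NR_NODES	4

/* toy node_random(): pick one random set bit out of the mask */
static int toy_node_random(unsigned int mask)
{
	int set[NR_NODES], n = 0;

	for (int node = 0; node < NR_NODES; node++)
		if (mask & (1u << node))
			set[n++] = node;
	return n ? set[rand() % n] : NUMA_NO_NODE;
}

int main(void)
{
	/* Example 1: DRAM nodes 0 and 1 demote to PMEM nodes 2 and 3 respectively */
	unsigned int preferred[NR_NODES] = { 1u << 2, 1u << 3, 0, 0 };

	srand((unsigned int)time(NULL));
	for (int node = 0; node < NR_NODES; node++)
		printf("next_demotion_node(%d) -> %d\n",
		       node, toy_node_random(preferred[node]));
	return 0;
}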
+
+static void disable_all_demotion_targets(void)
+{
+ struct memory_tier *memtier;
+ int node;
+
+ for_each_node_state(node, N_MEMORY) {
+ node_demotion[node].preferred = NODE_MASK_NONE;
+ /*
+ * We are holding memory_tier_lock, so it is safe
+ * to access pgdat->memtier.
+ */
+ memtier = __node_get_memory_tier(node);
+ if (memtier)
+ memtier->lower_tier_mask = NODE_MASK_NONE;
+ }
+ /*
+ * Ensure that the "disable" is visible across the system.
+ * Readers will see either a combination of before+disable
+ * state or disable+after. They will never see before and
+ * after state together.
+ */
+ synchronize_rcu();
+}
+
+/*
+ * Find an automatic demotion target for all memory
+ * nodes. Failing here is OK. It might just indicate
+ * being at the end of a chain.
+ */
+static void establish_demotion_targets(void)
+{
+ struct memory_tier *memtier;
+ struct demotion_nodes *nd;
+ int target = NUMA_NO_NODE, node;
+ int distance, best_distance;
+ nodemask_t tier_nodes, lower_tier;
+
+ lockdep_assert_held_once(&memory_tier_lock);
+
+ if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
+ return;
+
+ disable_all_demotion_targets();
+
+ for_each_node_state(node, N_MEMORY) {
+ best_distance = -1;
+ nd = &node_demotion[node];
+
+ memtier = __node_get_memory_tier(node);
+ if (!memtier || list_is_last(&memtier->list, &memory_tiers))
+ continue;
+ /*
+ * Get the lower memtier to find the demotion node list.
+ */
+ memtier = list_next_entry(memtier, list);
+ tier_nodes = get_memtier_nodemask(memtier);
+ /*
+ * find_next_best_node() uses the 'used' nodemask as a skip list.
+ * Add all memory nodes except the selected memory tier's
+ * nodelist to the skip list so that we find the best node from
+ * that memtier's nodelist.
+ */
+ nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);
+
+ /*
+ * Find all the nodes in the memory tier node list with the same best
+ * distance and add them to the preferred mask. We randomly select
+ * between nodes in the preferred mask when allocating pages during demotion.
+ */
+ do {
+ target = find_next_best_node(node, &tier_nodes);
+ if (target == NUMA_NO_NODE)
+ break;
+
+ distance = node_distance(node, target);
+ if (distance == best_distance || best_distance == -1) {
+ best_distance = distance;
+ node_set(target, nd->preferred);
+ } else {
+ break;
+ }
+ } while (1);
+ }
+ /*
+ * Promotion is allowed from a memory tier to a higher
+ * memory tier only if the lower tier doesn't include
+ * compute. We want to skip promotion from a memory tier
+ * if any node that is part of that memory tier has CPUs.
+ * Once we detect such a memory tier, we consider it the
+ * top tier from which promotion is not allowed.
+ */
+ list_for_each_entry_reverse(memtier, &memory_tiers, list) {
+ tier_nodes = get_memtier_nodemask(memtier);
+ nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
+ if (!nodes_empty(tier_nodes)) {
+ /*
+ * An abstract distance below the max value of this memtier
+ * is considered toptier.
+ */
+ top_tier_adistance = memtier->adistance_start +
+ MEMTIER_CHUNK_SIZE - 1;
+ break;
+ }
+ }
+ /*
+ * Now build the lower_tier mask for each node, collecting the node mask
+ * from all memory tiers below it. This allows us to fall back demotion
+ * page allocation to a set of nodes that is closer to the above selected
+ * preferred node.
+ */
+ lower_tier = node_states[N_MEMORY];
+ list_for_each_entry(memtier, &memory_tiers, list) {
+ /*
+ * Keep removing the current tier from the lower_tier nodes;
+ * this removes all nodes in the current and above
+ * memory tiers from the lower_tier mask.
+ */
+ tier_nodes = get_memtier_nodemask(memtier);
+ nodes_andnot(lower_tier, lower_tier, tier_nodes);
+ memtier->lower_tier_mask = lower_tier;
+ }
+}
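
The lower_tier_mask bookkeeping at the end of the function is cumulative: walking tiers from fastest to slowest, each tier's nodes are removed from a running copy of N_MEMORY, and whatever remains becomes that tier's demotion fallback set. Below is a small userspace sketch of just that step, using the tiering from Example 3 (tier {1} HBM, tier {0} DRAM, tier {2} PMEM) and plain bitmasks in place of nodemask_t.

#include <stdio.h>

int main(void)
{
	/* tiers ordered fastest to slowest, as on the memory_tiers list */
	unsigned int tier_nodes[] = { 1u << 1, 1u << 0, 1u << 2 };
	/* start from all memory nodes (N_MEMORY) */
	unsigned int lower_tier = (1u << 0) | (1u << 1) | (1u << 2);

	for (int tier = 0; tier < 3; tier++) {
		/* drop this tier's nodes; all faster tiers are already gone */
		lower_tier &= ~tier_nodes[tier];
		printf("tier with nodes 0x%x -> lower_tier_mask 0x%x\n",
		       tier_nodes[tier], lower_tier);
	}
	return 0;
}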
+
+#else
+static inline void disable_all_demotion_targets(void) {}
+static inline void establish_demotion_targets(void) {}
+#endif /* CONFIG_MIGRATION */
+
+static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
+{
+ if (!node_memory_types[node].memtype)
+ node_memory_types[node].memtype = memtype;
+ /*
+ * For each device getting added in the same NUMA node
+ * with this specific memtype, bump the map count. We
+ * only take a memtype device reference once, so that
+ * changing a node's memtype can be done by dropping the
+ * only reference count taken here.
+ */
+
+ if (node_memory_types[node].memtype == memtype) {
+ if (!node_memory_types[node].map_count++)
+ kref_get(&memtype->kref);
+ }
+}
+
+static struct memory_tier *set_node_memory_tier(int node)
+{
+ struct memory_tier *memtier;
+ struct memory_dev_type *memtype;
+ pg_data_t *pgdat = NODE_DATA(node);
+
+
+ lockdep_assert_held_once(&memory_tier_lock);
+
+ if (!node_state(node, N_MEMORY))
+ return ERR_PTR(-EINVAL);
+
+ __init_node_memory_type(node, default_dram_type);
+
+ memtype = node_memory_types[node].memtype;
+ node_set(node, memtype->nodes);
+ memtier = find_create_memory_tier(memtype);
+ if (!IS_ERR(memtier))
+ rcu_assign_pointer(pgdat->memtier, memtier);
+ return memtier;
+}
+
+static void destroy_memory_tier(struct memory_tier *memtier)
+{
+ list_del(&memtier->list);
+ device_unregister(&memtier->dev);
+}
+
+static bool clear_node_memory_tier(int node)
+{
+ bool cleared = false;
+ pg_data_t *pgdat;
+ struct memory_tier *memtier;
+
+ pgdat = NODE_DATA(node);
+ if (!pgdat)
+ return false;
+
+ /*
+ * Make sure that anybody looking at NODE_DATA who finds
+ * a valid memtier finds memory_dev_types with nodes still
+ * linked to the memtier. We achieve this by waiting for
+ * the RCU read section to finish using synchronize_rcu().
+ * This also enables us to free the destroyed memory tier
+ * with kfree() instead of kfree_rcu().
+ */
+ memtier = __node_get_memory_tier(node);
+ if (memtier) {
+ struct memory_dev_type *memtype;
+
+ rcu_assign_pointer(pgdat->memtier, NULL);
+ synchronize_rcu();
+ memtype = node_memory_types[node].memtype;
+ node_clear(node, memtype->nodes);
+ if (nodes_empty(memtype->nodes)) {
+ list_del_init(&memtype->tier_sibiling);
+ if (list_empty(&memtier->memory_types))
+ destroy_memory_tier(memtier);
+ }
+ cleared = true;
+ }
+ return cleared;
+}
+
+static void release_memtype(struct kref *kref)
+{
+ struct memory_dev_type *memtype;
+
+ memtype = container_of(kref, struct memory_dev_type, kref);
+ kfree(memtype);
+}
+
+struct memory_dev_type *alloc_memory_type(int adistance)
+{
+ struct memory_dev_type *memtype;
+
+ memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
+ if (!memtype)
+ return ERR_PTR(-ENOMEM);
+
+ memtype->adistance = adistance;
+ INIT_LIST_HEAD(&memtype->tier_sibiling);
+ memtype->nodes = NODE_MASK_NONE;
+ kref_init(&memtype->kref);
+ return memtype;
+}
+EXPORT_SYMBOL_GPL(alloc_memory_type);
+
+void destroy_memory_type(struct memory_dev_type *memtype)
+{
+ kref_put(&memtype->kref, release_memtype);
+}
+EXPORT_SYMBOL_GPL(destroy_memory_type);
+
+void init_node_memory_type(int node, struct memory_dev_type *memtype)
+{
+
+ mutex_lock(&memory_tier_lock);
+ __init_node_memory_type(node, memtype);
+ mutex_unlock(&memory_tier_lock);
+}
+EXPORT_SYMBOL_GPL(init_node_memory_type);
+
+void clear_node_memory_type(int node, struct memory_dev_type *memtype)
+{
+ mutex_lock(&memory_tier_lock);
+ if (node_memory_types[node].memtype == memtype)
+ node_memory_types[node].map_count--;
+ /*
+ * If we unmapped all the attached devices to this node,
+ * clear the node memory type.
+ */
+ if (!node_memory_types[node].map_count) {
+ node_memory_types[node].memtype = NULL;
+ kref_put(&memtype->kref, release_memtype);
+ }
+ mutex_unlock(&memory_tier_lock);
+}
+EXPORT_SYMBOL_GPL(clear_node_memory_type);
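
The four exported helpers above form the driver-facing API. The fragment below is not from this patch and is not compile-tested; it is a rough sketch, loosely modelled on how a dax/kmem-style driver might be expected to use the API when onlining a slow-memory NUMA node. The adistance choice, function names and error handling are all assumptions for illustration.

#include <linux/err.h>
#include <linux/memory-tiers.h>

/* Hypothetical driver-side usage sketch, loosely modelled on dax/kmem. */
static struct memory_dev_type *slowmem_type;

static int slowmem_driver_init(void)
{
	/* pick an adistance larger than DRAM so the device lands in a slower tier */
	slowmem_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM * 2);
	if (IS_ERR(slowmem_type))
		return PTR_ERR(slowmem_type);
	return 0;
}

static int slowmem_probe_node(int nid)
{
	int rc;

	/* associate the node with our memory type before onlining its memory */
	init_node_memory_type(nid, slowmem_type);
	rc = 0; /* ... add_memory()/onlining of the node's ranges would go here ... */
	if (rc)
		clear_node_memory_type(nid, slowmem_type);
	return rc;
}

static void slowmem_driver_exit(void)
{
	destroy_memory_type(slowmem_type);
}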
+
+static int __meminit memtier_hotplug_callback(struct notifier_block *self,
+ unsigned long action, void *_arg)
+{
+ struct memory_tier *memtier;
+ struct memory_notify *arg = _arg;
+
+ /*
+ * Only update the node migration order when a node is
+ * changing status, like online->offline.
+ */
+ if (arg->status_change_nid < 0)
+ return notifier_from_errno(0);
+
+ switch (action) {
+ case MEM_OFFLINE:
+ mutex_lock(&memory_tier_lock);
+ if (clear_node_memory_tier(arg->status_change_nid))
+ establish_demotion_targets();
+ mutex_unlock(&memory_tier_lock);
+ break;
+ case MEM_ONLINE:
+ mutex_lock(&memory_tier_lock);
+ memtier = set_node_memory_tier(arg->status_change_nid);
+ if (!IS_ERR(memtier))
+ establish_demotion_targets();
+ mutex_unlock(&memory_tier_lock);
+ break;
+ }
+
+ return notifier_from_errno(0);
+}
+
+static int __init memory_tier_init(void)
+{
+ int ret, node;
+ struct memory_tier *memtier;
+
+ ret = subsys_virtual_register(&memory_tier_subsys, NULL);
+ if (ret)
+ panic("%s() failed to register memory tier subsystem\n", __func__);
+
+#ifdef CONFIG_MIGRATION
+ node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
+ GFP_KERNEL);
+ WARN_ON(!node_demotion);
+#endif
+ mutex_lock(&memory_tier_lock);
+ /*
+ * For now we can have 4 faster memory tiers with smaller adistance
+ * than the default DRAM tier.
+ */
+ default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
+ if (IS_ERR(default_dram_type))
+ panic("%s() failed to allocate default DRAM tier\n", __func__);
+
+ /*
+ * Look at all the existing N_MEMORY nodes and add them to
+ * default memory tier or to a tier if we already have memory
+ * types assigned.
+ */
+ for_each_node_state(node, N_MEMORY) {
+ memtier = set_node_memory_tier(node);
+ if (IS_ERR(memtier))
+ /*
+ * Continue with the memtiers we were able to set up.
+ */
+ break;
+ }
+ establish_demotion_targets();
+ mutex_unlock(&memory_tier_lock);
+
+ hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
+ return 0;
+}
+subsys_initcall(memory_tier_init);
+
+bool numa_demotion_enabled = false;
+
+#ifdef CONFIG_MIGRATION
+#ifdef CONFIG_SYSFS
+static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n",
+ numa_demotion_enabled ? "true" : "false");
+}
+
+static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &numa_demotion_enabled);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static struct kobj_attribute numa_demotion_enabled_attr =
+ __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
+ numa_demotion_enabled_store);
+
+static struct attribute *numa_attrs[] = {
+ &numa_demotion_enabled_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group numa_attr_group = {
+ .attrs = numa_attrs,
+};
+
+static int __init numa_init_sysfs(void)
+{
+ int err;
+ struct kobject *numa_kobj;
+
+ numa_kobj = kobject_create_and_add("numa", mm_kobj);
+ if (!numa_kobj) {
+ pr_err("failed to create numa kobject\n");
+ return -ENOMEM;
+ }
+ err = sysfs_create_group(numa_kobj, &numa_attr_group);
+ if (err) {
+ pr_err("failed to register numa group\n");
+ goto delete_obj;
+ }
+ return 0;
+
+delete_obj:
+ kobject_put(numa_kobj);
+ return err;
+}
+subsys_initcall(numa_init_sysfs);
+#endif /* CONFIG_SYSFS */
+#endif
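
The user-visible surface of this new file is two sysfs locations: per-tier nodelist files on the memory_tiering bus registered in memory_tier_init(), and the demotion_enabled knob created by numa_init_sysfs(). The userspace C sketch below reads both; the tier device name (memory_tier4 here) depends on which adistance chunk the DRAM type falls into, so treat that path as an assumption to be adjusted per system.

#include <stdio.h>

static void dump_file(const char *path)
{
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	while (fgets(line, sizeof(line), f))
		printf("%s: %s", path, line);
	fclose(f);
}

int main(void)
{
	/* nodelist of one memory tier; the device id is system-dependent */
	dump_file("/sys/devices/virtual/memory_tiering/memory_tier4/nodelist");
	/* demotion toggle created by numa_init_sysfs() under /sys/kernel/mm */
	dump_file("/sys/kernel/mm/numa/demotion_enabled");
	return 0;
}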
diff --git a/mm/memory.c b/mm/memory.c
index 4ba73f5aa8bb..aad226daf41b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -52,6 +52,7 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
+#include <linux/kmsan.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
@@ -66,6 +67,7 @@
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
+#include <linux/memory-tiers.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
@@ -74,6 +76,7 @@
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
+#include <linux/sched/sysctl.h>
#include <trace/events/kmem.h>
@@ -125,18 +128,6 @@ int randomize_va_space __read_mostly =
2;
#endif
-#ifndef arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
-{
- /*
- * Those arches which don't have hw access flag feature need to
- * implement their own helper. By default, "true" means pagefault
- * will be hit on old pte.
- */
- return true;
-}
-#endif
-
#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
@@ -171,58 +162,11 @@ static int __init init_zero_pfn(void)
}
early_initcall(init_zero_pfn);
-void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
-{
- trace_rss_stat(mm, member, count);
-}
-
-#if defined(SPLIT_RSS_COUNTING)
-
-void sync_mm_rss(struct mm_struct *mm)
-{
- int i;
-
- for (i = 0; i < NR_MM_COUNTERS; i++) {
- if (current->rss_stat.count[i]) {
- add_mm_counter(mm, i, current->rss_stat.count[i]);
- current->rss_stat.count[i] = 0;
- }
- }
- current->rss_stat.events = 0;
-}
-
-static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
-{
- struct task_struct *task = current;
-
- if (likely(task->mm == mm))
- task->rss_stat.count[member] += val;
- else
- add_mm_counter(mm, member, val);
-}
-#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
-#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
-
-/* sync counter once per 64 page faults */
-#define TASK_RSS_EVENTS_THRESH (64)
-static void check_sync_rss_stat(struct task_struct *task)
-{
- if (unlikely(task != current))
- return;
- if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
- sync_mm_rss(task->mm);
-}
-#else /* SPLIT_RSS_COUNTING */
-
-#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
-#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
-
-static void check_sync_rss_stat(struct task_struct *task)
+void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
+ trace_rss_stat(mm, member);
}
-#endif /* SPLIT_RSS_COUNTING */
-
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -402,12 +346,21 @@ void free_pgd_range(struct mmu_gather *tlb,
} while (pgd++, addr = next, addr != end);
}
-void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
- unsigned long floor, unsigned long ceiling)
+void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+ struct vm_area_struct *vma, unsigned long floor,
+ unsigned long ceiling)
{
- while (vma) {
- struct vm_area_struct *next = vma->vm_next;
+ MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
+
+ do {
unsigned long addr = vma->vm_start;
+ struct vm_area_struct *next;
+
+ /*
+ * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
+ * be 0. This will underflow and is okay.
+ */
+ next = mas_find(&mas, ceiling - 1);
/*
* Hide vma from rmap and truncate_pagecache before freeing
@@ -426,7 +379,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
&& !is_vm_hugetlb_page(next)) {
vma = next;
- next = vma->vm_next;
+ next = mas_find(&mas, ceiling - 1);
unlink_anon_vmas(vma);
unlink_file_vma(vma);
}
@@ -434,7 +387,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
floor, next ? next->vm_start : ceiling);
}
vma = next;
- }
+ } while (vma);
}
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
@@ -1341,15 +1294,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
return ret;
}
-/*
- * Parameter block passed down to zap_pte_range in exceptional cases.
- */
-struct zap_details {
- struct folio *single_folio; /* Locked folio to be unmapped */
- bool even_cows; /* Zap COWed private pages too? */
- zap_flags_t zap_flags; /* Extra flags for zapping */
-};
-
/* Whether we should zap all COWed (private) pages too */
static inline bool should_zap_cows(struct zap_details *details)
{
@@ -1430,6 +1374,8 @@ again:
break;
if (pte_present(ptent)) {
+ unsigned int delay_rmap;
+
page = vm_normal_page(vma, addr, ptent);
if (unlikely(!should_zap_page(details, page)))
continue;
@@ -1441,20 +1387,26 @@ again:
if (unlikely(!page))
continue;
+ delay_rmap = 0;
if (!PageAnon(page)) {
if (pte_dirty(ptent)) {
- force_flush = 1;
set_page_dirty(page);
+ if (tlb_delay_rmap(tlb)) {
+ delay_rmap = 1;
+ force_flush = 1;
+ }
}
if (pte_young(ptent) &&
likely(!(vma->vm_flags & VM_SEQ_READ)))
mark_page_accessed(page);
}
rss[mm_counter(page)]--;
- page_remove_rmap(page, vma, false);
- if (unlikely(page_mapcount(page) < 0))
- print_bad_pte(vma, addr, ptent, page);
- if (unlikely(__tlb_remove_page(tlb, page))) {
+ if (!delay_rmap) {
+ page_remove_rmap(page, vma, false);
+ if (unlikely(page_mapcount(page) < 0))
+ print_bad_pte(vma, addr, ptent, page);
+ }
+ if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
force_flush = 1;
addr += PAGE_SIZE;
break;
@@ -1511,8 +1463,10 @@ again:
arch_leave_lazy_mmu_mode();
/* Do the actual TLB flush before dropping ptl */
- if (force_flush)
+ if (force_flush) {
tlb_flush_mmu_tlbonly(tlb);
+ tlb_flush_rmaps(tlb, vma);
+ }
pte_unmap_unlock(start_pte, ptl);
/*
@@ -1685,10 +1639,8 @@ static void unmap_single_vma(struct mmu_gather *tlb,
if (vma->vm_file) {
zap_flags_t zap_flags = details ?
details->zap_flags : 0;
- i_mmap_lock_write(vma->vm_file->f_mapping);
__unmap_hugepage_range_final(tlb, vma, start, end,
NULL, zap_flags);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
}
} else
unmap_page_range(tlb, vma, start, end, details);
@@ -1698,6 +1650,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlb: address of the caller's struct mmu_gather
+ * @mt: the maple tree
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
@@ -1713,22 +1666,24 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-void unmap_vmas(struct mmu_gather *tlb,
+void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr)
{
struct mmu_notifier_range range;
struct zap_details details = {
- .zap_flags = ZAP_FLAG_DROP_MARKER,
+ .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
/* Careful - we need to zap private pages too! */
.even_cows = true,
};
+ MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
start_addr, end_addr);
mmu_notifier_invalidate_range_start(&range);
- for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
+ do {
unmap_single_vma(tlb, vma, start_addr, end_addr, &details);
+ } while ((vma = mas_find(&mas, end_addr - 1)) != NULL);
mmu_notifier_invalidate_range_end(&range);
}
@@ -1743,8 +1698,11 @@ void unmap_vmas(struct mmu_gather *tlb,
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long size)
{
+ struct maple_tree *mt = &vma->vm_mm->mm_mt;
+ unsigned long end = start + size;
struct mmu_notifier_range range;
struct mmu_gather tlb;
+ MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
@@ -1752,8 +1710,9 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
- for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
+ do {
unmap_single_vma(&tlb, vma, start, range.end, NULL);
+ } while ((vma = mas_find(&mas, end - 1)) != NULL);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
@@ -1767,19 +1726,27 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
*
* The range must fit into one VMA.
*/
-static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
+ const unsigned long end = address + size;
struct mmu_notifier_range range;
struct mmu_gather tlb;
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- address, address + size);
+ address, end);
+ if (is_vm_hugetlb_page(vma))
+ adjust_range_if_pmd_sharing_possible(vma, &range.start,
+ &range.end);
tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
- unmap_single_vma(&tlb, vma, address, range.end, details);
+ /*
+ * unmap 'address-end' not 'range.start-range.end' as range
+ * could have been expanded for hugetlb pmd sharing.
+ */
+ unmap_single_vma(&tlb, vma, address, end, details);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
@@ -1853,7 +1820,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
return -EBUSY;
/* Ok, finally just insert the thing.. */
get_page(page);
- inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+ inc_mm_counter(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, vma, false);
set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
return 0;
@@ -2841,10 +2808,16 @@ static inline int pte_unmap_same(struct vm_fault *vmf)
return same;
}
-static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
- struct vm_fault *vmf)
+/*
+ * Return:
+ * 0: copy succeeded
+ * -EHWPOISON: copy failed due to hwpoison in source page
+ * -EAGAIN: copy failed (some other reason)
+ */
+static inline int __wp_page_copy_user(struct page *dst, struct page *src,
+ struct vm_fault *vmf)
{
- bool ret;
+ int ret;
void *kaddr;
void __user *uaddr;
bool locked = false;
@@ -2853,8 +2826,11 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
unsigned long addr = vmf->address;
if (likely(src)) {
- copy_user_highpage(dst, src, addr, vma);
- return true;
+ if (copy_mc_user_highpage(dst, src, addr, vma)) {
+ memory_failure_queue(page_to_pfn(src), 0);
+ return -EHWPOISON;
+ }
+ return 0;
}
/*
@@ -2870,7 +2846,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
* On architectures with software "accessed" bits, we would
* take a double page fault, so mark it accessed here.
*/
- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+ if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
pte_t entry;
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
@@ -2881,7 +2857,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
* and update local tlb only
*/
update_mmu_tlb(vma, addr, vmf->pte);
- ret = false;
+ ret = -EAGAIN;
goto pte_unlock;
}
@@ -2906,7 +2882,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
/* The PTE changed under us, update local tlb */
update_mmu_tlb(vma, addr, vmf->pte);
- ret = false;
+ ret = -EAGAIN;
goto pte_unlock;
}
@@ -2925,7 +2901,7 @@ warn:
}
}
- ret = true;
+ ret = 0;
pte_unlock:
if (locked)
@@ -3097,6 +3073,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
pte_t entry;
int page_copied = 0;
struct mmu_notifier_range range;
+ int ret;
delayacct_wpcopy_start();
@@ -3114,20 +3091,23 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
if (!new_page)
goto oom;
- if (!__wp_page_copy_user(new_page, old_page, vmf)) {
+ ret = __wp_page_copy_user(new_page, old_page, vmf);
+ if (ret) {
/*
* COW failed, if the fault was solved by another thread,
* it's fine. If not, userspace would re-fault on
* the same address and we will handle the fault
* from the second attempt.
+ * The -EHWPOISON case will not be retried.
*/
put_page(new_page);
if (old_page)
put_page(old_page);
delayacct_wpcopy_end();
- return 0;
+ return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
}
+ kmsan_copy_page_meta(new_page, old_page);
}
if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
@@ -3148,12 +3128,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
- dec_mm_counter_fast(mm,
- mm_counter_file(old_page));
- inc_mm_counter_fast(mm, MM_ANONPAGES);
+ dec_mm_counter(mm, mm_counter_file(old_page));
+ inc_mm_counter(mm, MM_ANONPAGES);
}
} else {
- inc_mm_counter_fast(mm, MM_ANONPAGES);
+ inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
@@ -3234,7 +3213,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
}
delayacct_wpcopy_end();
- return (page_copied && !unshare) ? VM_FAULT_WRITE : 0;
+ return 0;
oom_free_new:
put_page(new_page);
oom:
@@ -3298,14 +3277,14 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
return finish_mkwrite_fault(vmf);
}
wp_page_reuse(vmf);
- return VM_FAULT_WRITE;
+ return 0;
}
static vm_fault_t wp_page_shared(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
- vm_fault_t ret = VM_FAULT_WRITE;
+ vm_fault_t ret = 0;
get_page(vmf->page);
@@ -3362,9 +3341,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
{
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
-
- VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
- VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
+ struct folio *folio = NULL;
if (likely(!unshare)) {
if (userfaultfd_pte_wp(vma, *vmf->pte)) {
@@ -3382,13 +3359,12 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
}
vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
- if (!vmf->page) {
- if (unlikely(unshare)) {
- /* No anonymous page -> nothing to do. */
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return 0;
- }
+ /*
+ * Shared mapping: we are guaranteed to have VM_WRITE and
+ * FAULT_FLAG_WRITE set at this point.
+ */
+ if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
* VM_PFNMAP VMA.
@@ -3396,84 +3372,76 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable and/or call ops->pfn_mkwrite.
*/
- if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
- (VM_WRITE|VM_SHARED))
+ if (!vmf->page)
return wp_pfn_shared(vmf);
-
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return wp_page_copy(vmf);
+ return wp_page_shared(vmf);
}
+ if (vmf->page)
+ folio = page_folio(vmf->page);
+
/*
- * Take out anonymous pages first, anonymous shared vmas are
- * not dirty accountable.
+ * Private mapping: create an exclusive anonymous page copy if reuse
+ * is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
*/
- if (PageAnon(vmf->page)) {
- struct page *page = vmf->page;
-
+ if (folio && folio_test_anon(folio)) {
/*
* If the page is exclusive to this process we must reuse the
* page without further checks.
*/
- if (PageAnonExclusive(page))
+ if (PageAnonExclusive(vmf->page))
goto reuse;
/*
- * We have to verify under page lock: these early checks are
- * just an optimization to avoid locking the page and freeing
+ * We have to verify under folio lock: these early checks are
+ * just an optimization to avoid locking the folio and freeing
* the swapcache if there is little hope that we can reuse.
*
- * PageKsm() doesn't necessarily raise the page refcount.
+ * KSM doesn't necessarily raise the folio refcount.
*/
- if (PageKsm(page) || page_count(page) > 3)
+ if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
goto copy;
- if (!PageLRU(page))
+ if (!folio_test_lru(folio))
/*
* Note: We cannot easily detect+handle references from
- * remote LRU pagevecs or references to PageLRU() pages.
+ * remote LRU pagevecs or references to LRU folios.
*/
lru_add_drain();
- if (page_count(page) > 1 + PageSwapCache(page))
+ if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
goto copy;
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto copy;
- if (PageSwapCache(page))
- try_to_free_swap(page);
- if (PageKsm(page) || page_count(page) != 1) {
- unlock_page(page);
+ if (folio_test_swapcache(folio))
+ folio_free_swap(folio);
+ if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
+ folio_unlock(folio);
goto copy;
}
/*
- * Ok, we've got the only page reference from our mapping
- * and the page is locked, it's dark out, and we're wearing
+ * Ok, we've got the only folio reference from our mapping
+ * and the folio is locked, it's dark out, and we're wearing
* sunglasses. Hit it.
*/
- page_move_anon_rmap(page, vma);
- unlock_page(page);
+ page_move_anon_rmap(vmf->page, vma);
+ folio_unlock(folio);
reuse:
if (unlikely(unshare)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
wp_page_reuse(vmf);
- return VM_FAULT_WRITE;
- } else if (unshare) {
- /* No anonymous page -> nothing to do. */
- pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
- } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
- (VM_WRITE|VM_SHARED))) {
- return wp_page_shared(vmf);
}
copy:
/*
* Ok, we need to copy. Oh, well..
*/
- get_page(vmf->page);
+ if (folio)
+ folio_get(folio);
pte_unmap_unlock(vmf->pte, vmf->ptl);
#ifdef CONFIG_KSM
- if (PageKsm(vmf->page))
+ if (folio && folio_test_ksm(folio))
count_vm_event(COW_KSM);
#endif
return wp_page_copy(vmf);
@@ -3612,11 +3580,11 @@ EXPORT_SYMBOL(unmap_mapping_range);
*/
static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
{
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
struct vm_area_struct *vma = vmf->vma;
struct mmu_notifier_range range;
- if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
+ if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags))
return VM_FAULT_RETRY;
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
vma->vm_mm, vmf->address & PAGE_MASK,
@@ -3626,23 +3594,23 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
- restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
+ restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
- unlock_page(page);
+ folio_unlock(folio);
mmu_notifier_invalidate_range_end(&range);
return 0;
}
-static inline bool should_try_to_free_swap(struct page *page,
+static inline bool should_try_to_free_swap(struct folio *folio,
struct vm_area_struct *vma,
unsigned int fault_flags)
{
- if (!PageSwapCache(page))
+ if (!folio_test_swapcache(folio))
return false;
- if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) ||
- PageMlocked(page))
+ if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) ||
+ folio_test_mlocked(folio))
return true;
/*
* If we want to map a page that's in the swapcache writable, we
@@ -3650,8 +3618,8 @@ static inline bool should_try_to_free_swap(struct page *page,
* user. Try freeing the swapcache to get rid of the swapcache
* reference only in case it's likely that we'll be the exclusive user.
*/
- return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
- page_count(page) == 2;
+ return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
+ folio_ref_count(folio) == 2;
}
static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
@@ -3693,11 +3661,14 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
unsigned long marker = pte_marker_get(entry);
/*
- * PTE markers should always be with file-backed memories, and the
- * marker should never be empty. If anything weird happened, the best
- * thing to do is to kill the process along with its mm.
+ * PTE markers should never be empty. If anything weird happened,
+ * the best thing to do is to kill the process along with its mm.
*/
- if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker))
+ if (WARN_ON_ONCE(!marker))
+ return VM_FAULT_SIGBUS;
+
+ /* Higher priority than uffd-wp when data corrupted */
+ if (marker & PTE_MARKER_SWAPIN_ERROR)
return VM_FAULT_SIGBUS;
if (pte_marker_entry_uffd_wp(entry))
@@ -3718,7 +3689,8 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = NULL, *swapcache;
+ struct folio *swapcache, *folio = NULL;
+ struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
bool exclusive = false;
@@ -3741,11 +3713,23 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = remove_device_exclusive_entry(vmf);
} else if (is_device_private_entry(entry)) {
vmf->page = pfn_swap_entry_to_page(entry);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+ spin_unlock(vmf->ptl);
+ goto out;
+ }
+
+ /*
+ * Get a page reference while we know the page can't be
+ * freed.
+ */
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+ put_page(vmf->page);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
- } else if (is_swapin_error_entry(entry)) {
- ret = VM_FAULT_SIGBUS;
} else if (is_pte_marker_entry(entry)) {
ret = handle_pte_marker(vmf);
} else {
@@ -3760,21 +3744,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (unlikely(!si))
goto out;
- page = lookup_swap_cache(entry, vma, vmf->address);
- swapcache = page;
+ folio = swap_cache_get_folio(entry, vma, vmf->address);
+ if (folio)
+ page = folio_file_page(folio, swp_offset(entry));
+ swapcache = folio;
- if (!page) {
+ if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
/* skip swapcache */
- page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
- vmf->address);
- if (page) {
- __SetPageLocked(page);
- __SetPageSwapBacked(page);
-
- if (mem_cgroup_swapin_charge_page(page,
- vma->vm_mm, GFP_KERNEL, entry)) {
+ folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
+ vma, vmf->address, false);
+ page = &folio->page;
+ if (folio) {
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+
+ if (mem_cgroup_swapin_charge_folio(folio,
+ vma->vm_mm, GFP_KERNEL,
+ entry)) {
ret = VM_FAULT_OOM;
goto out_page;
}
@@ -3782,23 +3770,24 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
- workingset_refault(page_folio(page),
- shadow);
+ workingset_refault(folio, shadow);
- lru_cache_add(page);
+ folio_add_lru(folio);
/* To provide entry to swap_readpage() */
- set_page_private(page, entry.val);
+ folio_set_swap_entry(folio, entry);
swap_readpage(page, true, NULL);
- set_page_private(page, 0);
+ folio->private = NULL;
}
} else {
page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
vmf);
- swapcache = page;
+ if (page)
+ folio = page_folio(page);
+ swapcache = folio;
}
- if (!page) {
+ if (!folio) {
/*
* Back out if somebody else faulted in this pte
* while we released the pte lock.
@@ -3823,7 +3812,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_release;
}
- locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
+ locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags);
if (!locked) {
ret |= VM_FAULT_RETRY;
@@ -3832,13 +3821,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (swapcache) {
/*
- * Make sure try_to_free_swap or swapoff did not release the
+ * Make sure folio_free_swap() or swapoff did not release the
* swapcache from under us. The page pin, and pte_same test
* below, are not enough to exclude that. Even if it is still
* swapcache, we need to check that the page's swap has not
* changed.
*/
- if (unlikely(!PageSwapCache(page) ||
+ if (unlikely(!folio_test_swapcache(folio) ||
page_private(page) != entry.val))
goto out_page;
@@ -3850,9 +3839,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
page = ksm_might_need_to_copy(page, vma, vmf->address);
if (unlikely(!page)) {
ret = VM_FAULT_OOM;
- page = swapcache;
goto out_page;
}
+ folio = page_folio(page);
/*
* If we want to map a page that's in the swapcache writable, we
@@ -3860,8 +3849,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* owner. Try removing the extra reference from the local LRU
* pagevecs if required.
*/
- if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache &&
- !PageKsm(page) && !PageLRU(page))
+ if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
+ !folio_test_ksm(folio) && !folio_test_lru(folio))
lru_add_drain();
}
@@ -3875,7 +3864,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
goto out_nomap;
- if (unlikely(!PageUptodate(page))) {
+ if (unlikely(!folio_test_uptodate(folio))) {
ret = VM_FAULT_SIGBUS;
goto out_nomap;
}
@@ -3888,26 +3877,26 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* check after taking the PT lock and making sure that nobody
* concurrently faulted in this page and set PG_anon_exclusive.
*/
- BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
- BUG_ON(PageAnon(page) && PageAnonExclusive(page));
+ BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
+ BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
/*
* Check under PT lock (to protect against concurrent fork() sharing
* the swap entry concurrently) for certainly exclusive pages.
*/
- if (!PageKsm(page)) {
+ if (!folio_test_ksm(folio)) {
/*
* Note that pte_swp_exclusive() == false for architectures
* without __HAVE_ARCH_PTE_SWP_EXCLUSIVE.
*/
exclusive = pte_swp_exclusive(vmf->orig_pte);
- if (page != swapcache) {
+ if (folio != swapcache) {
/*
* We have a fresh page that is not exposed to the
* swapcache -> certainly exclusive.
*/
exclusive = true;
- } else if (exclusive && PageWriteback(page) &&
+ } else if (exclusive && folio_test_writeback(folio) &&
data_race(si->flags & SWP_STABLE_WRITES)) {
/*
* This is tricky: not all swap backends support
@@ -3937,11 +3926,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* yet.
*/
swap_free(entry);
- if (should_try_to_free_swap(page, vma, vmf->flags))
- try_to_free_swap(page);
+ if (should_try_to_free_swap(folio, vma, vmf->flags))
+ folio_free_swap(folio);
- inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+ dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
/*
@@ -3950,11 +3939,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* exposing them to the swapcache or because the swap entry indicates
* exclusivity.
*/
- if (!PageKsm(page) && (exclusive || page_count(page) == 1)) {
+ if (!folio_test_ksm(folio) &&
+ (exclusive || folio_ref_count(folio) == 1)) {
if (vmf->flags & FAULT_FLAG_WRITE) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
- ret |= VM_FAULT_WRITE;
}
rmap_flags |= RMAP_EXCLUSIVE;
}
@@ -3968,19 +3957,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->orig_pte = pte;
/* ksm created a completely new copy */
- if (unlikely(page != swapcache && swapcache)) {
+ if (unlikely(folio != swapcache && swapcache)) {
page_add_new_anon_rmap(page, vma, vmf->address);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ folio_add_lru_vma(folio, vma);
} else {
page_add_anon_rmap(page, vma, vmf->address, rmap_flags);
}
- VM_BUG_ON(!PageAnon(page) || (pte_write(pte) && !PageAnonExclusive(page)));
+ VM_BUG_ON(!folio_test_anon(folio) ||
+ (pte_write(pte) && !PageAnonExclusive(page)));
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
- unlock_page(page);
- if (page != swapcache && swapcache) {
+ folio_unlock(folio);
+ if (folio != swapcache && swapcache) {
/*
* Hold the lock to avoid the swap entry to be reused
* until we take the PT lock for the pte_same() check
@@ -3989,8 +3979,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* so that the swap count won't change under a
* parallel locked swapcache.
*/
- unlock_page(swapcache);
- put_page(swapcache);
+ folio_unlock(swapcache);
+ folio_put(swapcache);
}
if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -4011,12 +4001,12 @@ out:
out_nomap:
pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
- unlock_page(page);
+ folio_unlock(folio);
out_release:
- put_page(page);
- if (page != swapcache && swapcache) {
- unlock_page(swapcache);
- put_page(swapcache);
+ folio_put(folio);
+ if (folio != swapcache && swapcache) {
+ folio_unlock(swapcache);
+ folio_put(swapcache);
}
if (si)
put_swap_device(si);
@@ -4104,7 +4094,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
if (!pte_none(*vmf->pte)) {
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
goto release;
}
@@ -4119,7 +4109,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
return handle_userfault(vmf, VM_UFFD_MISSING);
}
- inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address);
lru_cache_add_inactive_or_unevictable(page, vma);
setpte:
@@ -4309,11 +4299,11 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
entry = pte_mkuffd_wp(pte_wrprotect(entry));
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
- inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr);
lru_cache_add_inactive_or_unevictable(page, vma);
} else {
- inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+ inc_mm_counter(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, vma, false);
}
set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
@@ -4386,14 +4376,20 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- ret = 0;
+
/* Re-check under ptl */
- if (likely(!vmf_pte_changed(vmf)))
+ if (likely(!vmf_pte_changed(vmf))) {
do_set_pte(vmf, page, vmf->address);
- else
+
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+
+ ret = 0;
+ } else {
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
ret = VM_FAULT_NOPAGE;
+ }
- update_mmu_tlb(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
}
@@ -4677,10 +4673,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
int page_nid = NUMA_NO_NODE;
+ bool writable = false;
int last_cpupid;
int target_nid;
pte_t pte, old_pte;
- bool was_writable = pte_savedwrite(vmf->orig_pte);
int flags = 0;
/*
@@ -4699,6 +4695,15 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
old_pte = ptep_get(vmf->pte);
pte = pte_modify(old_pte, vma->vm_page_prot);
+ /*
+ * Detect now whether the PTE could be writable; this information
+ * is only valid while holding the PT lock.
+ */
+ writable = pte_write(pte);
+ if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
+ can_change_pte_writable(vma, vmf->address, pte))
+ writable = true;
+
page = vm_normal_page(vma, vmf->address, pte);
if (!page || is_zone_device_page(page))
goto out_map;
@@ -4715,7 +4720,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
* pte_dirty has unpredictable behaviour between PTE scan updates,
* background writeback, dirty balancing and application behaviour.
*/
- if (!was_writable)
+ if (!writable)
flags |= TNF_NO_GROUP;
/*
@@ -4725,8 +4730,16 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
flags |= TNF_SHARED;
- last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
+ /*
+ * For memory tiering mode, the cpupid of a slow memory page is used
+ * to record page access time, so use the default value.
+ */
+ if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ !node_is_toptier(page_nid))
+ last_cpupid = (-1 & LAST_CPUPID_MASK);
+ else
+ last_cpupid = page_cpupid_last(page);
target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
if (target_nid == NUMA_NO_NODE) {
@@ -4734,6 +4747,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
goto out_map;
}
pte_unmap_unlock(vmf->pte, vmf->ptl);
+ writable = false;
/* Migrate to the requested node */
if (migrate_misplaced_page(page, vma, target_nid)) {
@@ -4762,7 +4776,7 @@ out_map:
old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
pte = pte_modify(old_pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
- if (was_writable)
+ if (writable)
pte = pte_mkwrite(pte);
ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
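The writable decision in do_numa_page() above is only meaningful while vmf->ptl is held, so it is recomputed after the lock is retaken rather than cached across the migration attempt. A condensed sketch of that pattern (illustrative only, not a literal excerpt from the patch):

	/* valid only while vmf->ptl is held */
	writable = pte_write(pte) ||
		   (vma_wants_manual_pte_write_upgrade(vma) &&
		    can_change_pte_writable(vma, vmf->address, pte));

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	writable = false;	/* stale the moment the lock is dropped */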
@@ -4783,6 +4797,7 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
{
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
+ vm_fault_t ret;
if (vma_is_anonymous(vmf->vma)) {
if (likely(!unshare) &&
@@ -4790,11 +4805,13 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
return handle_userfault(vmf, VM_UFFD_WP);
return do_huge_pmd_wp_page(vmf);
}
- if (vmf->vma->vm_ops->huge_fault) {
- vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
- if (!(ret & VM_FAULT_FALLBACK))
- return ret;
+ if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+ if (vmf->vma->vm_ops->huge_fault) {
+ ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ }
}
/* COW or write-notify handled on pte level: split pmd. */
@@ -4820,14 +4837,17 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ vm_fault_t ret;
+
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vmf->vma))
goto split;
- if (vmf->vma->vm_ops->huge_fault) {
- vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
-
- if (!(ret & VM_FAULT_FALLBACK))
- return ret;
+ if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+ if (vmf->vma->vm_ops->huge_fault) {
+ ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ }
}
split:
/* COW or write-notify not handled on PUD level: split pud.*/
@@ -4985,7 +5005,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
return VM_FAULT_OOM;
retry_pud:
if (pud_none(*vmf.pud) &&
- hugepage_vma_check(vma, vm_flags, false, true)) {
+ hugepage_vma_check(vma, vm_flags, false, true, true)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -5019,7 +5039,7 @@ retry_pud:
goto retry_pud;
if (pmd_none(*vmf.pmd) &&
- hugepage_vma_check(vma, vm_flags, false, true)) {
+ hugepage_vma_check(vma, vm_flags, false, true, true)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -5114,6 +5134,51 @@ static inline void mm_account_fault(struct pt_regs *regs,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}
+#ifdef CONFIG_LRU_GEN
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+ /* the LRU algorithm doesn't apply to sequential or random reads */
+ current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
+}
+
+static void lru_gen_exit_fault(void)
+{
+ current->in_lru_fault = false;
+}
+#else
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+}
+
+static void lru_gen_exit_fault(void)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
+static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
+ unsigned int *flags)
+{
+ if (unlikely(*flags & FAULT_FLAG_UNSHARE)) {
+ if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE))
+ return VM_FAULT_SIGSEGV;
+ /*
+ * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's
+ * just treat it like an ordinary read-fault otherwise.
+ */
+ if (!is_cow_mapping(vma->vm_flags))
+ *flags &= ~FAULT_FLAG_UNSHARE;
+ } else if (*flags & FAULT_FLAG_WRITE) {
+ /* Write faults on read-only mappings are impossible ... */
+ if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)))
+ return VM_FAULT_SIGSEGV;
+ /* ... and FOLL_FORCE only applies to COW mappings. */
+ if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) &&
+ !is_cow_mapping(vma->vm_flags)))
+ return VM_FAULT_SIGSEGV;
+ }
+ return 0;
+}
+
/*
* By the time we get here, we already hold the mm semaphore
*
@@ -5130,8 +5195,9 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
count_vm_event(PGFAULT);
count_memcg_event_mm(vma->vm_mm, PGFAULT);
- /* do counter updates before entering really critical section. */
- check_sync_rss_stat(current);
+ ret = sanitize_fault_flags(vma, &flags);
+ if (ret)
+ return ret;
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
@@ -5145,11 +5211,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (flags & FAULT_FLAG_USER)
mem_cgroup_enter_user_fault();
+ lru_gen_enter_fault(vma);
+
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
ret = __handle_mm_fault(vma, address, flags);
+ lru_gen_exit_fault();
+
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*
@@ -5637,11 +5707,11 @@ static void clear_gigantic_page(struct page *page,
unsigned int pages_per_huge_page)
{
int i;
- struct page *p = page;
+ struct page *p;
might_sleep();
- for (i = 0; i < pages_per_huge_page;
- i++, p = mem_map_next(p, page, i)) {
+ for (i = 0; i < pages_per_huge_page; i++) {
+ p = nth_page(page, i);
cond_resched();
clear_user_highpage(p, addr + i * PAGE_SIZE);
}
@@ -5677,13 +5747,12 @@ static void copy_user_gigantic_page(struct page *dst, struct page *src,
struct page *dst_base = dst;
struct page *src_base = src;
- for (i = 0; i < pages_per_huge_page; ) {
+ for (i = 0; i < pages_per_huge_page; i++) {
+ dst = nth_page(dst_base, i);
+ src = nth_page(src_base, i);
+
cond_resched();
copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
-
- i++;
- dst = mem_map_next(dst, dst_base, i);
- src = mem_map_next(src, src_base, i);
}
}
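The loops above and below drop mem_map_next() in favour of nth_page(), which stays correct even when struct pages are not virtually contiguous, unlike plain pointer arithmetic on the head page. A minimal sketch of the resulting shape (the helper name is hypothetical):

static void clear_subpages_example(struct page *head, unsigned long addr,
				   unsigned int nr_pages)
{
	unsigned int i;

	for (i = 0; i < nr_pages; i++) {
		struct page *p = nth_page(head, i);	/* i-th subpage */

		cond_resched();
		clear_user_highpage(p, addr + i * PAGE_SIZE);
	}
}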
@@ -5730,10 +5799,10 @@ long copy_huge_page_from_user(struct page *dst_page,
void *page_kaddr;
unsigned long i, rc = 0;
unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
- struct page *subpage = dst_page;
+ struct page *subpage;
- for (i = 0; i < pages_per_huge_page;
- i++, subpage = mem_map_next(subpage, dst_page, i)) {
+ for (i = 0; i < pages_per_huge_page; i++) {
+ subpage = nth_page(dst_page, i);
if (allow_pagefault)
page_kaddr = kmap(subpage);
else
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fad6d1f2262a..fd40f7e9f176 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1085,8 +1085,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
* of the physical memory space for vmemmaps. That space is pageblock
* aligned.
*/
- if (WARN_ON_ONCE(!nr_pages ||
- !IS_ALIGNED(pfn, pageblock_nr_pages) ||
+ if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(pfn) ||
!IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
return -EINVAL;
@@ -1806,8 +1805,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
* of the physical memory space for vmemmaps. That space is pageblock
* aligned.
*/
- if (WARN_ON_ONCE(!nr_pages ||
- !IS_ALIGNED(start_pfn, pageblock_nr_pages) ||
+ if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(start_pfn) ||
!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
return -EINVAL;
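pageblock_aligned() used above is introduced elsewhere in this series; presumably it is just a readability wrapper along these lines:

/* rough equivalent of the helper, for reference */
#define pageblock_aligned(pfn)	IS_ALIGNED((pfn), pageblock_nr_pages)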
@@ -1940,8 +1938,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
node_states_clear_node(node, &arg);
if (arg.status_change_nid >= 0) {
- kswapd_stop(node);
kcompactd_stop(node);
+ kswapd_stop(node);
}
writeback_set_ratelimit();
@@ -1969,11 +1967,10 @@ failed_removal:
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
- int ret = !is_memblock_offlined(mem);
int *nid = arg;
*nid = mem->nid;
- if (unlikely(ret)) {
+ if (unlikely(mem->state != MEM_OFFLINE)) {
phys_addr_t beginpa, endpa;
beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b73d3248d976..02c8a712282f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -381,9 +381,10 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
mmap_write_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
+ for_each_vma(vmi, vma)
mpol_rebind_policy(vma->vm_policy, new);
mmap_write_unlock(mm);
}
@@ -654,7 +655,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
static int queue_pages_test_walk(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->vma;
+ struct vm_area_struct *next, *vma = walk->vma;
struct queue_pages *qp = walk->private;
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
@@ -669,9 +670,10 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
/* hole at head side of range */
return -EFAULT;
}
+ next = find_vma(vma->vm_mm, vma->vm_end);
if (!(flags & MPOL_MF_DISCONTIG_OK) &&
((vma->vm_end < qp->end) &&
- (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
+ (!next || vma->vm_end < next->vm_start)))
/* hole at middle or tail of range */
return -EFAULT;
@@ -785,26 +787,29 @@ static int vma_replace_policy(struct vm_area_struct *vma,
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
+ MA_STATE(mas, &mm->mm_mt, start, start);
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
pgoff_t pgoff;
- unsigned long vmstart;
- unsigned long vmend;
- vma = find_vma(mm, start);
- VM_BUG_ON(!vma);
+ prev = mas_prev(&mas, 0);
+ if (unlikely(!prev))
+ mas_set(&mas, start);
+
+ vma = mas_find(&mas, end - 1);
+ if (WARN_ON(!vma))
+ return 0;
- prev = vma->vm_prev;
if (start > vma->vm_start)
prev = vma;
- for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
- vmstart = max(start, vma->vm_start);
- vmend = min(end, vma->vm_end);
+ for (; vma; vma = mas_next(&mas, end - 1)) {
+ unsigned long vmstart = max(start, vma->vm_start);
+ unsigned long vmend = min(end, vma->vm_end);
if (mpol_equal(vma_policy(vma), new_pol))
- continue;
+ goto next;
pgoff = vma->vm_pgoff +
((vmstart - vma->vm_start) >> PAGE_SHIFT);
@@ -813,6 +818,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
new_pol, vma->vm_userfaultfd_ctx,
anon_vma_name(vma));
if (prev) {
+ /* vma_merge() invalidated the mas */
+ mas_pause(&mas);
vma = prev;
goto replace;
}
@@ -820,19 +827,25 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
err = split_vma(vma->vm_mm, vma, vmstart, 1);
if (err)
goto out;
+ /* split_vma() invalidated the mas */
+ mas_pause(&mas);
}
if (vma->vm_end != vmend) {
err = split_vma(vma->vm_mm, vma, vmend, 0);
if (err)
goto out;
+ /* split_vma() invalidated the mas */
+ mas_pause(&mas);
}
- replace:
+replace:
err = vma_replace_policy(vma, new_pol);
if (err)
goto out;
+next:
+ prev = vma;
}
- out:
+out:
return err;
}
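The mas_pause() calls in mbind_range() above follow the general maple-tree rule: any operation that rewrites the tree (split_vma(), vma_merge()) invalidates the iteration state, which must be paused so the next step re-walks from the current index. A schematic of the pattern (helper names are hypothetical):

	mas_for_each(&mas, vma, end - 1) {
		if (needs_tree_rewrite(vma)) {	/* e.g. a split or merge */
			rewrite_tree(vma);
			mas_pause(&mas);	/* force a re-walk on the next step */
		}
	}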
@@ -853,12 +866,14 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
goto out;
}
+ task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
+ task_unlock(current);
mpol_put(new);
goto out;
}
- task_lock(current);
+
old = current->mempolicy;
current->mempolicy = new;
if (new && new->mode == MPOL_INTERLEAVE)
@@ -1047,6 +1062,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
int flags)
{
nodemask_t nmask;
+ struct vm_area_struct *vma;
LIST_HEAD(pagelist);
int err = 0;
struct migration_target_control mtc = {
@@ -1062,8 +1078,9 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
* need migration. Between passing in the full user address
* space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
*/
+ vma = find_vma(mm, 0);
VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
- queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
+ queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
@@ -1193,14 +1210,13 @@ static struct page *new_page(struct page *page, unsigned long start)
struct folio *dst, *src = page_folio(page);
struct vm_area_struct *vma;
unsigned long address;
+ VMA_ITERATOR(vmi, current->mm, start);
gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
- vma = find_vma(current->mm, start);
- while (vma) {
+ for_each_vma(vmi, vma) {
address = page_address_in_vma(page, vma);
if (address != -EFAULT)
break;
- vma = vma->vm_next;
}
if (folio_test_hugetlb(src))
@@ -1259,7 +1275,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (mode == MPOL_DEFAULT)
flags &= ~MPOL_MF_STRICT;
- len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+ len = PAGE_ALIGN(len);
end = start + len;
if (end < start)
@@ -1478,6 +1494,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
unsigned long vmend;
unsigned long end;
int err = -ENOENT;
+ VMA_ITERATOR(vmi, mm, start);
start = untagged_addr(start);
if (start & ~PAGE_MASK)
@@ -1495,7 +1512,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
if (home_node >= MAX_NUMNODES || !node_online(home_node))
return -EINVAL;
- len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+ len = PAGE_ALIGN(len);
end = start + len;
if (end < start)
@@ -1503,9 +1520,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
if (end == start)
return 0;
mmap_write_lock(mm);
- vma = find_vma(mm, start);
- for (; vma && vma->vm_start < end; vma = vma->vm_next) {
-
+ for_each_vma_range(vmi, vma, end) {
vmstart = max(start, vma->vm_start);
vmend = min(end, vma->vm_end);
new = mpol_dup(vma_policy(vma));
@@ -1525,6 +1540,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
* the home node for vmas we already updated before.
*/
if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) {
+ mpol_put(new);
err = -EOPNOTSUPP;
break;
}
@@ -1803,7 +1819,7 @@ bool vma_policy_mof(struct vm_area_struct *vma)
return pol->flags & MPOL_F_MOF;
}
-static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
+bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
enum zone_type dynamic_policy_zone = policy_zone;
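The mempolicy.c conversions above all follow the same shape, since the vma linked list (vm_next/vm_prev) is removed elsewhere in this series. Roughly, walks of the first form become walks of the second (do_something() is a placeholder):

	/* old: linked-list walk */
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		do_something(vma);

	/* new: maple-tree walk */
	VMA_ITERATOR(vmi, mm, 0);
	for_each_vma(vmi, vma)
		do_something(vma);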
diff --git a/mm/mempool.c b/mm/mempool.c
index 96488b13a1ef..734bcf5afbb7 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -57,8 +57,10 @@ static void __check_element(mempool_t *pool, void *element, size_t size)
static void check_element(mempool_t *pool, void *element)
{
/* Mempools backed by slab allocator */
- if (pool->free == mempool_free_slab || pool->free == mempool_kfree) {
- __check_element(pool, element, ksize(element));
+ if (pool->free == mempool_kfree) {
+ __check_element(pool, element, (size_t)pool->pool_data);
+ } else if (pool->free == mempool_free_slab) {
+ __check_element(pool, element, kmem_cache_size(pool->pool_data));
} else if (pool->free == mempool_free_pages) {
/* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
@@ -80,8 +82,10 @@ static void __poison_element(void *element, size_t size)
static void poison_element(mempool_t *pool, void *element)
{
/* Mempools backed by slab allocator */
- if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) {
- __poison_element(element, ksize(element));
+ if (pool->alloc == mempool_kmalloc) {
+ __poison_element(element, (size_t)pool->pool_data);
+ } else if (pool->alloc == mempool_alloc_slab) {
+ __poison_element(element, kmem_cache_size(pool->pool_data));
} else if (pool->alloc == mempool_alloc_pages) {
/* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
@@ -111,8 +115,10 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
static void kasan_unpoison_element(mempool_t *pool, void *element)
{
- if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
- kasan_unpoison_range(element, __ksize(element));
+ if (pool->alloc == mempool_kmalloc)
+ kasan_unpoison_range(element, (size_t)pool->pool_data);
+ else if (pool->alloc == mempool_alloc_slab)
+ kasan_unpoison_range(element, kmem_cache_size(pool->pool_data));
else if (pool->alloc == mempool_alloc_pages)
kasan_unpoison_pages(element, (unsigned long)pool->pool_data,
false);
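The three mempool hunks above stop asking ksize()/__ksize() for the element size and instead derive it from the pool: a kmalloc-backed pool stores the element size in pool_data, while a kmem_cache-backed pool can ask the cache. A hedged sketch of that rule as a single helper (the helper itself is hypothetical, not part of the patch):

static size_t mempool_elem_size(mempool_t *pool)
{
	if (pool->alloc == mempool_kmalloc)
		return (size_t)pool->pool_data;			/* kmalloc size */
	if (pool->alloc == mempool_alloc_slab)
		return kmem_cache_size(pool->pool_data);	/* object size */
	return 0;	/* page-backed pools use an order, handled separately */
}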
diff --git a/mm/memremap.c b/mm/memremap.c
index 58b20c3c300b..08cbf54fe037 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -138,8 +138,11 @@ void memunmap_pages(struct dev_pagemap *pgmap)
int i;
percpu_ref_kill(&pgmap->ref);
- for (i = 0; i < pgmap->nr_range; i++)
- percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
+ if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ pgmap->type != MEMORY_DEVICE_COHERENT)
+ for (i = 0; i < pgmap->nr_range; i++)
+ percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
+
wait_for_completion(&pgmap->done);
for (i = 0; i < pgmap->nr_range; i++)
@@ -264,7 +267,9 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), pgmap);
- percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
+ if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ pgmap->type != MEMORY_DEVICE_COHERENT)
+ percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
return 0;
err_add_memory:
@@ -330,6 +335,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
WARN(1, "File system DAX not supported\n");
return ERR_PTR(-EINVAL);
}
+ params.pgprot = pgprot_decrypted(params.pgprot);
break;
case MEMORY_DEVICE_GENERIC:
break;
@@ -454,7 +460,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
/* fall back to slow path lookup */
rcu_read_lock();
pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
- if (pgmap && !percpu_ref_tryget_live(&pgmap->ref))
+ if (pgmap && !percpu_ref_tryget_live_rcu(&pgmap->ref))
pgmap = NULL;
rcu_read_unlock();
@@ -502,11 +508,28 @@ void free_zone_device_page(struct page *page)
page->mapping = NULL;
page->pgmap->ops->page_free(page);
+ if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ page->pgmap->type != MEMORY_DEVICE_COHERENT)
+ /*
+ * Reset the page count to 1 to prepare for handing out the page
+ * again.
+ */
+ set_page_count(page, 1);
+ else
+ put_dev_pagemap(page->pgmap);
+}
+
+void zone_device_page_init(struct page *page)
+{
/*
- * Reset the page count to 1 to prepare for handing out the page again.
+ * Drivers shouldn't be allocating pages after calling
+ * memunmap_pages().
*/
+ WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref));
set_page_count(page, 1);
+ lock_page(page);
}
+EXPORT_SYMBOL_GPL(zone_device_page_init);
#ifdef CONFIG_FS_DAX
bool __put_devmap_managed_page_refs(struct page *page, int refs)
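zone_device_page_init() above gives drivers a single call that re-arms a free ZONE_DEVICE page before it is handed out again: it takes a pgmap reference, resets the refcount to 1 and locks the page. A hedged sketch of the intended caller side (driver-side names are hypothetical):

	struct page *page = my_drv_pick_free_device_page(drv);	/* hypothetical */

	zone_device_page_init(page);
	page->zone_device_data = my_private_state;		/* hypothetical */
	/* ... then insert the device-private entry / hand the page out ... */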
diff --git a/mm/migrate.c b/mm/migrate.c
index 6a1597c92261..a4d3fc65085f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -50,6 +50,7 @@
#include <linux/memory.h>
#include <linux/random.h>
#include <linux/sched/sysctl.h>
+#include <linux/memory-tiers.h>
#include <asm/tlbflush.h>
@@ -73,13 +74,22 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode)
if (unlikely(!get_page_unless_zero(page)))
goto out;
+ if (unlikely(PageSlab(page)))
+ goto out_putpage;
+ /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */
+ smp_rmb();
/*
- * Check PageMovable before holding a PG_lock because page's owner
- * assumes anybody doesn't touch PG_lock of newly allocated page
- * so unconditionally grabbing the lock ruins page's owner side.
+ * Check movable flag before taking the page lock because
+ * we use non-atomic bitops on newly allocated page flags so
+ * unconditionally grabbing the lock ruins page's owner side.
*/
if (unlikely(!__PageMovable(page)))
goto out_putpage;
+ /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */
+ smp_rmb();
+ if (unlikely(PageSlab(page)))
+ goto out_putpage;
+
/*
* As movable pages are not isolated from LRU lists, concurrent
* compaction threads can race against page migration functions
@@ -198,7 +208,7 @@ static bool remove_migration_pte(struct folio *folio,
#endif
folio_get(folio);
- pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
+ pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
if (pte_swp_soft_dirty(*pvmw.pte))
pte = pte_mksoft_dirty(pte);
@@ -206,6 +216,10 @@ static bool remove_migration_pte(struct folio *folio,
* Recheck VMA as permissions can change since migration started
*/
entry = pte_to_swp_entry(*pvmw.pte);
+ if (!is_migration_entry_young(entry))
+ pte = pte_mkold(pte);
+ if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
+ pte = pte_mkdirty(pte);
if (is_writable_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
else if (pte_swp_uffd_wp(*pvmw.pte))
@@ -560,6 +574,18 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
* future migrations of this same page.
*/
cpupid = page_cpupid_xchg_last(&folio->page, -1);
+ /*
+	 * For memory tiering mode, when migrating between a slow and a fast
+	 * memory node, reset the cpupid, because it is used to record the
+	 * page access time on the slow memory node.
+ */
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) {
+ bool f_toptier = node_is_toptier(page_to_nid(&folio->page));
+ bool t_toptier = node_is_toptier(page_to_nid(&newfolio->page));
+
+ if (f_toptier != t_toptier)
+ cpupid = -1;
+ }
page_cpupid_xchg_last(&newfolio->page, cpupid);
folio_migrate_ksm(newfolio, folio);
@@ -608,6 +634,25 @@ EXPORT_SYMBOL(folio_migrate_copy);
* Migration functions
***********************************************************/
+int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode, int extra_count)
+{
+ int rc;
+
+ BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */
+
+ rc = folio_migrate_mapping(mapping, dst, src, extra_count);
+
+ if (rc != MIGRATEPAGE_SUCCESS)
+ return rc;
+
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ folio_migrate_copy(dst, src);
+ else
+ folio_migrate_flags(dst, src);
+ return MIGRATEPAGE_SUCCESS;
+}
+
/**
* migrate_folio() - Simple folio migration.
* @mapping: The address_space containing the folio.
@@ -623,20 +668,7 @@ EXPORT_SYMBOL(folio_migrate_copy);
int migrate_folio(struct address_space *mapping, struct folio *dst,
struct folio *src, enum migrate_mode mode)
{
- int rc;
-
- BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */
-
- rc = folio_migrate_mapping(mapping, dst, src, 0);
-
- if (rc != MIGRATEPAGE_SUCCESS)
- return rc;
-
- if (mode != MIGRATE_SYNC_NO_COPY)
- folio_migrate_copy(dst, src);
- else
- folio_migrate_flags(dst, src);
- return MIGRATEPAGE_SUCCESS;
+ return migrate_folio_extra(mapping, dst, src, mode, 0);
}
EXPORT_SYMBOL(migrate_folio);
@@ -797,6 +829,7 @@ int buffer_migrate_folio_norefs(struct address_space *mapping,
{
return __buffer_migrate_folio(mapping, dst, src, mode, true);
}
+EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs);
#endif
int filemap_migrate_folio(struct address_space *mapping,
@@ -976,17 +1009,15 @@ out:
return rc;
}
-static int __unmap_and_move(struct page *page, struct page *newpage,
+static int __unmap_and_move(struct folio *src, struct folio *dst,
int force, enum migrate_mode mode)
{
- struct folio *folio = page_folio(page);
- struct folio *dst = page_folio(newpage);
int rc = -EAGAIN;
bool page_was_mapped = false;
struct anon_vma *anon_vma = NULL;
- bool is_lru = !__PageMovable(page);
+ bool is_lru = !__PageMovable(&src->page);
- if (!trylock_page(page)) {
+ if (!folio_trylock(src)) {
if (!force || mode == MIGRATE_ASYNC)
goto out;
@@ -1006,10 +1037,10 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
if (current->flags & PF_MEMALLOC)
goto out;
- lock_page(page);
+ folio_lock(src);
}
- if (PageWriteback(page)) {
+ if (folio_test_writeback(src)) {
/*
* Only in the case of a full synchronous migration is it
* necessary to wait for PageWriteback. In the async case,
@@ -1026,39 +1057,39 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
}
if (!force)
goto out_unlock;
- wait_on_page_writeback(page);
+ folio_wait_writeback(src);
}
/*
- * By try_to_migrate(), page->mapcount goes down to 0 here. In this case,
- * we cannot notice that anon_vma is freed while we migrates a page.
+ * By try_to_migrate(), src->mapcount goes down to 0 here. In this case,
+ * we cannot notice that anon_vma is freed while we migrate a page.
* This get_anon_vma() delays freeing anon_vma pointer until the end
* of migration. File cache pages are no problem because of page_lock()
* File Caches may use write_page() or lock_page() in migration, then,
* just care Anon page here.
*
- * Only page_get_anon_vma() understands the subtleties of
+ * Only folio_get_anon_vma() understands the subtleties of
* getting a hold on an anon_vma from outside one of its mms.
* But if we cannot get anon_vma, then we won't need it anyway,
* because that implies that the anon page is no longer mapped
* (and cannot be remapped so long as we hold the page lock).
*/
- if (PageAnon(page) && !PageKsm(page))
- anon_vma = page_get_anon_vma(page);
+ if (folio_test_anon(src) && !folio_test_ksm(src))
+ anon_vma = folio_get_anon_vma(src);
/*
* Block others from accessing the new page when we get around to
* establishing additional references. We are usually the only one
- * holding a reference to newpage at this point. We used to have a BUG
- * here if trylock_page(newpage) fails, but would like to allow for
- * cases where there might be a race with the previous use of newpage.
+ * holding a reference to dst at this point. We used to have a BUG
+ * here if folio_trylock(dst) fails, but would like to allow for
+ * cases where there might be a race with the previous use of dst.
* This is much like races on refcount of oldpage: just don't BUG().
*/
- if (unlikely(!trylock_page(newpage)))
+ if (unlikely(!folio_trylock(dst)))
goto out_unlock;
if (unlikely(!is_lru)) {
- rc = move_to_new_folio(dst, folio, mode);
+ rc = move_to_new_folio(dst, src, mode);
goto out_unlock_both;
}
@@ -1066,7 +1097,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* Corner case handling:
* 1. When a new swap-cache page is read into, it is added to the LRU
* and treated as swapcache but it has no rmap yet.
- * Calling try_to_unmap() against a page->mapping==NULL page will
+ * Calling try_to_unmap() against a src->mapping==NULL page will
* trigger a BUG. So handle it here.
* 2. An orphaned page (see truncate_cleanup_page) might have
* fs-private metadata. The page can be picked up due to memory
@@ -1074,133 +1105,134 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* invisible to the vm, so the page can not be migrated. So try to
* free the metadata, so the page can be freed.
*/
- if (!page->mapping) {
- VM_BUG_ON_PAGE(PageAnon(page), page);
- if (page_has_private(page)) {
- try_to_free_buffers(folio);
+ if (!src->mapping) {
+ if (folio_test_private(src)) {
+ try_to_free_buffers(src);
goto out_unlock_both;
}
- } else if (page_mapped(page)) {
+ } else if (folio_mapped(src)) {
/* Establish migration ptes */
- VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
- page);
- try_to_migrate(folio, 0);
+ VM_BUG_ON_FOLIO(folio_test_anon(src) &&
+ !folio_test_ksm(src) && !anon_vma, src);
+ try_to_migrate(src, 0);
page_was_mapped = true;
}
- if (!page_mapped(page))
- rc = move_to_new_folio(dst, folio, mode);
+ if (!folio_mapped(src))
+ rc = move_to_new_folio(dst, src, mode);
/*
- * When successful, push newpage to LRU immediately: so that if it
+ * When successful, push dst to LRU immediately: so that if it
* turns out to be an mlocked page, remove_migration_ptes() will
- * automatically build up the correct newpage->mlock_count for it.
+ * automatically build up the correct dst->mlock_count for it.
*
* We would like to do something similar for the old page, when
* unsuccessful, and other cases when a page has been temporarily
* isolated from the unevictable LRU: but this case is the easiest.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- lru_cache_add(newpage);
+ folio_add_lru(dst);
if (page_was_mapped)
lru_add_drain();
}
if (page_was_mapped)
- remove_migration_ptes(folio,
- rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);
+ remove_migration_ptes(src,
+ rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
out_unlock_both:
- unlock_page(newpage);
+ folio_unlock(dst);
out_unlock:
/* Drop an anon_vma reference if we took one */
if (anon_vma)
put_anon_vma(anon_vma);
- unlock_page(page);
+ folio_unlock(src);
out:
/*
- * If migration is successful, decrease refcount of the newpage,
+ * If migration is successful, decrease refcount of dst,
* which will not free the page because new page owner increased
* refcounter.
*/
if (rc == MIGRATEPAGE_SUCCESS)
- put_page(newpage);
+ folio_put(dst);
return rc;
}
/*
- * Obtain the lock on page, remove all ptes and migrate the page
- * to the newly allocated page in newpage.
+ * Obtain the lock on folio, remove all ptes and migrate the folio
+ * to the newly allocated folio in dst.
*/
static int unmap_and_move(new_page_t get_new_page,
free_page_t put_new_page,
- unsigned long private, struct page *page,
+ unsigned long private, struct folio *src,
int force, enum migrate_mode mode,
enum migrate_reason reason,
struct list_head *ret)
{
+ struct folio *dst;
int rc = MIGRATEPAGE_SUCCESS;
struct page *newpage = NULL;
- if (!thp_migration_supported() && PageTransHuge(page))
+ if (!thp_migration_supported() && folio_test_transhuge(src))
return -ENOSYS;
- if (page_count(page) == 1) {
- /* Page was freed from under us. So we are done. */
- ClearPageActive(page);
- ClearPageUnevictable(page);
+ if (folio_ref_count(src) == 1) {
+ /* Folio was freed from under us. So we are done. */
+ folio_clear_active(src);
+ folio_clear_unevictable(src);
/* free_pages_prepare() will clear PG_isolated. */
goto out;
}
- newpage = get_new_page(page, private);
+ newpage = get_new_page(&src->page, private);
if (!newpage)
return -ENOMEM;
+ dst = page_folio(newpage);
- newpage->private = 0;
- rc = __unmap_and_move(page, newpage, force, mode);
+ dst->private = NULL;
+ rc = __unmap_and_move(src, dst, force, mode);
if (rc == MIGRATEPAGE_SUCCESS)
- set_page_owner_migrate_reason(newpage, reason);
+ set_page_owner_migrate_reason(&dst->page, reason);
out:
if (rc != -EAGAIN) {
/*
- * A page that has been migrated has all references
- * removed and will be freed. A page that has not been
+ * A folio that has been migrated has all references
+ * removed and will be freed. A folio that has not been
* migrated will have kept its references and be restored.
*/
- list_del(&page->lru);
+ list_del(&src->lru);
}
/*
* If migration is successful, releases reference grabbed during
- * isolation. Otherwise, restore the page to right list unless
+ * isolation. Otherwise, restore the folio to right list unless
* we want to retry.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
/*
- * Compaction can migrate also non-LRU pages which are
+ * Compaction can migrate also non-LRU folios which are
* not accounted to NR_ISOLATED_*. They can be recognized
- * as __PageMovable
+ * as __folio_test_movable
*/
- if (likely(!__PageMovable(page)))
- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_lru(page), -thp_nr_pages(page));
+ if (likely(!__folio_test_movable(src)))
+ mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
+ folio_is_file_lru(src), -folio_nr_pages(src));
if (reason != MR_MEMORY_FAILURE)
/*
- * We release the page in page_handle_poison.
+ * We release the folio in page_handle_poison.
*/
- put_page(page);
+ folio_put(src);
} else {
if (rc != -EAGAIN)
- list_add_tail(&page->lru, ret);
+ list_add_tail(&src->lru, ret);
if (put_new_page)
- put_new_page(newpage, private);
+ put_new_page(&dst->page, private);
else
- put_page(newpage);
+ folio_put(dst);
}
return rc;
@@ -1244,12 +1276,10 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
* tables or check whether the hugepage is pmd-based or not before
* kicking migration.
*/
- if (!hugepage_migration_supported(page_hstate(hpage))) {
- list_move_tail(&hpage->lru, ret);
+ if (!hugepage_migration_supported(page_hstate(hpage)))
return -ENOSYS;
- }
- if (page_count(hpage) == 1) {
+ if (folio_ref_count(src) == 1) {
/* page was freed from under us. So we are done. */
putback_active_hugepage(hpage);
return MIGRATEPAGE_SUCCESS;
@@ -1260,7 +1290,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
return -ENOMEM;
dst = page_folio(new_hpage);
- if (!trylock_page(hpage)) {
+ if (!folio_trylock(src)) {
if (!force)
goto out;
switch (mode) {
@@ -1270,29 +1300,29 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
default:
goto out;
}
- lock_page(hpage);
+ folio_lock(src);
}
/*
* Check for pages which are in the process of being freed. Without
- * page_mapping() set, hugetlbfs specific move page routine will not
+ * folio_mapping() set, hugetlbfs specific move page routine will not
* be called and we could leak usage counts for subpools.
*/
- if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
+ if (hugetlb_folio_subpool(src) && !folio_mapping(src)) {
rc = -EBUSY;
goto out_unlock;
}
- if (PageAnon(hpage))
- anon_vma = page_get_anon_vma(hpage);
+ if (folio_test_anon(src))
+ anon_vma = folio_get_anon_vma(src);
- if (unlikely(!trylock_page(new_hpage)))
+ if (unlikely(!folio_trylock(dst)))
goto put_anon;
- if (page_mapped(hpage)) {
+ if (folio_mapped(src)) {
enum ttu_flags ttu = 0;
- if (!PageAnon(hpage)) {
+ if (!folio_test_anon(src)) {
/*
* In shared mappings, try_to_unmap could potentially
* call huge_pmd_unshare. Because of this, take
@@ -1313,7 +1343,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
i_mmap_unlock_write(mapping);
}
- if (!page_mapped(hpage))
+ if (!folio_mapped(src))
rc = move_to_new_folio(dst, src, mode);
if (page_was_mapped)
@@ -1321,24 +1351,24 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
unlock_put_anon:
- unlock_page(new_hpage);
+ folio_unlock(dst);
put_anon:
if (anon_vma)
put_anon_vma(anon_vma);
if (rc == MIGRATEPAGE_SUCCESS) {
- move_hugetlb_state(hpage, new_hpage, reason);
+ move_hugetlb_state(src, dst, reason);
put_new_page = NULL;
}
out_unlock:
- unlock_page(hpage);
+ folio_unlock(src);
out:
if (rc == MIGRATEPAGE_SUCCESS)
putback_active_hugepage(hpage);
else if (rc != -EAGAIN)
- list_move_tail(&hpage->lru, ret);
+ list_move_tail(&src->lru, ret);
/*
* If migration was not successful and there's a freeing callback, use
@@ -1353,210 +1383,248 @@ out:
return rc;
}
-static inline int try_split_thp(struct page *page, struct page **page2,
- struct list_head *from)
+static inline int try_split_folio(struct folio *folio, struct list_head *split_folios)
{
- int rc = 0;
+ int rc;
- lock_page(page);
- rc = split_huge_page_to_list(page, from);
- unlock_page(page);
+ folio_lock(folio);
+ rc = split_folio_to_list(folio, split_folios);
+ folio_unlock(folio);
if (!rc)
- list_safe_reset_next(page, *page2, lru);
+ list_move_tail(&folio->lru, split_folios);
return rc;
}
/*
- * migrate_pages - migrate the pages specified in a list, to the free pages
+ * migrate_pages - migrate the folios specified in a list, to the free folios
* supplied as the target for the page migration
*
- * @from: The list of pages to be migrated.
- * @get_new_page: The function used to allocate free pages to be used
- * as the target of the page migration.
- * @put_new_page: The function used to free target pages if migration
+ * @from: The list of folios to be migrated.
+ * @get_new_page: The function used to allocate free folios to be used
+ * as the target of the folio migration.
+ * @put_new_page: The function used to free target folios if migration
* fails, or NULL if no special handling is necessary.
* @private: Private data to be passed on to get_new_page()
* @mode: The migration mode that specifies the constraints for
- * page migration, if any.
- * @reason: The reason for page migration.
- * @ret_succeeded: Set to the number of normal pages migrated successfully if
+ * folio migration, if any.
+ * @reason: The reason for folio migration.
+ * @ret_succeeded: Set to the number of folios migrated successfully if
* the caller passes a non-NULL pointer.
*
- * The function returns after 10 attempts or if no pages are movable any more
- * because the list has become empty or no retryable pages exist any more.
- * It is caller's responsibility to call putback_movable_pages() to return pages
+ * The function returns after 10 attempts or if no folios are movable any more
+ * because the list has become empty or no retryable folios exist any more.
+ * It is the caller's responsibility to call putback_movable_pages() to return folios
* to the LRU or free list only if ret != 0.
*
- * Returns the number of {normal page, THP, hugetlb} that were not migrated, or
- * an error code. The number of THP splits will be considered as the number of
- * non-migrated THP, no matter how many subpages of the THP are migrated successfully.
+ * Returns the number of {normal folio, large folio, hugetlb} that were not
+ * migrated, or an error code. The number of large folio splits will be
+ * considered as the number of non-migrated large folio, no matter how many
+ * split folios of the large folio are migrated successfully.
*/
int migrate_pages(struct list_head *from, new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
{
int retry = 1;
+ int large_retry = 1;
int thp_retry = 1;
int nr_failed = 0;
int nr_failed_pages = 0;
+ int nr_retry_pages = 0;
int nr_succeeded = 0;
int nr_thp_succeeded = 0;
+ int nr_large_failed = 0;
int nr_thp_failed = 0;
int nr_thp_split = 0;
int pass = 0;
+ bool is_large = false;
bool is_thp = false;
- struct page *page;
- struct page *page2;
- int rc, nr_subpages;
- LIST_HEAD(ret_pages);
- LIST_HEAD(thp_split_pages);
+ struct folio *folio, *folio2;
+ int rc, nr_pages;
+ LIST_HEAD(ret_folios);
+ LIST_HEAD(split_folios);
bool nosplit = (reason == MR_NUMA_MISPLACED);
- bool no_subpage_counting = false;
+ bool no_split_folio_counting = false;
trace_mm_migrate_pages_start(mode, reason);
-thp_subpage_migration:
- for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
+split_folio_migration:
+ for (pass = 0; pass < 10 && (retry || large_retry); pass++) {
retry = 0;
+ large_retry = 0;
thp_retry = 0;
+ nr_retry_pages = 0;
- list_for_each_entry_safe(page, page2, from, lru) {
-retry:
+ list_for_each_entry_safe(folio, folio2, from, lru) {
/*
- * THP statistics is based on the source huge page.
- * Capture required information that might get lost
- * during migration.
+ * Large folio statistics is based on the source large
+ * folio. Capture required information that might get
+ * lost during migration.
*/
- is_thp = PageTransHuge(page) && !PageHuge(page);
- nr_subpages = compound_nr(page);
+ is_large = folio_test_large(folio) && !folio_test_hugetlb(folio);
+ is_thp = is_large && folio_test_pmd_mappable(folio);
+ nr_pages = folio_nr_pages(folio);
cond_resched();
- if (PageHuge(page))
+ if (folio_test_hugetlb(folio))
rc = unmap_and_move_huge_page(get_new_page,
- put_new_page, private, page,
- pass > 2, mode, reason,
- &ret_pages);
+ put_new_page, private,
+ &folio->page, pass > 2, mode,
+ reason,
+ &ret_folios);
else
rc = unmap_and_move(get_new_page, put_new_page,
- private, page, pass > 2, mode,
- reason, &ret_pages);
+ private, folio, pass > 2, mode,
+ reason, &ret_folios);
/*
* The rules are:
- * Success: non hugetlb page will be freed, hugetlb
- * page will be put back
+ * Success: non hugetlb folio will be freed, hugetlb
+ * folio will be put back
* -EAGAIN: stay on the from list
* -ENOMEM: stay on the from list
- * Other errno: put on ret_pages list then splice to
+ * -ENOSYS: stay on the from list
+ * Other errno: put on ret_folios list then splice to
* from list
*/
switch(rc) {
/*
- * THP migration might be unsupported or the
- * allocation could've failed so we should
- * retry on the same page with the THP split
- * to base pages.
+ * Large folio migration might be unsupported or
+ * the allocation could've failed so we should retry
+ * on the same folio with the large folio split
+ * to normal folios.
*
- * Head page is retried immediately and tail
- * pages are added to the tail of the list so
- * we encounter them after the rest of the list
- * is processed.
+ * Split folios are put in split_folios, and
+ * we will migrate them after the rest of the
+ * list is processed.
*/
case -ENOSYS:
- /* THP migration is unsupported */
- if (is_thp) {
- nr_thp_failed++;
- if (!try_split_thp(page, &page2, &thp_split_pages)) {
- nr_thp_split++;
- goto retry;
+ /* Large folio migration is unsupported */
+ if (is_large) {
+ nr_large_failed++;
+ nr_thp_failed += is_thp;
+ if (!try_split_folio(folio, &split_folios)) {
+ nr_thp_split += is_thp;
+ break;
}
/* Hugetlb migration is unsupported */
- } else if (!no_subpage_counting) {
+ } else if (!no_split_folio_counting) {
nr_failed++;
}
- nr_failed_pages += nr_subpages;
+ nr_failed_pages += nr_pages;
+ list_move_tail(&folio->lru, &ret_folios);
break;
case -ENOMEM:
/*
* When memory is low, don't bother to try to migrate
- * other pages, just exit.
- * THP NUMA faulting doesn't split THP to retry.
+ * other folios, just exit.
*/
- if (is_thp && !nosplit) {
- nr_thp_failed++;
- if (!try_split_thp(page, &page2, &thp_split_pages)) {
- nr_thp_split++;
- goto retry;
+ if (is_large) {
+ nr_large_failed++;
+ nr_thp_failed += is_thp;
+ /* Large folio NUMA faulting doesn't split to retry. */
+ if (!nosplit) {
+ int ret = try_split_folio(folio, &split_folios);
+
+ if (!ret) {
+ nr_thp_split += is_thp;
+ break;
+ } else if (reason == MR_LONGTERM_PIN &&
+ ret == -EAGAIN) {
+ /*
+ * Try again to split large folio to
+ * mitigate the failure of longterm pinning.
+ */
+ large_retry++;
+ thp_retry += is_thp;
+ nr_retry_pages += nr_pages;
+ break;
+ }
}
- } else if (!no_subpage_counting) {
+ } else if (!no_split_folio_counting) {
nr_failed++;
}
- nr_failed_pages += nr_subpages;
+ nr_failed_pages += nr_pages + nr_retry_pages;
/*
- * There might be some subpages of fail-to-migrate THPs
- * left in thp_split_pages list. Move them back to migration
+ * There might be some split folios of fail-to-migrate large
+ * folios left in split_folios list. Move them back to migration
* list so that they could be put back to the right list by
- * the caller otherwise the page refcnt will be leaked.
+ * the caller otherwise the folio refcnt will be leaked.
*/
- list_splice_init(&thp_split_pages, from);
+ list_splice_init(&split_folios, from);
+ /* nr_failed isn't updated for not used */
+ nr_large_failed += large_retry;
nr_thp_failed += thp_retry;
goto out;
case -EAGAIN:
- if (is_thp)
- thp_retry++;
- else
+ if (is_large) {
+ large_retry++;
+ thp_retry += is_thp;
+ } else if (!no_split_folio_counting) {
retry++;
+ }
+ nr_retry_pages += nr_pages;
break;
case MIGRATEPAGE_SUCCESS:
- nr_succeeded += nr_subpages;
- if (is_thp)
- nr_thp_succeeded++;
+ nr_succeeded += nr_pages;
+ nr_thp_succeeded += is_thp;
break;
default:
/*
* Permanent failure (-EBUSY, etc.):
- * unlike -EAGAIN case, the failed page is
- * removed from migration page list and not
+ * unlike -EAGAIN case, the failed folio is
+ * removed from migration folio list and not
* retried in the next outer loop.
*/
- if (is_thp)
- nr_thp_failed++;
- else if (!no_subpage_counting)
+ if (is_large) {
+ nr_large_failed++;
+ nr_thp_failed += is_thp;
+ } else if (!no_split_folio_counting) {
nr_failed++;
+ }
- nr_failed_pages += nr_subpages;
+ nr_failed_pages += nr_pages;
break;
}
}
}
nr_failed += retry;
+ nr_large_failed += large_retry;
nr_thp_failed += thp_retry;
+ nr_failed_pages += nr_retry_pages;
/*
- * Try to migrate subpages of fail-to-migrate THPs, no nr_failed
- * counting in this round, since all subpages of a THP is counted
- * as 1 failure in the first round.
+ * Try to migrate split folios of fail-to-migrate large folios, no
+ * nr_failed counting in this round, since all split folios of a
+ * large folio are counted as 1 failure in the first round.
*/
- if (!list_empty(&thp_split_pages)) {
+ if (!list_empty(&split_folios)) {
/*
- * Move non-migrated pages (after 10 retries) to ret_pages
+ * Move non-migrated folios (after 10 retries) to ret_folios
* to avoid migrating them again.
*/
- list_splice_init(from, &ret_pages);
- list_splice_init(&thp_split_pages, from);
- no_subpage_counting = true;
+ list_splice_init(from, &ret_folios);
+ list_splice_init(&split_folios, from);
+ no_split_folio_counting = true;
retry = 1;
- goto thp_subpage_migration;
+ goto split_folio_migration;
}
- rc = nr_failed + nr_thp_failed;
+ rc = nr_failed + nr_large_failed;
out:
/*
- * Put the permanent failure page back to migration list, they
+ * Put the permanently failed folios back to the migration list; they
* will be put back to the right list by the caller.
*/
- list_splice(&ret_pages, from);
+ list_splice(&ret_folios, from);
+
+ /*
+ * Return 0 in case all split folios of fail-to-migrate large folios
+ * are migrated successfully.
+ */
+ if (list_empty(from))
+ rc = 0;
count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
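As a worked example of the new accounting (assuming an architecture where THP migration is unsupported, so the first attempt returns -ENOSYS): a PMD-sized THP that splits successfully contributes nr_large_failed = 1, nr_thp_failed = 1 and nr_thp_split = 1, with no nr_failed_pages charged; its split folios are then retried with no_split_folio_counting set, and if they all migrate, the list ends up empty and migrate_pages() returns 0 even though nr_failed + nr_large_failed is 1.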
@@ -1589,7 +1657,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private)
nid = folio_nid(folio);
if (folio_test_hugetlb(folio)) {
- struct hstate *h = page_hstate(&folio->page);
+ struct hstate *h = folio_hstate(folio);
gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
@@ -1672,9 +1740,12 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
goto out;
err = -ENOENT;
- if (!page || is_zone_device_page(page))
+ if (!page)
goto out;
+ if (is_zone_device_page(page))
+ goto out_putpage;
+
err = 0;
if (page_to_nid(page) == node)
goto out_putpage;
@@ -1735,7 +1806,7 @@ static int move_pages_and_store_status(struct mm_struct *mm, int node,
* well.
*/
if (err > 0)
- err += nr_pages - i - 1;
+ err += nr_pages - i;
return err;
}
return store_status(status, start, node, i - start);
@@ -1821,8 +1892,12 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
err = move_pages_and_store_status(mm, current_node, &pagelist,
status, start, i, nr_pages);
- if (err)
+ if (err) {
+ /* We have accounted for page i */
+ if (err > 0)
+ err--;
goto out;
+ }
current_node = NUMA_NO_NODE;
}
out_flush:
@@ -1863,12 +1938,14 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
if (IS_ERR(page))
goto set_status;
- if (page && !is_zone_device_page(page)) {
+ err = -ENOENT;
+ if (!page)
+ goto set_status;
+
+ if (!is_zone_device_page(page))
err = page_to_nid(page);
- put_page(page);
- } else {
- err = -ENOENT;
- }
+
+ put_page(page);
set_status:
*status = err;
@@ -2170,456 +2247,4 @@ out:
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
-
-/*
- * node_demotion[] example:
- *
- * Consider a system with two sockets. Each socket has
- * three classes of memory attached: fast, medium and slow.
- * Each memory class is placed in its own NUMA node. The
- * CPUs are placed in the node with the "fast" memory. The
- * 6 NUMA nodes (0-5) might be split among the sockets like
- * this:
- *
- * Socket A: 0, 1, 2
- * Socket B: 3, 4, 5
- *
- * When Node 0 fills up, its memory should be migrated to
- * Node 1. When Node 1 fills up, it should be migrated to
- * Node 2. The migration path start on the nodes with the
- * processors (since allocations default to this node) and
- * fast memory, progress through medium and end with the
- * slow memory:
- *
- * 0 -> 1 -> 2 -> stop
- * 3 -> 4 -> 5 -> stop
- *
- * This is represented in the node_demotion[] like this:
- *
- * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1
- * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2
- * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate
- * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4
- * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5
- * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate
- *
- * Moreover some systems may have multiple slow memory nodes.
- * Suppose a system has one socket with 3 memory nodes, node 0
- * is fast memory type, and node 1/2 both are slow memory
- * type, and the distance between fast memory node and slow
- * memory node is same. So the migration path should be:
- *
- * 0 -> 1/2 -> stop
- *
- * This is represented in the node_demotion[] like this:
- * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
- * { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate
- * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate
- */
-
-/*
- * Writes to this array occur without locking. Cycles are
- * not allowed: Node X demotes to Y which demotes to X...
- *
- * If multiple reads are performed, a single rcu_read_lock()
- * must be held over all reads to ensure that no cycles are
- * observed.
- */
-#define DEFAULT_DEMOTION_TARGET_NODES 15
-
-#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
-#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1)
-#else
-#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES
-#endif
-
-struct demotion_nodes {
- unsigned short nr;
- short nodes[DEMOTION_TARGET_NODES];
-};
-
-static struct demotion_nodes *node_demotion __read_mostly;
-
-/**
- * next_demotion_node() - Get the next node in the demotion path
- * @node: The starting node to lookup the next node
- *
- * Return: node id for next memory node in the demotion path hierarchy
- * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
- * @node online or guarantee that it *continues* to be the next demotion
- * target.
- */
-int next_demotion_node(int node)
-{
- struct demotion_nodes *nd;
- unsigned short target_nr, index;
- int target;
-
- if (!node_demotion)
- return NUMA_NO_NODE;
-
- nd = &node_demotion[node];
-
- /*
- * node_demotion[] is updated without excluding this
- * function from running. RCU doesn't provide any
- * compiler barriers, so the READ_ONCE() is required
- * to avoid compiler reordering or read merging.
- *
- * Make sure to use RCU over entire code blocks if
- * node_demotion[] reads need to be consistent.
- */
- rcu_read_lock();
- target_nr = READ_ONCE(nd->nr);
-
- switch (target_nr) {
- case 0:
- target = NUMA_NO_NODE;
- goto out;
- case 1:
- index = 0;
- break;
- default:
- /*
- * If there are multiple target nodes, just select one
- * target node randomly.
- *
- * In addition, we can also use round-robin to select
- * target node, but we should introduce another variable
- * for node_demotion[] to record last selected target node,
- * that may cause cache ping-pong due to the changing of
- * last target node. Or introducing per-cpu data to avoid
- * caching issue, which seems more complicated. So selecting
- * target node randomly seems better until now.
- */
- index = get_random_int() % target_nr;
- break;
- }
-
- target = READ_ONCE(nd->nodes[index]);
-
-out:
- rcu_read_unlock();
- return target;
-}
-
-/* Disable reclaim-based migration. */
-static void __disable_all_migrate_targets(void)
-{
- int node, i;
-
- if (!node_demotion)
- return;
-
- for_each_online_node(node) {
- node_demotion[node].nr = 0;
- for (i = 0; i < DEMOTION_TARGET_NODES; i++)
- node_demotion[node].nodes[i] = NUMA_NO_NODE;
- }
-}
-
-static void disable_all_migrate_targets(void)
-{
- __disable_all_migrate_targets();
-
- /*
- * Ensure that the "disable" is visible across the system.
- * Readers will see either a combination of before+disable
- * state or disable+after. They will never see before and
- * after state together.
- *
- * The before+after state together might have cycles and
- * could cause readers to do things like loop until this
- * function finishes. This ensures they can only see a
- * single "bad" read and would, for instance, only loop
- * once.
- */
- synchronize_rcu();
-}
-
-/*
- * Find an automatic demotion target for 'node'.
- * Failing here is OK. It might just indicate
- * being at the end of a chain.
- */
-static int establish_migrate_target(int node, nodemask_t *used,
- int best_distance)
-{
- int migration_target, index, val;
- struct demotion_nodes *nd;
-
- if (!node_demotion)
- return NUMA_NO_NODE;
-
- nd = &node_demotion[node];
-
- migration_target = find_next_best_node(node, used);
- if (migration_target == NUMA_NO_NODE)
- return NUMA_NO_NODE;
-
- /*
- * If the node has been set a migration target node before,
- * which means it's the best distance between them. Still
- * check if this node can be demoted to other target nodes
- * if they have a same best distance.
- */
- if (best_distance != -1) {
- val = node_distance(node, migration_target);
- if (val > best_distance)
- goto out_clear;
- }
-
- index = nd->nr;
- if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
- "Exceeds maximum demotion target nodes\n"))
- goto out_clear;
-
- nd->nodes[index] = migration_target;
- nd->nr++;
-
- return migration_target;
-out_clear:
- node_clear(migration_target, *used);
- return NUMA_NO_NODE;
-}
-
-/*
- * When memory fills up on a node, memory contents can be
- * automatically migrated to another node instead of
- * discarded at reclaim.
- *
- * Establish a "migration path" which will start at nodes
- * with CPUs and will follow the priorities used to build the
- * page allocator zonelists.
- *
- * The difference here is that cycles must be avoided. If
- * node0 migrates to node1, then neither node1, nor anything
- * node1 migrates to can migrate to node0. Also one node can
- * be migrated to multiple nodes if the target nodes all have
- * a same best-distance against the source node.
- *
- * This function can run simultaneously with readers of
- * node_demotion[]. However, it can not run simultaneously
- * with itself. Exclusion is provided by memory hotplug events
- * being single-threaded.
- */
-static void __set_migration_target_nodes(void)
-{
- nodemask_t next_pass;
- nodemask_t this_pass;
- nodemask_t used_targets = NODE_MASK_NONE;
- int node, best_distance;
-
- /*
- * Avoid any oddities like cycles that could occur
- * from changes in the topology. This will leave
- * a momentary gap when migration is disabled.
- */
- disable_all_migrate_targets();
-
- /*
- * Allocations go close to CPUs, first. Assume that
- * the migration path starts at the nodes with CPUs.
- */
- next_pass = node_states[N_CPU];
-again:
- this_pass = next_pass;
- next_pass = NODE_MASK_NONE;
- /*
- * To avoid cycles in the migration "graph", ensure
- * that migration sources are not future targets by
- * setting them in 'used_targets'. Do this only
- * once per pass so that multiple source nodes can
- * share a target node.
- *
- * 'used_targets' will become unavailable in future
- * passes. This limits some opportunities for
- * multiple source nodes to share a destination.
- */
- nodes_or(used_targets, used_targets, this_pass);
-
- for_each_node_mask(node, this_pass) {
- best_distance = -1;
-
- /*
- * Try to set up the migration path for the node, and the target
- * migration nodes can be multiple, so doing a loop to find all
- * the target nodes if they all have a best node distance.
- */
- do {
- int target_node =
- establish_migrate_target(node, &used_targets,
- best_distance);
-
- if (target_node == NUMA_NO_NODE)
- break;
-
- if (best_distance == -1)
- best_distance = node_distance(node, target_node);
-
- /*
- * Visit targets from this pass in the next pass.
- * Eventually, every node will have been part of
- * a pass, and will become set in 'used_targets'.
- */
- node_set(target_node, next_pass);
- } while (1);
- }
- /*
- * 'next_pass' contains nodes which became migration
- * targets in this pass. Make additional passes until
- * no more migrations targets are available.
- */
- if (!nodes_empty(next_pass))
- goto again;
-}
-
-/*
- * For callers that do not hold get_online_mems() already.
- */
-void set_migration_target_nodes(void)
-{
- get_online_mems();
- __set_migration_target_nodes();
- put_online_mems();
-}
-
-/*
- * This leaves migrate-on-reclaim transiently disabled between
- * the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs
- * whether reclaim-based migration is enabled or not, which
- * ensures that the user can turn reclaim-based migration at
- * any time without needing to recalculate migration targets.
- *
- * These callbacks already hold get_online_mems(). That is why
- * __set_migration_target_nodes() can be used as opposed to
- * set_migration_target_nodes().
- */
-#ifdef CONFIG_MEMORY_HOTPLUG
-static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
- unsigned long action, void *_arg)
-{
- struct memory_notify *arg = _arg;
-
- /*
- * Only update the node migration order when a node is
- * changing status, like online->offline. This avoids
- * the overhead of synchronize_rcu() in most cases.
- */
- if (arg->status_change_nid < 0)
- return notifier_from_errno(0);
-
- switch (action) {
- case MEM_GOING_OFFLINE:
- /*
- * Make sure there are not transient states where
- * an offline node is a migration target. This
- * will leave migration disabled until the offline
- * completes and the MEM_OFFLINE case below runs.
- */
- disable_all_migrate_targets();
- break;
- case MEM_OFFLINE:
- case MEM_ONLINE:
- /*
- * Recalculate the target nodes once the node
- * reaches its final state (online or offline).
- */
- __set_migration_target_nodes();
- break;
- case MEM_CANCEL_OFFLINE:
- /*
- * MEM_GOING_OFFLINE disabled all the migration
- * targets. Reenable them.
- */
- __set_migration_target_nodes();
- break;
- case MEM_GOING_ONLINE:
- case MEM_CANCEL_ONLINE:
- break;
- }
-
- return notifier_from_errno(0);
-}
-#endif
-
-void __init migrate_on_reclaim_init(void)
-{
- node_demotion = kcalloc(nr_node_ids,
- sizeof(struct demotion_nodes),
- GFP_KERNEL);
- WARN_ON(!node_demotion);
-#ifdef CONFIG_MEMORY_HOTPLUG
- hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
-#endif
- /*
- * At this point, all NUMA nodes with memory/CPUs have their state
- * properly set, so we can build the demotion order now.
- * Let us hold the cpu_hotplug lock, as we could possibly have
- * CPU hotplug events during boot.
- */
- cpus_read_lock();
- set_migration_target_nodes();
- cpus_read_unlock();
-}
-
-bool numa_demotion_enabled = false;
-
-#ifdef CONFIG_SYSFS
-static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return sysfs_emit(buf, "%s\n",
- numa_demotion_enabled ? "true" : "false");
-}
-
-static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- ssize_t ret;
-
- ret = kstrtobool(buf, &numa_demotion_enabled);
- if (ret)
- return ret;
-
- return count;
-}
-
-static struct kobj_attribute numa_demotion_enabled_attr =
- __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
- numa_demotion_enabled_store);
-
-static struct attribute *numa_attrs[] = {
- &numa_demotion_enabled_attr.attr,
- NULL,
-};
-
-static const struct attribute_group numa_attr_group = {
- .attrs = numa_attrs,
-};
-
-static int __init numa_init_sysfs(void)
-{
- int err;
- struct kobject *numa_kobj;
-
- numa_kobj = kobject_create_and_add("numa", mm_kobj);
- if (!numa_kobj) {
- pr_err("failed to create numa kobject\n");
- return -ENOMEM;
- }
- err = sysfs_create_group(numa_kobj, &numa_attr_group);
- if (err) {
- pr_err("failed to register numa group\n");
- goto delete_obj;
- }
- return 0;
-
-delete_obj:
- kobject_put(numa_kobj);
- return err;
-}
-subsys_initcall(numa_init_sysfs);
-#endif /* CONFIG_SYSFS */
#endif /* CONFIG_NUMA */
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 27fb37d65476..721b2365dbca 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/memremap.h>
#include <linux/migrate.h>
+#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/oom.h>
@@ -185,18 +186,25 @@ again:
get_page(page);
/*
- * Optimize for the common case where page is only mapped once
- * in one process. If we can lock the page, then we can safely
- * set up a special migration page table entry now.
+ * We rely on trylock_page() to avoid deadlock between
+ * concurrent migrations where each is waiting on the other's
+ * page lock. If we can't immediately lock the page, we fail this
+ * migration, as it is only best effort anyway.
+ *
+ * If we can lock the page it's safe to set up a migration entry
+ * now. In the common case where the page is mapped once in a
+ * single process setting up the migration entry now is an
+ * optimisation to avoid walking the rmap later with
+ * try_to_migrate().
*/
if (trylock_page(page)) {
bool anon_exclusive;
pte_t swp_pte;
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush(vma, addr, ptep);
+ pte = ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
set_pte_at(mm, addr, ptep, pte);
@@ -206,11 +214,15 @@ again:
goto next;
}
} else {
- ptep_get_and_clear(mm, addr, ptep);
+ pte = ptep_get_and_clear(mm, addr, ptep);
}
migrate->cpages++;
+ /* Set the dirty flag on the folio now the pte is gone. */
+ if (pte_dirty(pte))
+ folio_mark_dirty(page_folio(page));
+
/* Setup special migration page table entry */
if (mpfn & MIGRATE_PFN_WRITE)
entry = make_writable_migration_entry(
@@ -221,6 +233,12 @@ again:
else
entry = make_readable_migration_entry(
page_to_pfn(page));
+ if (pte_present(pte)) {
+ if (pte_young(pte))
+ entry = make_migration_entry_young(entry);
+ if (pte_dirty(pte))
+ entry = make_migration_entry_dirty(entry);
+ }
swp_pte = swp_entry_to_pte(entry);
if (pte_present(pte)) {
if (pte_soft_dirty(pte))
@@ -254,13 +272,14 @@ next:
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = mpfn;
}
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(ptep - 1, ptl);
/* Only flush the TLB if we actually modified any entries */
if (unmapped)
flush_tlb_range(walk->vma, start, end);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
return 0;
}
@@ -306,14 +325,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
* folio_migrate_mapping(), except that here we allow migration of a
* ZONE_DEVICE page.
*/
-static bool migrate_vma_check_page(struct page *page)
+static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
{
/*
* One extra ref because caller holds an extra reference, either from
* isolate_lru_page() for a regular page, or migrate_vma_collect() for
* a device page.
*/
- int extra = 1;
+ int extra = 1 + (page == fault_page);
/*
* FIXME support THP (transparent huge page), it is bit more complex to
@@ -338,30 +357,28 @@ static bool migrate_vma_check_page(struct page *page)
}
/*
- * migrate_vma_unmap() - replace page mapping with special migration pte entry
- * @migrate: migrate struct containing all migration information
- *
- * Isolate pages from the LRU and replace mappings (CPU page table pte) with a
- * special migration pte entry and check if it has been pinned. Pinned pages are
- * restored because we cannot migrate them.
- *
- * This is the last step before we call the device driver callback to allocate
- * destination memory and copy contents of original page over to new page.
+ * Unmaps pages for migration. Returns number of source pfns marked as
+ * migrating.
*/
-static void migrate_vma_unmap(struct migrate_vma *migrate)
+static unsigned long migrate_device_unmap(unsigned long *src_pfns,
+ unsigned long npages,
+ struct page *fault_page)
{
- const unsigned long npages = migrate->npages;
unsigned long i, restore = 0;
bool allow_drain = true;
+ unsigned long unmapped = 0;
lru_add_drain();
for (i = 0; i < npages; i++) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct folio *folio;
- if (!page)
+ if (!page) {
+ if (src_pfns[i] & MIGRATE_PFN_MIGRATE)
+ unmapped++;
continue;
+ }
/* ZONE_DEVICE pages are not on LRU */
if (!is_zone_device_page(page)) {
@@ -372,8 +389,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
}
if (isolate_lru_page(page)) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
continue;
}
@@ -386,34 +402,55 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
if (folio_mapped(folio))
try_to_migrate(folio, 0);
- if (page_mapped(page) || !migrate_vma_check_page(page)) {
+ if (page_mapped(page) ||
+ !migrate_vma_check_page(page, fault_page)) {
if (!is_zone_device_page(page)) {
get_page(page);
putback_lru_page(page);
}
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
continue;
}
+
+ unmapped++;
}
for (i = 0; i < npages && restore; i++) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct folio *folio;
- if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ if (!page || (src_pfns[i] & MIGRATE_PFN_MIGRATE))
continue;
folio = page_folio(page);
remove_migration_ptes(folio, folio, false);
- migrate->src[i] = 0;
+ src_pfns[i] = 0;
folio_unlock(folio);
folio_put(folio);
restore--;
}
+
+ return unmapped;
+}
+
+/*
+ * migrate_vma_unmap() - replace page mapping with special migration pte entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * Isolate pages from the LRU and replace mappings (CPU page table pte) with a
+ * special migration pte entry and check if it has been pinned. Pinned pages are
+ * restored because we cannot migrate them.
+ *
+ * This is the last step before we call the device driver callback to allocate
+ * destination memory and copy contents of original page over to new page.
+ */
+static void migrate_vma_unmap(struct migrate_vma *migrate)
+{
+ migrate->cpages = migrate_device_unmap(migrate->src, migrate->npages,
+ migrate->fault_page);
}
/**
@@ -498,6 +535,8 @@ int migrate_vma_setup(struct migrate_vma *args)
return -EINVAL;
if (!args->src || !args->dst)
return -EINVAL;
+ if (args->fault_page && !is_device_private_page(args->fault_page))
+ return -EINVAL;
memset(args->src, 0, sizeof(*args->src) * nr_pages);
args->cpages = 0;
@@ -658,42 +697,38 @@ abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
-/**
- * migrate_vma_pages() - migrate meta-data from src page to dst page
- * @migrate: migrate struct containing all migration information
- *
- * This migrates struct page meta-data from source struct page to destination
- * struct page. This effectively finishes the migration from source page to the
- * destination page.
- */
-void migrate_vma_pages(struct migrate_vma *migrate)
+static void __migrate_device_pages(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages,
+ struct migrate_vma *migrate)
{
- const unsigned long npages = migrate->npages;
- const unsigned long start = migrate->start;
struct mmu_notifier_range range;
- unsigned long addr, i;
+ unsigned long i;
bool notified = false;
- for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
- struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ for (i = 0; i < npages; i++) {
+ struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct address_space *mapping;
int r;
if (!newpage) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
if (!page) {
+ unsigned long addr;
+
+ if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
/*
* The only time there is no vma is when called from
* migrate_device_coherent_page(). However this isn't
* called if the page could not be unmapped.
*/
- VM_BUG_ON(!migrate->vma);
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
- continue;
+ VM_BUG_ON(!migrate);
+ addr = migrate->start + i*PAGE_SIZE;
if (!notified) {
notified = true;
@@ -704,7 +739,7 @@ void migrate_vma_pages(struct migrate_vma *migrate)
mmu_notifier_invalidate_range_start(&range);
}
migrate_vma_insert_page(migrate, addr, newpage,
- &migrate->src[i]);
+ &src_pfns[i]);
continue;
}
@@ -717,21 +752,26 @@ void migrate_vma_pages(struct migrate_vma *migrate)
* device private or coherent memory.
*/
if (mapping) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
} else if (is_zone_device_page(newpage)) {
/*
* Other types of ZONE_DEVICE page are not supported.
*/
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
- r = migrate_folio(mapping, page_folio(newpage),
- page_folio(page), MIGRATE_SYNC_NO_COPY);
+ if (migrate && migrate->fault_page == page)
+ r = migrate_folio_extra(mapping, page_folio(newpage),
+ page_folio(page),
+ MIGRATE_SYNC_NO_COPY, 1);
+ else
+ r = migrate_folio(mapping, page_folio(newpage),
+ page_folio(page), MIGRATE_SYNC_NO_COPY);
if (r != MIGRATEPAGE_SUCCESS)
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
}
/*
@@ -742,28 +782,56 @@ void migrate_vma_pages(struct migrate_vma *migrate)
if (notified)
mmu_notifier_invalidate_range_only_end(&range);
}
-EXPORT_SYMBOL(migrate_vma_pages);
/**
- * migrate_vma_finalize() - restore CPU page table entry
+ * migrate_device_pages() - migrate meta-data from src page to dst page
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
+ *
+ * Equivalent to migrate_vma_pages(). This is called to migrate struct page
+ * meta-data from source struct page to destination.
+ */
+void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
+ unsigned long npages)
+{
+ __migrate_device_pages(src_pfns, dst_pfns, npages, NULL);
+}
+EXPORT_SYMBOL(migrate_device_pages);
+
+/**
+ * migrate_vma_pages() - migrate meta-data from src page to dst page
* @migrate: migrate struct containing all migration information
*
- * This replaces the special migration pte entry with either a mapping to the
- * new page if migration was successful for that page, or to the original page
- * otherwise.
+ * This migrates struct page meta-data from source struct page to destination
+ * struct page. This effectively finishes the migration from source page to the
+ * destination page.
+ */
+void migrate_vma_pages(struct migrate_vma *migrate)
+{
+ __migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate);
+}
+EXPORT_SYMBOL(migrate_vma_pages);
+
+/*
+ * migrate_device_finalize() - complete page migration
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
*
- * This also unlocks the pages and puts them back on the lru, or drops the extra
- * refcount, for device pages.
+ * Completes migration of the page by removing special migration entries.
+ * Drivers must ensure copying of page data is complete and visible to the CPU
+ * before calling this.
*/
-void migrate_vma_finalize(struct migrate_vma *migrate)
+void migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages)
{
- const unsigned long npages = migrate->npages;
unsigned long i;
for (i = 0; i < npages; i++) {
struct folio *dst, *src;
- struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
if (!page) {
if (newpage) {
@@ -773,7 +841,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
continue;
}
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
+ if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
if (newpage) {
unlock_page(newpage);
put_page(newpage);
@@ -800,8 +868,72 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
}
}
}
+EXPORT_SYMBOL(migrate_device_finalize);
+
+/**
+ * migrate_vma_finalize() - restore CPU page table entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * This replaces the special migration pte entry with either a mapping to the
+ * new page if migration was successful for that page, or to the original page
+ * otherwise.
+ *
+ * This also unlocks the pages and puts them back on the lru, or drops the extra
+ * refcount, for device pages.
+ */
+void migrate_vma_finalize(struct migrate_vma *migrate)
+{
+ migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
+}
EXPORT_SYMBOL(migrate_vma_finalize);
+/**
+ * migrate_device_range() - migrate device private pfns to normal memory.
+ * @src_pfns: array large enough to hold migrating source device private pfns.
+ * @start: starting pfn in the range to migrate.
+ * @npages: number of pages to migrate.
+ *
+ * migrate_device_range() is similar in concept to migrate_vma_setup(), except
+ * that instead of looking up pages based on virtual address mappings, it takes
+ * a range of device pfns that should be migrated to system memory.
+ *
+ * This is useful when a driver needs to free device memory but doesn't know the
+ * virtual mappings of every page that may be in device memory. For example this
+ * is often the case when a driver is being unloaded or unbound from a device.
+ *
+ * Like migrate_vma_setup() this function will take a reference and lock any
+ * migrating pages that aren't free before unmapping them. Drivers may then
+ * allocate destination pages and start copying data from the device to CPU
+ * memory before calling migrate_device_pages().
+ */
+int migrate_device_range(unsigned long *src_pfns, unsigned long start,
+ unsigned long npages)
+{
+ unsigned long i, pfn;
+
+ for (pfn = start, i = 0; i < npages; pfn++, i++) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (!get_page_unless_zero(page)) {
+ src_pfns[i] = 0;
+ continue;
+ }
+
+ if (!trylock_page(page)) {
+ src_pfns[i] = 0;
+ put_page(page);
+ continue;
+ }
+
+ src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ }
+
+ migrate_device_unmap(src_pfns, npages, NULL);
+
+ return 0;
+}
+EXPORT_SYMBOL(migrate_device_range);
+
/*
* Migrate a device coherent page back to normal memory. The caller should have
* a reference on page which will be copied to the new page if migration is
@@ -810,25 +942,19 @@ EXPORT_SYMBOL(migrate_vma_finalize);
int migrate_device_coherent_page(struct page *page)
{
unsigned long src_pfn, dst_pfn = 0;
- struct migrate_vma args;
struct page *dpage;
WARN_ON_ONCE(PageCompound(page));
lock_page(page);
src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
- args.src = &src_pfn;
- args.dst = &dst_pfn;
- args.cpages = 1;
- args.npages = 1;
- args.vma = NULL;
/*
* We don't have a VMA and don't need to walk the page tables to find
* the source page. So call migrate_vma_unmap() directly to unmap the
* page as migrate_vma_setup() will fail if args.vma == NULL.
*/
- migrate_vma_unmap(&args);
+ migrate_device_unmap(&src_pfn, 1, NULL);
if (!(src_pfn & MIGRATE_PFN_MIGRATE))
return -EBUSY;
@@ -838,10 +964,10 @@ int migrate_device_coherent_page(struct page *page)
dst_pfn = migrate_pfn(page_to_pfn(dpage));
}
- migrate_vma_pages(&args);
+ migrate_device_pages(&src_pfn, &dst_pfn, 1);
if (src_pfn & MIGRATE_PFN_MIGRATE)
copy_highpage(dpage, page);
- migrate_vma_finalize(&args);
+ migrate_device_finalize(&src_pfn, &dst_pfn, 1);
if (src_pfn & MIGRATE_PFN_MIGRATE)
return 0;
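
The three exported helpers above give drivers a VMA-less eviction path: migrate_device_range() takes a reference on, locks and unmaps a span of device-private pfns, migrate_device_pages() moves the struct page metadata, and migrate_device_finalize() removes the migration entries and releases the pages. The sketch below is illustrative only (the function name, GFP flags and the elided device-side copy are assumptions, not taken from an in-tree driver) and mirrors the shape of migrate_device_coherent_page() above.

/*
 * Illustrative sketch only: evict a range of device-private pfns back to
 * system memory. Error handling is minimal and the device-side copy is
 * only hinted at in a comment.
 */
static int evict_device_range(unsigned long start_pfn, unsigned long npages)
{
	unsigned long *src = kcalloc(npages, sizeof(*src), GFP_KERNEL);
	unsigned long *dst = kcalloc(npages, sizeof(*dst), GFP_KERNEL);
	unsigned long i;
	int ret = -ENOMEM;

	if (!src || !dst)
		goto out;

	/* Take a reference on, lock and unmap the device pages still in use. */
	migrate_device_range(src, start_pfn, npages);

	/* Allocate system memory for every page that can be migrated. */
	for (i = 0; i < npages; i++) {
		struct page *dpage;

		if (!(src[i] & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = alloc_page(GFP_HIGHUSER);
		if (!dpage) {
			src[i] &= ~MIGRATE_PFN_MIGRATE;
			continue;
		}
		lock_page(dpage);
		dst[i] = migrate_pfn(page_to_pfn(dpage));
	}

	/* Move struct page metadata over to the destination pages. */
	migrate_device_pages(src, dst, npages);

	/*
	 * Copy the data for each still-migrating page from device memory to
	 * its destination page here (driver specific), and make sure the
	 * copy is complete and visible to the CPU.
	 */

	/* Remove the migration entries and release source and destination. */
	migrate_device_finalize(src, dst, npages);
	ret = 0;
out:
	kfree(src);
	kfree(dst);
	return ret;
}
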
diff --git a/mm/mincore.c b/mm/mincore.c
index fa200c14185f..a085a2aeabd8 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -52,7 +52,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
{
unsigned char present = 0;
- struct page *page;
+ struct folio *folio;
/*
* When tmpfs swaps out a page from a file, any process mapping that
@@ -60,10 +60,10 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
- page = find_get_incore_page(mapping, index);
- if (page) {
- present = PageUptodate(page);
- put_page(page);
+ folio = filemap_get_incore_folio(mapping, index);
+ if (folio) {
+ present = folio_test_uptodate(folio);
+ folio_put(folio);
}
return present;
@@ -190,8 +190,8 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
unsigned long end;
int err;
- vma = find_vma(current->mm, addr);
- if (!vma || addr < vma->vm_start)
+ vma = vma_lookup(current->mm, addr);
+ if (!vma)
return -ENOMEM;
end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
if (!can_do_mincore(vma)) {
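
vma_lookup() only returns a VMA that actually contains the address, which is why the explicit vm_start check can be dropped in do_mincore(). A minimal sketch of the equivalence (hypothetical helper name, shown only to spell out the semantics):

/*
 * Hypothetical helper spelling out what vma_lookup() guarantees compared
 * with a bare find_vma().
 */
static struct vm_area_struct *lookup_containing_vma(struct mm_struct *mm,
						    unsigned long addr)
{
	struct vm_area_struct *vma = find_vma(mm, addr);

	/* find_vma() may return a VMA that starts above addr. */
	if (vma && addr < vma->vm_start)
		return NULL;	/* addr falls into a gap below vma */

	return vma;		/* this is what vma_lookup(mm, addr) returns */
}
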
diff --git a/mm/mlock.c b/mm/mlock.c
index b14e929084cc..7032f6dd0ce1 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -471,6 +471,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
unsigned long nstart, end, tmp;
struct vm_area_struct *vma, *prev;
int error;
+ MA_STATE(mas, &current->mm->mm_mt, start, start);
VM_BUG_ON(offset_in_page(start));
VM_BUG_ON(len != PAGE_ALIGN(len));
@@ -479,13 +480,14 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
return -EINVAL;
if (end == start)
return 0;
- vma = find_vma(current->mm, start);
- if (!vma || vma->vm_start > start)
+ vma = mas_walk(&mas);
+ if (!vma)
return -ENOMEM;
- prev = vma->vm_prev;
if (start > vma->vm_start)
prev = vma;
+ else
+ prev = mas_prev(&mas, 0);
for (nstart = start ; ; ) {
vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
@@ -505,7 +507,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
if (nstart >= end)
break;
- vma = prev->vm_next;
+ vma = find_vma(prev->vm_mm, prev->vm_end);
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
break;
@@ -526,24 +528,21 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
{
struct vm_area_struct *vma;
unsigned long count = 0;
+ unsigned long end;
+ VMA_ITERATOR(vmi, mm, start);
- if (mm == NULL)
- mm = current->mm;
+ /* Don't overflow past ULONG_MAX */
+ if (unlikely(ULONG_MAX - len < start))
+ end = ULONG_MAX;
+ else
+ end = start + len;
- vma = find_vma(mm, start);
- if (vma == NULL)
- return 0;
-
- for (; vma ; vma = vma->vm_next) {
- if (start >= vma->vm_end)
- continue;
- if (start + len <= vma->vm_start)
- break;
+ for_each_vma_range(vmi, vma, end) {
if (vma->vm_flags & VM_LOCKED) {
if (start > vma->vm_start)
count -= (start - vma->vm_start);
- if (start + len < vma->vm_end) {
- count += start + len - vma->vm_start;
+ if (end < vma->vm_end) {
+ count += end - vma->vm_start;
break;
}
count += vma->vm_end - vma->vm_start;
@@ -659,6 +658,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
*/
static int apply_mlockall_flags(int flags)
{
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
struct vm_area_struct *vma, *prev = NULL;
vm_flags_t to_add = 0;
@@ -679,7 +679,7 @@ static int apply_mlockall_flags(int flags)
to_add |= VM_LOCKONFAULT;
}
- for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
vm_flags_t newflags;
newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
@@ -687,6 +687,7 @@ static int apply_mlockall_flags(int flags)
/* Ignore errors */
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+ mas_pause(&mas);
cond_resched();
}
out:
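
The mlock hunks above replace vm_next walks with maple tree iteration: for_each_vma_range() for a bounded walk, and mas_for_each() with mas_pause() before cond_resched() when the walk can sleep, as mlock_fixup() may. A minimal sketch of the bounded pattern (hypothetical helper, assuming the caller holds mmap_lock):

/*
 * Illustrative only: count the locked bytes intersecting [start, end) using
 * the VMA iterator.
 */
static unsigned long count_locked_bytes(struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	VMA_ITERATOR(vmi, mm, start);
	struct vm_area_struct *vma;
	unsigned long bytes = 0;

	mmap_assert_locked(mm);
	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_flags & VM_LOCKED)
			bytes += min(end, vma->vm_end) -
				 max(start, vma->vm_start);
	}

	return bytes;
}
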
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 9ddaf0e1b0ab..c1883362e71d 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void)
shift = 8 * sizeof(unsigned long);
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
SECTIONS_WIDTH,
NODES_WIDTH,
ZONES_WIDTH,
LAST_CPUPID_WIDTH,
KASAN_TAG_WIDTH,
+ LRU_GEN_WIDTH,
+ LRU_REFS_WIDTH,
NR_PAGEFLAGS);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
@@ -176,16 +178,10 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block compute_batch_nb __meminitdata = {
- .notifier_call = mm_compute_batch_notifier,
- .priority = IPC_CALLBACK_PRI, /* use lowest priority */
-};
-
static int __init mm_compute_batch_init(void)
{
mm_compute_batch(sysctl_overcommit_memory);
- register_hotmemory_notifier(&compute_batch_nb);
-
+ hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI);
return 0;
}
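
hotplug_memory_notifier() now takes the callback and a priority directly and builds the notifier_block itself, so the static compute_batch_nb can go away. A minimal sketch of the pattern (hypothetical callback; the literal priority 0 is a placeholder, in-tree callers pass a named priority from <linux/memory.h>):

/* Illustrative sketch of the one-step registration pattern used above. */
static int __meminit demo_mem_callback(struct notifier_block *self,
				       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;

	/* Ignore events that do not change the set of nodes with memory. */
	if (mn->status_change_nid < 0)
		return NOTIFY_OK;

	switch (action) {
	case MEM_ONLINE:
	case MEM_OFFLINE:
		/* Recompute whatever depends on the populated nodes. */
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}

static int __init demo_mem_init(void)
{
	hotplug_memory_notifier(demo_mem_callback, 0);
	return 0;
}
subsys_initcall(demo_mem_init);
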
diff --git a/mm/mm_slot.h b/mm/mm_slot.h
new file mode 100644
index 000000000000..83f18ed1c4bd
--- /dev/null
+++ b/mm/mm_slot.h
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef _LINUX_MM_SLOT_H
+#define _LINUX_MM_SLOT_H
+
+#include <linux/hashtable.h>
+#include <linux/slab.h>
+
+/*
+ * struct mm_slot - hash lookup from mm to mm_slot
+ * @hash: link to the mm_slots hash list
+ * @mm_node: link into the mm_slots list
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+ struct hlist_node hash;
+ struct list_head mm_node;
+ struct mm_struct *mm;
+};
+
+#define mm_slot_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+static inline void *mm_slot_alloc(struct kmem_cache *cache)
+{
+ if (!cache) /* initialization failed */
+ return NULL;
+ return kmem_cache_zalloc(cache, GFP_KERNEL);
+}
+
+static inline void mm_slot_free(struct kmem_cache *cache, void *objp)
+{
+ kmem_cache_free(cache, objp);
+}
+
+#define mm_slot_lookup(_hashtable, _mm) \
+({ \
+ struct mm_slot *tmp_slot, *mm_slot = NULL; \
+ \
+ hash_for_each_possible(_hashtable, tmp_slot, hash, (unsigned long)_mm) \
+ if (_mm == tmp_slot->mm) { \
+ mm_slot = tmp_slot; \
+ break; \
+ } \
+ \
+ mm_slot; \
+})
+
+#define mm_slot_insert(_hashtable, _mm, _mm_slot) \
+({ \
+ _mm_slot->mm = _mm; \
+ hash_add(_hashtable, &_mm_slot->hash, (unsigned long)_mm); \
+})
+
+#endif /* _LINUX_MM_SLOT_H */
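
The new header centralises the mm-to-slot bookkeeping that ksm and khugepaged previously open-coded. A user embeds struct mm_slot in its own per-mm structure and drives the helpers roughly as follows (hypothetical "foo" scanner; all names and the extra field are illustrative only):

struct foo_mm_slot {
	struct mm_slot slot;	/* must be reachable via mm_slot_entry() */
	int scan_progress;	/* per-mm private state */
};

static struct kmem_cache *foo_slot_cache;
static DEFINE_HASHTABLE(foo_slots_hash, 10);
static LIST_HEAD(foo_slots_list);

static int __init foo_init_cache(void)
{
	foo_slot_cache = KMEM_CACHE(foo_mm_slot, 0);
	return foo_slot_cache ? 0 : -ENOMEM;
}

static int foo_add_mm(struct mm_struct *mm)
{
	struct foo_mm_slot *fslot = mm_slot_alloc(foo_slot_cache);

	if (!fslot)
		return -ENOMEM;

	/* Hash mm -> slot and thread the slot onto the scan list. */
	mm_slot_insert(foo_slots_hash, mm, &fslot->slot);
	list_add_tail(&fslot->slot.mm_node, &foo_slots_list);
	return 0;
}

static struct foo_mm_slot *foo_find_mm(struct mm_struct *mm)
{
	struct mm_slot *slot = mm_slot_lookup(foo_slots_hash, mm);

	return slot ? mm_slot_entry(slot, struct foo_mm_slot, slot) : NULL;
}
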
diff --git a/mm/mmap.c b/mm/mmap.c
index 9d780f415be3..87d929316d57 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -14,7 +14,6 @@
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
-#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
@@ -39,7 +38,6 @@
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
-#include <linux/rbtree_augmented.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
@@ -77,9 +75,10 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
-static void unmap_region(struct mm_struct *mm,
+static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
struct vm_area_struct *vma, struct vm_area_struct *prev,
- unsigned long start, unsigned long end);
+ struct vm_area_struct *next, unsigned long start,
+ unsigned long end);
static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
@@ -132,12 +131,10 @@ void unlink_file_vma(struct vm_area_struct *vma)
}
/*
- * Close a vm structure and free it, returning the next.
+ * Close a vm structure and free it.
*/
-static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+static void remove_vma(struct vm_area_struct *vma)
{
- struct vm_area_struct *next = vma->vm_next;
-
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
@@ -145,20 +142,41 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
vm_area_free(vma);
- return next;
}
-static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
- struct list_head *uf);
+/*
+ * check_brk_limits() - Check the range against platform-specific limits and
+ * verify the mlock limits.
+ * @addr: The address to check
+ * @len: The size of the increase.
+ *
+ * Return: 0 on success.
+ */
+static int check_brk_limits(unsigned long addr, unsigned long len)
+{
+ unsigned long mapped_addr;
+
+ mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+ if (IS_ERR_VALUE(mapped_addr))
+ return mapped_addr;
+
+ return mlock_future_check(current->mm, current->mm->def_flags, len);
+}
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long newbrk, unsigned long oldbrk,
+ struct list_head *uf);
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma,
+ unsigned long addr, unsigned long request, unsigned long flags);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm;
- struct vm_area_struct *next;
+ struct vm_area_struct *brkvma, *next = NULL;
unsigned long min_brk;
bool populate;
bool downgraded = false;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
if (mmap_write_lock_killable(mm))
return -EINTR;
@@ -200,35 +218,50 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/*
* Always allow shrinking brk.
- * __do_munmap() may downgrade mmap_lock to read.
+ * do_brk_munmap() may downgrade mmap_lock to read.
*/
if (brk <= mm->brk) {
int ret;
+ /* Search one past newbrk */
+ mas_set(&mas, newbrk);
+ brkvma = mas_find(&mas, oldbrk);
+ if (!brkvma || brkvma->vm_start >= oldbrk)
+ goto out; /* mapping intersects with an existing non-brk vma. */
/*
- * mm->brk must to be protected by write mmap_lock so update it
- * before downgrading mmap_lock. When __do_munmap() fails,
- * mm->brk will be restored from origbrk.
+ * mm->brk must be protected by write mmap_lock.
+ * do_brk_munmap() may downgrade the lock, so update it
+ * before calling do_brk_munmap().
*/
mm->brk = brk;
- ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
- if (ret < 0) {
- mm->brk = origbrk;
- goto out;
- } else if (ret == 1) {
+ ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf);
+ if (ret == 1) {
downgraded = true;
- }
- goto success;
+ goto success;
+ } else if (!ret)
+ goto success;
+
+ mm->brk = origbrk;
+ goto out;
}
- /* Check against existing mmap mappings. */
- next = find_vma(mm, oldbrk);
+ if (check_brk_limits(oldbrk, newbrk - oldbrk))
+ goto out;
+
+ /*
+ * Only check if the next VMA is within the stack_guard_gap of the
+ * expansion area
+ */
+ mas_set(&mas, oldbrk);
+ next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap);
if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
goto out;
+ brkvma = mas_prev(&mas, mm->start_brk);
/* Ok, looks good - let it rip. */
- if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
+ if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
goto out;
+
mm->brk = brk;
success:
@@ -247,104 +280,45 @@ out:
return origbrk;
}
-static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
-{
- unsigned long gap, prev_end;
-
- /*
- * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
- * allow two stack_guard_gaps between them here, and when choosing
- * an unmapped area; whereas when expanding we only require one.
- * That's a little inconsistent, but keeps the code here simpler.
- */
- gap = vm_start_gap(vma);
- if (vma->vm_prev) {
- prev_end = vm_end_gap(vma->vm_prev);
- if (gap > prev_end)
- gap -= prev_end;
- else
- gap = 0;
- }
- return gap;
-}
-
-#ifdef CONFIG_DEBUG_VM_RB
-static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
-{
- unsigned long max = vma_compute_gap(vma), subtree_gap;
- if (vma->vm_rb.rb_left) {
- subtree_gap = rb_entry(vma->vm_rb.rb_left,
- struct vm_area_struct, vm_rb)->rb_subtree_gap;
- if (subtree_gap > max)
- max = subtree_gap;
- }
- if (vma->vm_rb.rb_right) {
- subtree_gap = rb_entry(vma->vm_rb.rb_right,
- struct vm_area_struct, vm_rb)->rb_subtree_gap;
- if (subtree_gap > max)
- max = subtree_gap;
- }
- return max;
-}
-
-static int browse_rb(struct mm_struct *mm)
-{
- struct rb_root *root = &mm->mm_rb;
- int i = 0, j, bug = 0;
- struct rb_node *nd, *pn = NULL;
- unsigned long prev = 0, pend = 0;
-
- for (nd = rb_first(root); nd; nd = rb_next(nd)) {
- struct vm_area_struct *vma;
- vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- if (vma->vm_start < prev) {
- pr_emerg("vm_start %lx < prev %lx\n",
- vma->vm_start, prev);
- bug = 1;
- }
- if (vma->vm_start < pend) {
- pr_emerg("vm_start %lx < pend %lx\n",
- vma->vm_start, pend);
- bug = 1;
- }
- if (vma->vm_start > vma->vm_end) {
- pr_emerg("vm_start %lx > vm_end %lx\n",
- vma->vm_start, vma->vm_end);
- bug = 1;
- }
- spin_lock(&mm->page_table_lock);
- if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
- pr_emerg("free gap %lx, correct %lx\n",
- vma->rb_subtree_gap,
- vma_compute_subtree_gap(vma));
- bug = 1;
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+extern void mt_validate(struct maple_tree *mt);
+extern void mt_dump(const struct maple_tree *mt);
+
+/* Validate the maple tree */
+static void validate_mm_mt(struct mm_struct *mm)
+{
+ struct maple_tree *mt = &mm->mm_mt;
+ struct vm_area_struct *vma_mt;
+
+ MA_STATE(mas, mt, 0, 0);
+
+ mt_validate(&mm->mm_mt);
+ mas_for_each(&mas, vma_mt, ULONG_MAX) {
+ if ((vma_mt->vm_start != mas.index) ||
+ (vma_mt->vm_end - 1 != mas.last)) {
+ pr_emerg("issue in %s\n", current->comm);
+ dump_stack();
+ dump_vma(vma_mt);
+ pr_emerg("mt piv: %p %lu - %lu\n", vma_mt,
+ mas.index, mas.last);
+ pr_emerg("mt vma: %p %lu - %lu\n", vma_mt,
+ vma_mt->vm_start, vma_mt->vm_end);
+
+ mt_dump(mas.tree);
+ if (vma_mt->vm_end != mas.last + 1) {
+ pr_err("vma: %p vma_mt %lu-%lu\tmt %lu-%lu\n",
+ mm, vma_mt->vm_start, vma_mt->vm_end,
+ mas.index, mas.last);
+ mt_dump(mas.tree);
+ }
+ VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm);
+ if (vma_mt->vm_start != mas.index) {
+ pr_err("vma: %p vma_mt %p %lu - %lu doesn't match\n",
+ mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end);
+ mt_dump(mas.tree);
+ }
+ VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm);
}
- spin_unlock(&mm->page_table_lock);
- i++;
- pn = nd;
- prev = vma->vm_start;
- pend = vma->vm_end;
- }
- j = 0;
- for (nd = pn; nd; nd = rb_prev(nd))
- j++;
- if (i != j) {
- pr_emerg("backwards %d, forwards %d\n", j, i);
- bug = 1;
- }
- return bug ? -1 : i;
-}
-
-static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
-{
- struct rb_node *nd;
-
- for (nd = rb_first(root); nd; nd = rb_next(nd)) {
- struct vm_area_struct *vma;
- vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- VM_BUG_ON_VMA(vma != ignore &&
- vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
- vma);
}
}
@@ -352,10 +326,13 @@ static void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
- unsigned long highest_address = 0;
- struct vm_area_struct *vma = mm->mmap;
+ struct vm_area_struct *vma;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
- while (vma) {
+ validate_mm_mt(mm);
+
+ mas_for_each(&mas, vma, ULONG_MAX) {
+#ifdef CONFIG_DEBUG_VM_RB
struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;
@@ -365,93 +342,20 @@ static void validate_mm(struct mm_struct *mm)
anon_vma_interval_tree_verify(avc);
anon_vma_unlock_read(anon_vma);
}
-
- highest_address = vm_end_gap(vma);
- vma = vma->vm_next;
+#endif
i++;
}
if (i != mm->map_count) {
- pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
- bug = 1;
- }
- if (highest_address != mm->highest_vm_end) {
- pr_emerg("mm->highest_vm_end %lx, found %lx\n",
- mm->highest_vm_end, highest_address);
- bug = 1;
- }
- i = browse_rb(mm);
- if (i != mm->map_count) {
- if (i != -1)
- pr_emerg("map_count %d rb %d\n", mm->map_count, i);
+ pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i);
bug = 1;
}
VM_BUG_ON_MM(bug, mm);
}
-#else
-#define validate_mm_rb(root, ignore) do { } while (0)
-#define validate_mm(mm) do { } while (0)
-#endif
-
-RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
- struct vm_area_struct, vm_rb,
- unsigned long, rb_subtree_gap, vma_compute_gap)
-
-/*
- * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
- * vma->vm_prev->vm_end values changed, without modifying the vma's position
- * in the rbtree.
- */
-static void vma_gap_update(struct vm_area_struct *vma)
-{
- /*
- * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
- * a callback function that does exactly what we want.
- */
- vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
-}
-
-static inline void vma_rb_insert(struct vm_area_struct *vma,
- struct rb_root *root)
-{
- /* All rb_subtree_gap values must be consistent prior to insertion */
- validate_mm_rb(root, NULL);
-
- rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
-}
-
-static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
-{
- /*
- * Note rb_erase_augmented is a fairly large inline function,
- * so make sure we instantiate it only once with our desired
- * augmented rbtree callbacks.
- */
- rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
-}
-
-static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
- struct rb_root *root,
- struct vm_area_struct *ignore)
-{
- /*
- * All rb_subtree_gap values must be consistent prior to erase,
- * with the possible exception of
- *
- * a. the "next" vma being erased if next->vm_start was reduced in
- * __vma_adjust() -> __vma_unlink()
- * b. the vma being erased in detach_vmas_to_be_unmapped() ->
- * vma_rb_erase()
- */
- validate_mm_rb(root, ignore);
- __vma_rb_erase(vma, root);
-}
-
-static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
- struct rb_root *root)
-{
- vma_rb_erase_ignore(vma, root, vma);
-}
+#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */
+#define validate_mm_mt(root) do { } while (0)
+#define validate_mm(mm) do { } while (0)
+#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
/*
* vma has some anon_vma assigned, and is already inserted on that
@@ -485,208 +389,220 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}
-static int find_vma_links(struct mm_struct *mm, unsigned long addr,
- unsigned long end, struct vm_area_struct **pprev,
- struct rb_node ***rb_link, struct rb_node **rb_parent)
+static unsigned long count_vma_pages_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end)
{
- struct rb_node **__rb_link, *__rb_parent, *rb_prev;
+ VMA_ITERATOR(vmi, mm, addr);
+ struct vm_area_struct *vma;
+ unsigned long nr_pages = 0;
- mmap_assert_locked(mm);
- __rb_link = &mm->mm_rb.rb_node;
- rb_prev = __rb_parent = NULL;
+ for_each_vma_range(vmi, vma, end) {
+ unsigned long vm_start = max(addr, vma->vm_start);
+ unsigned long vm_end = min(end, vma->vm_end);
- while (*__rb_link) {
- struct vm_area_struct *vma_tmp;
+ nr_pages += PHYS_PFN(vm_end - vm_start);
+ }
- __rb_parent = *__rb_link;
- vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
+ return nr_pages;
+}
- if (vma_tmp->vm_end > addr) {
- /* Fail if an existing vma overlaps the area */
- if (vma_tmp->vm_start < end)
- return -ENOMEM;
- __rb_link = &__rb_parent->rb_left;
- } else {
- rb_prev = __rb_parent;
- __rb_link = &__rb_parent->rb_right;
- }
- }
+static void __vma_link_file(struct vm_area_struct *vma,
+ struct address_space *mapping)
+{
+ if (vma->vm_flags & VM_SHARED)
+ mapping_allow_writable(mapping);
- *pprev = NULL;
- if (rb_prev)
- *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
- *rb_link = __rb_link;
- *rb_parent = __rb_parent;
- return 0;
+ flush_dcache_mmap_lock(mapping);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
}
/*
- * vma_next() - Get the next VMA.
- * @mm: The mm_struct.
- * @vma: The current vma.
+ * vma_mas_store() - Store a VMA in the maple tree.
+ * @vma: The vm_area_struct
+ * @mas: The maple state
*
- * If @vma is NULL, return the first vma in the mm.
+ * Efficient way to store a VMA in the maple tree when the @mas has already
+ * walked to the correct location.
*
- * Returns: The next VMA after @vma.
+ * Note: the end address is inclusive in the maple tree.
*/
-static inline struct vm_area_struct *vma_next(struct mm_struct *mm,
- struct vm_area_struct *vma)
+void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas)
{
- if (!vma)
- return mm->mmap;
-
- return vma->vm_next;
+ trace_vma_store(mas->tree, vma);
+ mas_set_range(mas, vma->vm_start, vma->vm_end - 1);
+ mas_store_prealloc(mas, vma);
}
/*
- * munmap_vma_range() - munmap VMAs that overlap a range.
- * @mm: The mm struct
- * @start: The start of the range.
- * @len: The length of the range.
- * @pprev: pointer to the pointer that will be set to previous vm_area_struct
- * @rb_link: the rb_node
- * @rb_parent: the parent rb_node
- *
- * Find all the vm_area_struct that overlap from @start to
- * @end and munmap them. Set @pprev to the previous vm_area_struct.
+ * vma_mas_remove() - Remove a VMA from the maple tree.
+ * @vma: The vm_area_struct
+ * @mas: The maple state
*
- * Returns: -ENOMEM on munmap failure or 0 on success.
+ * Efficient way to remove a VMA from the maple tree when the @mas has already
+ * been established and points to the correct location.
+ * Note: the end address is inclusive in the maple tree.
*/
-static inline int
-munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
- struct vm_area_struct **pprev, struct rb_node ***link,
- struct rb_node **parent, struct list_head *uf)
+void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas)
{
-
- while (find_vma_links(mm, start, start + len, pprev, link, parent))
- if (do_munmap(mm, start, len, uf))
- return -ENOMEM;
-
- return 0;
+ trace_vma_mas_szero(mas->tree, vma->vm_start, vma->vm_end - 1);
+ mas->index = vma->vm_start;
+ mas->last = vma->vm_end - 1;
+ mas_store_prealloc(mas, NULL);
}
-static unsigned long count_vma_pages_range(struct mm_struct *mm,
- unsigned long addr, unsigned long end)
+
+/*
+ * vma_mas_szero() - Set a given range to zero. Used when modifying a
+ * vm_area_struct start or end.
+ *
+ * @mas: The maple tree ma_state
+ * @start: The start address to zero
+ * @end: The end address to zero.
+ */
+static inline void vma_mas_szero(struct ma_state *mas, unsigned long start,
+ unsigned long end)
{
- unsigned long nr_pages = 0;
- struct vm_area_struct *vma;
+ trace_vma_mas_szero(mas->tree, start, end - 1);
+ mas_set_range(mas, start, end - 1);
+ mas_store_prealloc(mas, NULL);
+}
- /* Find first overlapping mapping */
- vma = find_vma_intersection(mm, addr, end);
- if (!vma)
- return 0;
+static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ struct address_space *mapping = NULL;
- nr_pages = (min(end, vma->vm_end) -
- max(addr, vma->vm_start)) >> PAGE_SHIFT;
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ return -ENOMEM;
- /* Iterate over the rest of the overlaps */
- for (vma = vma->vm_next; vma; vma = vma->vm_next) {
- unsigned long overlap_len;
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+ i_mmap_lock_write(mapping);
+ }
- if (vma->vm_start > end)
- break;
+ vma_mas_store(vma, &mas);
- overlap_len = min(end, vma->vm_end) - vma->vm_start;
- nr_pages += overlap_len >> PAGE_SHIFT;
+ if (mapping) {
+ __vma_link_file(vma, mapping);
+ i_mmap_unlock_write(mapping);
}
- return nr_pages;
+ mm->map_count++;
+ validate_mm(mm);
+ return 0;
}
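
vma_link() above is the canonical user of these helpers: maple tree nodes are preallocated while failure is still recoverable, so the later store cannot fail. In isolation the pattern is roughly (sketch only, hypothetical helper name):

/* Sketch only: the preallocate-then-store shape that vma_link() follows. */
static int store_vma_sketch(struct mm_struct *mm, struct vm_area_struct *vma)
{
	MA_STATE(mas, &mm->mm_mt, 0, 0);

	if (mas_preallocate(&mas, vma, GFP_KERNEL))
		return -ENOMEM;		/* fail before anything is modified */

	/* ... make the VMA and any rmap structures consistent here ... */

	vma_mas_store(vma, &mas);	/* cannot fail: nodes were preallocated */
	return 0;
}
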
-void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
- struct rb_node **rb_link, struct rb_node *rb_parent)
+/*
+ * vma_expand - Expand an existing VMA
+ *
+ * @mas: The maple state
+ * @vma: The vma to expand
+ * @start: The start of the vma
+ * @end: The exclusive end of the vma
+ * @pgoff: The page offset of vma
+ * @next: The vma following @vma, if any.
+ *
+ * Expand @vma to cover @start to @end. The expansion may move either the
+ * start or the end of @vma. @next is expanded over when it differs from @vma
+ * and @end == @next->vm_end. Checking whether @vma can expand and merge with
+ * @next must be handled by the caller.
+ *
+ * Returns: 0 on success
+ */
+inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, pgoff_t pgoff,
+ struct vm_area_struct *next)
{
- /* Update tracking information for the gap following the new vma. */
- if (vma->vm_next)
- vma_gap_update(vma->vm_next);
- else
- mm->highest_vm_end = vm_end_gap(vma);
+ struct mm_struct *mm = vma->vm_mm;
+ struct address_space *mapping = NULL;
+ struct rb_root_cached *root = NULL;
+ struct anon_vma *anon_vma = vma->anon_vma;
+ struct file *file = vma->vm_file;
+ bool remove_next = false;
- /*
- * vma->vm_prev wasn't known when we followed the rbtree to find the
- * correct insertion point for that vma. As a result, we could not
- * update the vma vm_rb parents rb_subtree_gap values on the way down.
- * So, we first insert the vma with a zero rb_subtree_gap value
- * (to be consistent with what we did on the way down), and then
- * immediately update the gap to the correct value. Finally we
- * rebalance the rbtree after all augmented values have been set.
- */
- rb_link_node(&vma->vm_rb, rb_parent, rb_link);
- vma->rb_subtree_gap = 0;
- vma_gap_update(vma);
- vma_rb_insert(vma, &mm->mm_rb);
-}
+ if (next && (vma != next) && (end == next->vm_end)) {
+ remove_next = true;
+ if (next->anon_vma && !vma->anon_vma) {
+ int error;
-static void __vma_link_file(struct vm_area_struct *vma)
-{
- struct file *file;
+ anon_vma = next->anon_vma;
+ vma->anon_vma = anon_vma;
+ error = anon_vma_clone(vma, next);
+ if (error)
+ return error;
+ }
+ }
+
+ /* Not merging but overwriting any part of next is not handled. */
+ VM_BUG_ON(next && !remove_next && next != vma && end > next->vm_start);
+ /* Only handles expanding */
+ VM_BUG_ON(vma->vm_start < start || vma->vm_end > end);
+
+ if (mas_preallocate(mas, vma, GFP_KERNEL))
+ goto nomem;
+
+ vma_adjust_trans_huge(vma, start, end, 0);
- file = vma->vm_file;
if (file) {
- struct address_space *mapping = file->f_mapping;
+ mapping = file->f_mapping;
+ root = &mapping->i_mmap;
+ uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+ i_mmap_lock_write(mapping);
+ }
- if (vma->vm_flags & VM_SHARED)
- mapping_allow_writable(mapping);
+ if (anon_vma) {
+ anon_vma_lock_write(anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ }
+ if (file) {
flush_dcache_mmap_lock(mapping);
- vma_interval_tree_insert(vma, &mapping->i_mmap);
- flush_dcache_mmap_unlock(mapping);
+ vma_interval_tree_remove(vma, root);
}
-}
-static void
-__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node **rb_link,
- struct rb_node *rb_parent)
-{
- __vma_link_list(mm, vma, prev);
- __vma_link_rb(mm, vma, rb_link, rb_parent);
-}
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+ /* Note: mas must be pointing to the expanding VMA */
+ vma_mas_store(vma, mas);
-static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node **rb_link,
- struct rb_node *rb_parent)
-{
- struct address_space *mapping = NULL;
+ if (file) {
+ vma_interval_tree_insert(vma, root);
+ flush_dcache_mmap_unlock(mapping);
+ }
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_write(mapping);
+ /* Expanding over the next vma */
+ if (remove_next && file) {
+ __remove_shared_vm_struct(next, file, mapping);
}
- __vma_link(mm, vma, prev, rb_link, rb_parent);
- __vma_link_file(vma);
+ if (anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(anon_vma);
+ }
- if (mapping)
+ if (file) {
i_mmap_unlock_write(mapping);
+ uprobe_mmap(vma);
+ }
- mm->map_count++;
- validate_mm(mm);
-}
-
-/*
- * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
- * mm's list and rbtree. It has already been inserted into the interval tree.
- */
-static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- struct vm_area_struct *prev;
- struct rb_node **rb_link, *rb_parent;
+ if (remove_next) {
+ if (file) {
+ uprobe_munmap(next, next->vm_start, next->vm_end);
+ fput(file);
+ }
+ if (next->anon_vma)
+ anon_vma_merge(vma, next);
+ mm->map_count--;
+ mpol_put(vma_policy(next));
+ vm_area_free(next);
+ }
- if (find_vma_links(mm, vma->vm_start, vma->vm_end,
- &prev, &rb_link, &rb_parent))
- BUG();
- __vma_link(mm, vma, prev, rb_link, rb_parent);
- mm->map_count++;
-}
+ validate_mm(mm);
+ return 0;
-static __always_inline void __vma_unlink(struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct vm_area_struct *ignore)
-{
- vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
- __vma_unlink_list(mm, vma);
- /* Kill the cache */
- vmacache_invalidate(mm);
+nomem:
+ return -ENOMEM;
}
/*
@@ -701,18 +617,20 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
struct vm_area_struct *expand)
{
struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
+ struct vm_area_struct *next_next = NULL; /* uninit var warning */
+ struct vm_area_struct *next = find_vma(mm, vma->vm_end);
+ struct vm_area_struct *orig_vma = vma;
struct address_space *mapping = NULL;
struct rb_root_cached *root = NULL;
struct anon_vma *anon_vma = NULL;
struct file *file = vma->vm_file;
- bool start_changed = false, end_changed = false;
+ bool vma_changed = false;
long adjust_next = 0;
int remove_next = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ struct vm_area_struct *exporter = NULL, *importer = NULL;
if (next && !insert) {
- struct vm_area_struct *exporter = NULL, *importer = NULL;
-
if (end >= next->vm_end) {
/*
* vma expands, overlapping all the next, and
@@ -741,10 +659,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* remove_next == 1 is case 1 or 7.
*/
remove_next = 1 + (end > next->vm_end);
+ if (remove_next == 2)
+ next_next = find_vma(mm, next->vm_end);
+
VM_WARN_ON(remove_next == 2 &&
- end != next->vm_next->vm_end);
- /* trim end to next, for case 6 first pass */
- end = next->vm_end;
+ end != next_next->vm_end);
}
exporter = next;
@@ -755,7 +674,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* next, if the vma overlaps with it.
*/
if (remove_next == 2 && !next->anon_vma)
- exporter = next->vm_next;
+ exporter = next_next;
} else if (end > next->vm_start) {
/*
@@ -792,9 +711,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
return error;
}
}
-again:
- vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+
+ vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
if (file) {
mapping = file->f_mapping;
root = &mapping->i_mmap;
@@ -804,14 +725,14 @@ again:
uprobe_munmap(next, next->vm_start, next->vm_end);
i_mmap_lock_write(mapping);
- if (insert) {
+ if (insert && insert->vm_file) {
/*
* Put into interval tree now, so instantiated pages
* are visible to arm/parisc __flush_dcache_page
* throughout; but we cannot insert into address
* space until vma start or end is updated.
*/
- __vma_link_file(insert);
+ __vma_link_file(insert, insert->vm_file->f_mapping);
}
}
@@ -835,17 +756,37 @@ again:
}
if (start != vma->vm_start) {
+ if ((vma->vm_start < start) &&
+ (!insert || (insert->vm_end != start))) {
+ vma_mas_szero(&mas, vma->vm_start, start);
+ VM_WARN_ON(insert && insert->vm_start > vma->vm_start);
+ } else {
+ vma_changed = true;
+ }
vma->vm_start = start;
- start_changed = true;
}
if (end != vma->vm_end) {
+ if (vma->vm_end > end) {
+ if (!insert || (insert->vm_start != end)) {
+ vma_mas_szero(&mas, end, vma->vm_end);
+ mas_reset(&mas);
+ VM_WARN_ON(insert &&
+ insert->vm_end < vma->vm_end);
+ }
+ } else {
+ vma_changed = true;
+ }
vma->vm_end = end;
- end_changed = true;
}
+
+ if (vma_changed)
+ vma_mas_store(vma, &mas);
+
vma->vm_pgoff = pgoff;
if (adjust_next) {
next->vm_start += adjust_next;
next->vm_pgoff += adjust_next >> PAGE_SHIFT;
+ vma_mas_store(next, &mas);
}
if (file) {
@@ -855,42 +796,19 @@ again:
flush_dcache_mmap_unlock(mapping);
}
- if (remove_next) {
- /*
- * vma_merge has merged next into vma, and needs
- * us to remove next before dropping the locks.
- */
- if (remove_next != 3)
- __vma_unlink(mm, next, next);
- else
- /*
- * vma is not before next if they've been
- * swapped.
- *
- * pre-swap() next->vm_start was reduced so
- * tell validate_mm_rb to ignore pre-swap()
- * "next" (which is stored in post-swap()
- * "vma").
- */
- __vma_unlink(mm, next, vma);
- if (file)
- __remove_shared_vm_struct(next, file, mapping);
+ if (remove_next && file) {
+ __remove_shared_vm_struct(next, file, mapping);
+ if (remove_next == 2)
+ __remove_shared_vm_struct(next_next, file, mapping);
} else if (insert) {
/*
* split_vma has split insert from vma, and needs
* us to insert it before dropping the locks
* (it may either follow vma or precede it).
*/
- __insert_vm_struct(mm, insert);
- } else {
- if (start_changed)
- vma_gap_update(vma);
- if (end_changed) {
- if (!next)
- mm->highest_vm_end = vm_end_gap(vma);
- else if (!adjust_next)
- vma_gap_update(next);
- }
+ mas_reset(&mas);
+ vma_mas_store(insert, &mas);
+ mm->map_count++;
}
if (anon_vma) {
@@ -909,6 +827,7 @@ again:
}
if (remove_next) {
+again:
if (file) {
uprobe_munmap(next, next->vm_start, next->vm_end);
fput(file);
@@ -917,66 +836,24 @@ again:
anon_vma_merge(vma, next);
mm->map_count--;
mpol_put(vma_policy(next));
+ if (remove_next != 2)
+ BUG_ON(vma->vm_end < next->vm_end);
vm_area_free(next);
+
/*
* In mprotect's case 6 (see comments on vma_merge),
- * we must remove another next too. It would clutter
- * up the code too much to do both in one go.
+ * we must remove next_next too.
*/
- if (remove_next != 3) {
- /*
- * If "next" was removed and vma->vm_end was
- * expanded (up) over it, in turn
- * "next->vm_prev->vm_end" changed and the
- * "vma->vm_next" gap must be updated.
- */
- next = vma->vm_next;
- } else {
- /*
- * For the scope of the comment "next" and
- * "vma" considered pre-swap(): if "vma" was
- * removed, next->vm_start was expanded (down)
- * over it and the "next" gap must be updated.
- * Because of the swap() the post-swap() "vma"
- * actually points to pre-swap() "next"
- * (post-swap() "next" as opposed is now a
- * dangling pointer).
- */
- next = vma;
- }
if (remove_next == 2) {
remove_next = 1;
- end = next->vm_end;
+ next = next_next;
goto again;
}
- else if (next)
- vma_gap_update(next);
- else {
- /*
- * If remove_next == 2 we obviously can't
- * reach this path.
- *
- * If remove_next == 3 we can't reach this
- * path because pre-swap() next is always not
- * NULL. pre-swap() "next" is not being
- * removed and its next->vm_end is not altered
- * (and furthermore "end" already matches
- * next->vm_end in remove_next == 3).
- *
- * We reach this only in the remove_next == 1
- * case if the "next" vma that was removed was
- * the highest vma of the mm. However in such
- * case next->vm_end == "end" and the extended
- * "vma" has vma->vm_end == next->vm_end so
- * mm->highest_vm_end doesn't need any update
- * in remove_next == 1 case.
- */
- VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
- }
}
if (insert && file)
uprobe_mmap(insert);
+ mas_destroy(&mas);
validate_mm(mm);
return 0;
@@ -1128,8 +1005,10 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct anon_vma_name *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
- struct vm_area_struct *area, *next;
- int err;
+ struct vm_area_struct *mid, *next, *res;
+ int err = -1;
+ bool merge_prev = false;
+ bool merge_next = false;
/*
* We later require that vma->vm_flags == vm_flags,
@@ -1138,76 +1017,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
if (vm_flags & VM_SPECIAL)
return NULL;
- next = vma_next(mm, prev);
- area = next;
- if (area && area->vm_end == end) /* cases 6, 7, 8 */
- next = next->vm_next;
+ next = find_vma(mm, prev ? prev->vm_end : 0);
+ mid = next;
+ if (next && next->vm_end == end) /* cases 6, 7, 8 */
+ next = find_vma(mm, next->vm_end);
/* verify some invariant that must be enforced by the caller */
VM_WARN_ON(prev && addr <= prev->vm_start);
- VM_WARN_ON(area && end > area->vm_end);
+ VM_WARN_ON(mid && end > mid->vm_end);
VM_WARN_ON(addr >= end);
- /*
- * Can it merge with the predecessor?
- */
+ /* Can we merge the predecessor? */
if (prev && prev->vm_end == addr &&
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff,
vm_userfaultfd_ctx, anon_name)) {
- /*
- * OK, it can. Can we now merge in the successor as well?
- */
- if (next && end == next->vm_start &&
- mpol_equal(policy, vma_policy(next)) &&
- can_vma_merge_before(next, vm_flags,
- anon_vma, file,
- pgoff+pglen,
- vm_userfaultfd_ctx, anon_name) &&
- is_mergeable_anon_vma(prev->anon_vma,
- next->anon_vma, NULL)) {
- /* cases 1, 6 */
- err = __vma_adjust(prev, prev->vm_start,
- next->vm_end, prev->vm_pgoff, NULL,
- prev);
- } else /* cases 2, 5, 7 */
- err = __vma_adjust(prev, prev->vm_start,
- end, prev->vm_pgoff, NULL, prev);
- if (err)
- return NULL;
- khugepaged_enter_vma(prev, vm_flags);
- return prev;
+ merge_prev = true;
}
-
- /*
- * Can this new request be merged in front of next?
- */
+ /* Can we merge the successor? */
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen,
vm_userfaultfd_ctx, anon_name)) {
+ merge_next = true;
+ }
+ /* Can we merge both the predecessor and the successor? */
+ if (merge_prev && merge_next &&
+ is_mergeable_anon_vma(prev->anon_vma,
+ next->anon_vma, NULL)) { /* cases 1, 6 */
+ err = __vma_adjust(prev, prev->vm_start,
+ next->vm_end, prev->vm_pgoff, NULL,
+ prev);
+ res = prev;
+ } else if (merge_prev) { /* cases 2, 5, 7 */
+ err = __vma_adjust(prev, prev->vm_start,
+ end, prev->vm_pgoff, NULL, prev);
+ res = prev;
+ } else if (merge_next) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
- addr, prev->vm_pgoff, NULL, next);
- else { /* cases 3, 8 */
- err = __vma_adjust(area, addr, next->vm_end,
- next->vm_pgoff - pglen, NULL, next);
- /*
- * In case 3 area is already equal to next and
- * this is a noop, but in case 8 "area" has
- * been removed and next was expanded over it.
- */
- area = next;
- }
- if (err)
- return NULL;
- khugepaged_enter_vma(area, vm_flags);
- return area;
+ addr, prev->vm_pgoff, NULL, next);
+ else /* cases 3, 8 */
+ err = __vma_adjust(mid, addr, next->vm_end,
+ next->vm_pgoff - pglen, NULL, next);
+ res = next;
}
- return NULL;
+ /*
+	 * Could not merge with the predecessor or successor, or __vma_adjust() failed.
+ */
+ if (err)
+ return NULL;
+ khugepaged_enter_vma(res, vm_flags);
+ return res;
}
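The rewritten vma_merge() keeps its external contract: it returns the (possibly expanded) VMA on success and NULL when nothing could be merged or __vma_adjust() failed, so callers retain the usual try-to-merge-then-allocate pattern. A minimal caller sketch, not part of the patch, with error handling and VMA linking trimmed:

	struct vm_area_struct *vma;

	vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
			NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
	if (!vma) {
		/* No neighbour could absorb the range: fall back to a new VMA. */
		vma = vm_area_alloc(mm);
		if (!vma)
			return -ENOMEM;
		vma->vm_start = addr;
		vma->vm_end = addr + len;
		vma->vm_flags = vm_flags;
		vma->vm_pgoff = pgoff;
		/* ... set vm_page_prot and link the VMA into mm->mm_mt ... */
	}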
/*
@@ -1275,18 +1139,24 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_
*/
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
+ MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end);
struct anon_vma *anon_vma = NULL;
+ struct vm_area_struct *prev, *next;
/* Try next first. */
- if (vma->vm_next) {
- anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
+ next = mas_walk(&mas);
+ if (next) {
+ anon_vma = reusable_anon_vma(next, vma, next);
if (anon_vma)
return anon_vma;
}
+ prev = mas_prev(&mas, 0);
+ VM_BUG_ON_VMA(prev != vma, vma);
+ prev = mas_prev(&mas, 0);
/* Try prev next. */
- if (vma->vm_prev)
- anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);
+ if (prev)
+ anon_vma = reusable_anon_vma(prev, prev, vma);
/*
* We might reach here with anon_vma == NULL if we can't find
@@ -1375,6 +1245,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags_t vm_flags;
int pkey = 0;
+ validate_mm(mm);
*populate = 0;
if (!len)
@@ -1678,388 +1549,63 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
}
-unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
- struct list_head *uf)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev, *merge;
- int error;
- struct rb_node **rb_link, *rb_parent;
- unsigned long charged = 0;
-
- /* Check against address space limit. */
- if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
- unsigned long nr_pages;
-
- /*
- * MAP_FIXED may remove pages of mappings that intersects with
- * requested mapping. Account for the pages it would unmap.
- */
- nr_pages = count_vma_pages_range(mm, addr, addr + len);
-
- if (!may_expand_vm(mm, vm_flags,
- (len >> PAGE_SHIFT) - nr_pages))
- return -ENOMEM;
- }
-
- /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
- if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
- return -ENOMEM;
- /*
- * Private writable mapping: check memory availability
- */
- if (accountable_mapping(file, vm_flags)) {
- charged = len >> PAGE_SHIFT;
- if (security_vm_enough_memory_mm(mm, charged))
- return -ENOMEM;
- vm_flags |= VM_ACCOUNT;
- }
-
- /*
- * Can we just expand an old mapping?
- */
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
- NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
- if (vma)
- goto out;
-
- /*
- * Determine the object being mapped and call the appropriate
- * specific mapper. the address has already been validated, but
- * not unmapped, but the maps are removed from the list.
- */
- vma = vm_area_alloc(mm);
- if (!vma) {
- error = -ENOMEM;
- goto unacct_error;
- }
-
- vma->vm_start = addr;
- vma->vm_end = addr + len;
- vma->vm_flags = vm_flags;
- vma->vm_page_prot = vm_get_page_prot(vm_flags);
- vma->vm_pgoff = pgoff;
-
- if (file) {
- if (vm_flags & VM_SHARED) {
- error = mapping_map_writable(file->f_mapping);
- if (error)
- goto free_vma;
- }
-
- vma->vm_file = get_file(file);
- error = call_mmap(file, vma);
- if (error)
- goto unmap_and_free_vma;
-
- /* Can addr have changed??
- *
- * Answer: Yes, several device drivers can do it in their
- * f_op->mmap method. -DaveM
- * Bug: If addr is changed, prev, rb_link, rb_parent should
- * be updated for vma_link()
- */
- WARN_ON_ONCE(addr != vma->vm_start);
-
- addr = vma->vm_start;
-
- /* If vm_flags changed after call_mmap(), we should try merge vma again
- * as we may succeed this time.
- */
- if (unlikely(vm_flags != vma->vm_flags && prev)) {
- merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
- NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
- if (merge) {
- /* ->mmap() can change vma->vm_file and fput the original file. So
- * fput the vma->vm_file here or we would add an extra fput for file
- * and cause general protection fault ultimately.
- */
- fput(vma->vm_file);
- vm_area_free(vma);
- vma = merge;
- /* Update vm_flags to pick up the change. */
- vm_flags = vma->vm_flags;
- goto unmap_writable;
- }
- }
-
- vm_flags = vma->vm_flags;
- } else if (vm_flags & VM_SHARED) {
- error = shmem_zero_setup(vma);
- if (error)
- goto free_vma;
- } else {
- vma_set_anonymous(vma);
- }
-
- /* Allow architectures to sanity-check the vm_flags */
- if (!arch_validate_flags(vma->vm_flags)) {
- error = -EINVAL;
- if (file)
- goto unmap_and_free_vma;
- else
- goto free_vma;
- }
-
- vma_link(mm, vma, prev, rb_link, rb_parent);
-
- /*
- * vma_merge() calls khugepaged_enter_vma() either, the below
- * call covers the non-merge case.
- */
- khugepaged_enter_vma(vma, vma->vm_flags);
-
- /* Once vma denies write, undo our temporary denial count */
-unmap_writable:
- if (file && vm_flags & VM_SHARED)
- mapping_unmap_writable(file->f_mapping);
- file = vma->vm_file;
-out:
- perf_event_mmap(vma);
-
- vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
- if (vm_flags & VM_LOCKED) {
- if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
- is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current->mm))
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
- else
- mm->locked_vm += (len >> PAGE_SHIFT);
- }
-
- if (file)
- uprobe_mmap(vma);
-
- /*
- * New (or expanded) vma always get soft dirty status.
- * Otherwise user-space soft-dirty page tracker won't
- * be able to distinguish situation when vma area unmapped,
- * then new mapped in-place (which must be aimed as
- * a completely new data area).
- */
- vma->vm_flags |= VM_SOFTDIRTY;
-
- vma_set_page_prot(vma);
-
- return addr;
-
-unmap_and_free_vma:
- fput(vma->vm_file);
- vma->vm_file = NULL;
-
- /* Undo any partial mapping done by a device driver. */
- unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
- if (vm_flags & VM_SHARED)
- mapping_unmap_writable(file->f_mapping);
-free_vma:
- vm_area_free(vma);
-unacct_error:
- if (charged)
- vm_unacct_memory(charged);
- return error;
-}
-
+/**
+ * unmapped_area() - Find an area between the low_limit and the high_limit with
+ * the correct alignment and offset, all from @info. Note: current->mm is used
+ * for the search.
+ *
+ * @info: The unmapped area information including the range (low_limit -
+ * high_limit), the alignment offset and mask.
+ *
+ * Return: A memory address or -ENOMEM.
+ */
static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
- /*
- * We implement the search by looking for an rbtree node that
- * immediately follows a suitable gap. That is,
- * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
- * - gap_end = vma->vm_start >= info->low_limit + length;
- * - gap_end - gap_start >= length
- */
+ unsigned long length, gap;
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long length, low_limit, high_limit, gap_start, gap_end;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;
- /* Adjust search limits by the desired length */
- if (info->high_limit < length)
+ if (mas_empty_area(&mas, info->low_limit, info->high_limit - 1,
+ length))
return -ENOMEM;
- high_limit = info->high_limit - length;
- if (info->low_limit > high_limit)
- return -ENOMEM;
- low_limit = info->low_limit + length;
-
- /* Check if rbtree root looks promising */
- if (RB_EMPTY_ROOT(&mm->mm_rb))
- goto check_highest;
- vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
- if (vma->rb_subtree_gap < length)
- goto check_highest;
-
- while (true) {
- /* Visit left subtree if it looks promising */
- gap_end = vm_start_gap(vma);
- if (gap_end >= low_limit && vma->vm_rb.rb_left) {
- struct vm_area_struct *left =
- rb_entry(vma->vm_rb.rb_left,
- struct vm_area_struct, vm_rb);
- if (left->rb_subtree_gap >= length) {
- vma = left;
- continue;
- }
- }
-
- gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
-check_current:
- /* Check if current node has a suitable gap */
- if (gap_start > high_limit)
- return -ENOMEM;
- if (gap_end >= low_limit &&
- gap_end > gap_start && gap_end - gap_start >= length)
- goto found;
-
- /* Visit right subtree if it looks promising */
- if (vma->vm_rb.rb_right) {
- struct vm_area_struct *right =
- rb_entry(vma->vm_rb.rb_right,
- struct vm_area_struct, vm_rb);
- if (right->rb_subtree_gap >= length) {
- vma = right;
- continue;
- }
- }
-
- /* Go back up the rbtree to find next candidate node */
- while (true) {
- struct rb_node *prev = &vma->vm_rb;
- if (!rb_parent(prev))
- goto check_highest;
- vma = rb_entry(rb_parent(prev),
- struct vm_area_struct, vm_rb);
- if (prev == vma->vm_rb.rb_left) {
- gap_start = vm_end_gap(vma->vm_prev);
- gap_end = vm_start_gap(vma);
- goto check_current;
- }
- }
- }
-
-check_highest:
- /* Check highest gap, which does not precede any rbtree node */
- gap_start = mm->highest_vm_end;
- gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
- if (gap_start > high_limit)
- return -ENOMEM;
-
-found:
- /* We found a suitable gap. Clip it with the original low_limit. */
- if (gap_start < info->low_limit)
- gap_start = info->low_limit;
-
- /* Adjust gap address to the desired alignment */
- gap_start += (info->align_offset - gap_start) & info->align_mask;
-
- VM_BUG_ON(gap_start + info->length > info->high_limit);
- VM_BUG_ON(gap_start + info->length > gap_end);
- return gap_start;
+ gap = mas.index;
+ gap += (info->align_offset - gap) & info->align_mask;
+ return gap;
}
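mas_empty_area() leaves mas.index at the lowest address of a gap that is large enough even after the worst-case alignment padding, so the final two lines only need to round up. A worked example of that arithmetic, with illustrative values that are not taken from the patch:

	/*
	 * Suppose mas.index == 0x1234000, info->align_mask == 0x1fffff
	 * (2 MB - 1) and info->align_offset == 0:
	 *
	 *	gap  = 0x1234000;
	 *	gap += (0 - 0x1234000) & 0x1fffff;	// adds 0x1cc000
	 *	gap == 0x1400000			// next 2 MB boundary
	 *
	 * Because the search length was padded by align_mask up front,
	 * gap + info->length still fits inside the gap that was found.
	 */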
+/**
+ * unmapped_area_topdown() - Find an area between the low_limit and the
+ * high_limit with the correct alignment and offset at the highest available
+ * address, all from @info. Note: current->mm is used for the search.
+ *
+ * @info: The unmapped area information including the range (low_limit -
+ * high_limit), the alignment offset and mask.
+ *
+ * Return: A memory address or -ENOMEM.
+ */
static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long length, low_limit, high_limit, gap_start, gap_end;
+ unsigned long length, gap;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;
- /*
- * Adjust search limits by the desired length.
- * See implementation comment at top of unmapped_area().
- */
- gap_end = info->high_limit;
- if (gap_end < length)
- return -ENOMEM;
- high_limit = gap_end - length;
-
- if (info->low_limit > high_limit)
- return -ENOMEM;
- low_limit = info->low_limit + length;
-
- /* Check highest gap, which does not precede any rbtree node */
- gap_start = mm->highest_vm_end;
- if (gap_start <= high_limit)
- goto found_highest;
-
- /* Check if rbtree root looks promising */
- if (RB_EMPTY_ROOT(&mm->mm_rb))
- return -ENOMEM;
- vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
- if (vma->rb_subtree_gap < length)
+ if (mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1,
+ length))
return -ENOMEM;
- while (true) {
- /* Visit right subtree if it looks promising */
- gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
- if (gap_start <= high_limit && vma->vm_rb.rb_right) {
- struct vm_area_struct *right =
- rb_entry(vma->vm_rb.rb_right,
- struct vm_area_struct, vm_rb);
- if (right->rb_subtree_gap >= length) {
- vma = right;
- continue;
- }
- }
-
-check_current:
- /* Check if current node has a suitable gap */
- gap_end = vm_start_gap(vma);
- if (gap_end < low_limit)
- return -ENOMEM;
- if (gap_start <= high_limit &&
- gap_end > gap_start && gap_end - gap_start >= length)
- goto found;
-
- /* Visit left subtree if it looks promising */
- if (vma->vm_rb.rb_left) {
- struct vm_area_struct *left =
- rb_entry(vma->vm_rb.rb_left,
- struct vm_area_struct, vm_rb);
- if (left->rb_subtree_gap >= length) {
- vma = left;
- continue;
- }
- }
-
- /* Go back up the rbtree to find next candidate node */
- while (true) {
- struct rb_node *prev = &vma->vm_rb;
- if (!rb_parent(prev))
- return -ENOMEM;
- vma = rb_entry(rb_parent(prev),
- struct vm_area_struct, vm_rb);
- if (prev == vma->vm_rb.rb_right) {
- gap_start = vma->vm_prev ?
- vm_end_gap(vma->vm_prev) : 0;
- goto check_current;
- }
- }
- }
-
-found:
- /* We found a suitable gap. Clip it with the original high_limit. */
- if (gap_end > info->high_limit)
- gap_end = info->high_limit;
-
-found_highest:
- /* Compute highest gap address at the desired alignment */
- gap_end -= info->length;
- gap_end -= (gap_end - info->align_offset) & info->align_mask;
-
- VM_BUG_ON(gap_end < info->low_limit);
- VM_BUG_ON(gap_end < gap_start);
- return gap_end;
+ gap = mas.last + 1 - info->length;
+ gap -= (gap - info->align_offset) & info->align_mask;
+ return gap;
}
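Both helpers are normally reached through vm_unmapped_area(), which selects the bottom-up or top-down search based on info->flags. A rough sketch of how an arch_get_unmapped_area()-style caller fills the descriptor; the limits below are illustrative assumptions, not taken from this patch:

	struct vm_unmapped_area_info info;
	unsigned long addr;

	info.flags = 0;			/* or VM_UNMAPPED_AREA_TOPDOWN */
	info.length = len;
	info.low_limit = current->mm->mmap_base;
	info.high_limit = TASK_SIZE;
	info.align_mask = 0;		/* page alignment only */
	info.align_offset = 0;
	addr = vm_unmapped_area(&info);
	if (offset_in_page(addr))	/* errors come back page-unaligned */
		return addr;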
/*
@@ -2249,58 +1795,67 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
EXPORT_SYMBOL(get_unmapped_area);
-/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+/**
+ * find_vma_intersection() - Look up the first VMA which intersects the interval
+ * @mm: The process address space.
+ * @start_addr: The inclusive start user address.
+ * @end_addr: The exclusive end user address.
+ *
+ * Returns: The first VMA within the provided range, %NULL otherwise. Assumes
+ * start_addr < end_addr.
+ */
+struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+ unsigned long start_addr,
+ unsigned long end_addr)
{
- struct rb_node *rb_node;
- struct vm_area_struct *vma;
+ unsigned long index = start_addr;
mmap_assert_locked(mm);
- /* Check the cache first. */
- vma = vmacache_find(mm, addr);
- if (likely(vma))
- return vma;
-
- rb_node = mm->mm_rb.rb_node;
-
- while (rb_node) {
- struct vm_area_struct *tmp;
-
- tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+ return mt_find(&mm->mm_mt, &index, end_addr - 1);
+}
+EXPORT_SYMBOL(find_vma_intersection);
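With the maple tree, the intersection test reduces to a single mt_find() over the half-open range. insert_vm_struct() later in this patch uses it exactly this way to reject an overlapping insert; condensed here for illustration:

	/* Fail if any existing VMA overlaps [vma->vm_start, vma->vm_end). */
	if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
		return -ENOMEM;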
- if (tmp->vm_end > addr) {
- vma = tmp;
- if (tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
- }
+/**
+ * find_vma() - Find the VMA for a given address, or the next VMA.
+ * @mm: The mm_struct to check
+ * @addr: The address
+ *
+ * Returns: The VMA associated with addr, or the next VMA.
+ * May return %NULL in the case of no VMA at addr or above.
+ */
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long index = addr;
- if (vma)
- vmacache_update(addr, vma);
- return vma;
+ mmap_assert_locked(mm);
+ return mt_find(&mm->mm_mt, &index, ULONG_MAX);
}
-
EXPORT_SYMBOL(find_vma);
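find_vma() keeps its historical "containing or next" semantics, so callers that need the VMA actually covering an address still have to check vm_start themselves. A minimal caller-side sketch, shown only for illustration:

	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = find_vma(mm, addr);
	if (vma && vma->vm_start <= addr) {
		/* addr lies inside this VMA */
	} else {
		/* addr is in a hole, or above the last VMA when vma == NULL */
	}
	mmap_read_unlock(mm);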
-/*
- * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
+/**
+ * find_vma_prev() - Find the VMA for a given address, or the next vma and
+ * set %pprev to the previous VMA, if any.
+ * @mm: The mm_struct to check
+ * @addr: The address
+ * @pprev: The pointer to set to the previous VMA
+ *
+ * Note that the RCU lock is not needed here since the external mmap_lock is
+ * used instead.
+ *
+ * Returns: The VMA associated with @addr, or the next vma.
+ * May return %NULL in the case of no vma at addr or above.
*/
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
struct vm_area_struct *vma;
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
- vma = find_vma(mm, addr);
- if (vma) {
- *pprev = vma->vm_prev;
- } else {
- struct rb_node *rb_node = rb_last(&mm->mm_rb);
-
- *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
- }
+ vma = mas_walk(&mas);
+ *pprev = mas_prev(&mas, 0);
+ if (!vma)
+ vma = mas_next(&mas, ULONG_MAX);
return vma;
}
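Since vm_prev no longer exists, callers that used to follow the linked list now get the predecessor from find_vma_prev() (or walk a maple state as the body above does). A small usage sketch, illustrative only:

	struct vm_area_struct *vma, *prev;

	mmap_read_lock(mm);
	vma = find_vma_prev(mm, addr, &prev);
	if (!vma) {
		/* addr is above every mapping; prev is the highest VMA, if any */
	} else if (prev) {
		/* prev->vm_end <= vma->vm_start holds for the returned pair */
	}
	mmap_read_unlock(mm);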
@@ -2354,6 +1909,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
struct vm_area_struct *next;
unsigned long gap_addr;
int error = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
@@ -2371,16 +1927,21 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (gap_addr < address || gap_addr > TASK_SIZE)
gap_addr = TASK_SIZE;
- next = vma->vm_next;
- if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
+ next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+ if (next && vma_is_accessible(next)) {
if (!(next->vm_flags & VM_GROWSUP))
return -ENOMEM;
/* Check that both stack segments have the same anon_vma? */
}
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+
/* We must make sure the anon_vma is allocated. */
- if (unlikely(anon_vma_prepare(vma)))
+ if (unlikely(anon_vma_prepare(vma))) {
+ mas_destroy(&mas);
return -ENOMEM;
+ }
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -2401,15 +1962,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
error = acct_stack_growth(vma, size, grow);
if (!error) {
/*
- * vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_lock
- * lock here, so we need to protect against
- * concurrent vma expansions.
- * anon_vma_lock_write() doesn't help here, as
- * we don't guarantee that all growable vmas
- * in a mm share the same root anon vma.
- * So, we reuse mm->page_table_lock to guard
- * against concurrent vma expansions.
+ * We only hold a shared mmap_lock lock here, so
+ * we need to protect against concurrent vma
+ * expansions. anon_vma_lock_write() doesn't
+ * help here, as we don't guarantee that all
+ * growable vmas in a mm share the same root
+ * anon vma. So, we reuse mm->page_table_lock
+ * to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
@@ -2417,11 +1976,9 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
+ /* Overwrite old entry in mtree. */
+ vma_mas_store(vma, &mas);
anon_vma_interval_tree_post_update_vma(vma);
- if (vma->vm_next)
- vma_gap_update(vma->vm_next);
- else
- mm->highest_vm_end = vm_end_gap(vma);
spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
@@ -2430,7 +1987,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma(vma, vma->vm_flags);
- validate_mm(mm);
+ mas_destroy(&mas);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
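expand_upwards() and expand_downwards() now share the same three-step maple-tree write pattern: preallocate nodes before the critical section, overwrite the entry, then drop any unused preallocations. A stripped-down sketch of the pattern; locking and accounting are elided and the range is only indicative:

	MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);

	if (mas_preallocate(&mas, vma, GFP_KERNEL))
		return -ENOMEM;			/* allocate tree nodes up front */

	/* ... grow vma->vm_start or vma->vm_end under the proper locks ... */

	vma_mas_store(vma, &mas);		/* overwrite the old entry in the tree */
	mas_destroy(&mas);			/* free whatever preallocation was unused */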
@@ -2438,10 +1995,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
*/
-int expand_downwards(struct vm_area_struct *vma,
- unsigned long address)
+int expand_downwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
struct vm_area_struct *prev;
int error = 0;
@@ -2450,7 +2007,7 @@ int expand_downwards(struct vm_area_struct *vma,
return -EPERM;
/* Enforce stack_guard_gap */
- prev = vma->vm_prev;
+ prev = mas_prev(&mas, 0);
/* Check that both stack segments have the same anon_vma? */
if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
vma_is_accessible(prev)) {
@@ -2458,9 +2015,14 @@ int expand_downwards(struct vm_area_struct *vma,
return -ENOMEM;
}
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+
/* We must make sure the anon_vma is allocated. */
- if (unlikely(anon_vma_prepare(vma)))
+ if (unlikely(anon_vma_prepare(vma))) {
+ mas_destroy(&mas);
return -ENOMEM;
+ }
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -2481,15 +2043,13 @@ int expand_downwards(struct vm_area_struct *vma,
error = acct_stack_growth(vma, size, grow);
if (!error) {
/*
- * vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_lock
- * lock here, so we need to protect against
- * concurrent vma expansions.
- * anon_vma_lock_write() doesn't help here, as
- * we don't guarantee that all growable vmas
- * in a mm share the same root anon vma.
- * So, we reuse mm->page_table_lock to guard
- * against concurrent vma expansions.
+ * We only hold a shared mmap_lock lock here, so
+ * we need to protect against concurrent vma
+ * expansions. anon_vma_lock_write() doesn't
+ * help here, as we don't guarantee that all
+ * growable vmas in a mm share the same root
+ * anon vma. So, we reuse mm->page_table_lock
+ * to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
@@ -2498,8 +2058,9 @@ int expand_downwards(struct vm_area_struct *vma,
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
+ /* Overwrite old entry in mtree. */
+ vma_mas_store(vma, &mas);
anon_vma_interval_tree_post_update_vma(vma);
- vma_gap_update(vma);
spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
@@ -2508,7 +2069,7 @@ int expand_downwards(struct vm_area_struct *vma,
}
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma(vma, vma->vm_flags);
- validate_mm(mm);
+ mas_destroy(&mas);
return error;
}
@@ -2581,25 +2142,26 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
EXPORT_SYMBOL_GPL(find_extend_vma);
/*
- * Ok - we have the memory areas we should free on the vma list,
- * so release them, and do the vma updates.
+ * Ok - we have the memory areas we should free in a maple tree, so release them,
+ * and do the vma updates.
*
* Called with the mm semaphore held.
*/
-static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
{
unsigned long nr_accounted = 0;
+ struct vm_area_struct *vma;
/* Update high watermark before we lower total_vm */
update_hiwater_vm(mm);
- do {
+ mas_for_each(mas, vma, ULONG_MAX) {
long nrpages = vma_pages(vma);
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += nrpages;
vm_stat_account(mm, vma->vm_flags, -nrpages);
- vma = remove_vma(vma);
- } while (vma);
+ remove_vma(vma);
+ }
vm_unacct_memory(nr_accounted);
validate_mm(mm);
}
@@ -2609,67 +2171,23 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
*
* Called with the mm semaphore held.
*/
-static void unmap_region(struct mm_struct *mm,
+static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
struct vm_area_struct *vma, struct vm_area_struct *prev,
+ struct vm_area_struct *next,
unsigned long start, unsigned long end)
{
- struct vm_area_struct *next = vma_next(mm, prev);
struct mmu_gather tlb;
lru_add_drain();
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
- unmap_vmas(&tlb, vma, start, end);
- free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+ unmap_vmas(&tlb, mt, vma, start, end);
+ free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
}
/*
- * Create a list of vma's touched by the unmap, removing them from the mm's
- * vma list as we go..
- */
-static bool
-detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, unsigned long end)
-{
- struct vm_area_struct **insertion_point;
- struct vm_area_struct *tail_vma = NULL;
-
- insertion_point = (prev ? &prev->vm_next : &mm->mmap);
- vma->vm_prev = NULL;
- do {
- vma_rb_erase(vma, &mm->mm_rb);
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm -= vma_pages(vma);
- mm->map_count--;
- tail_vma = vma;
- vma = vma->vm_next;
- } while (vma && vma->vm_start < end);
- *insertion_point = vma;
- if (vma) {
- vma->vm_prev = prev;
- vma_gap_update(vma);
- } else
- mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
- tail_vma->vm_next = NULL;
-
- /* Kill the cache */
- vmacache_invalidate(mm);
-
- /*
- * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
- * VM_GROWSUP VMA. Such VMAs can change their size under
- * down_read(mmap_lock) and collide with the VMA we are about to unmap.
- */
- if (vma && (vma->vm_flags & VM_GROWSDOWN))
- return false;
- if (prev && (prev->vm_flags & VM_GROWSUP))
- return false;
- return true;
-}
-
-/*
* __split_vma() bypasses sysctl_max_map_count checking. We use this where it
* has already been checked or doesn't make sense to fail.
*/
@@ -2678,6 +2196,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct vm_area_struct *new;
int err;
+ validate_mm_mt(mm);
if (vma->vm_ops && vma->vm_ops->may_split) {
err = vma->vm_ops->may_split(vma, addr);
@@ -2720,6 +2239,9 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (!err)
return 0;
+ /* Avoid vm accounting in close() operation */
+ new->vm_start = new->vm_end;
+ new->vm_pgoff = 0;
/* Clean everything up if vma_adjust failed. */
if (new->vm_ops && new->vm_ops->close)
new->vm_ops->close(new);
@@ -2730,6 +2252,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
mpol_put(vma_policy(new));
out_free_vma:
vm_area_free(new);
+ validate_mm_mt(mm);
return err;
}
@@ -2746,38 +2269,48 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
return __split_vma(mm, vma, addr, new_below);
}
-/* Munmap is split into 2 main parts -- this part which finds
- * what needs doing, and the areas themselves, which do the
- * work. This now handles partial unmappings.
- * Jeremy Fitzhardinge <jeremy@goop.org>
- */
-int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
- struct list_head *uf, bool downgrade)
+static inline int munmap_sidetree(struct vm_area_struct *vma,
+ struct ma_state *mas_detach)
{
- unsigned long end;
- struct vm_area_struct *vma, *prev, *last;
-
- if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
- return -EINVAL;
+ mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1);
+ if (mas_store_gfp(mas_detach, vma, GFP_KERNEL))
+ return -ENOMEM;
- len = PAGE_ALIGN(len);
- end = start + len;
- if (len == 0)
- return -EINVAL;
+ if (vma->vm_flags & VM_LOCKED)
+ vma->vm_mm->locked_vm -= vma_pages(vma);
- /*
- * arch_unmap() might do unmaps itself. It must be called
- * and finish any rbtree manipulation before this code
- * runs and also starts to manipulate the rbtree.
- */
- arch_unmap(mm, start, end);
+ return 0;
+}
- /* Find the first overlapping VMA where start < vma->vm_end */
- vma = find_vma_intersection(mm, start, end);
- if (!vma)
- return 0;
- prev = vma->vm_prev;
+/*
+ * do_mas_align_munmap() - munmap the aligned region from @start to @end.
+ * @mas: The maple_state, ideally set up to alter the correct tree location.
+ * @vma: The starting vm_area_struct
+ * @mm: The mm_struct
+ * @start: The aligned start address to munmap.
+ * @end: The aligned end address to munmap.
+ * @uf: The userfaultfd list_head
+ * @downgrade: Set to true to attempt a write downgrade of the mmap_lock
+ *
+ * If @downgrade is true, check return code for potential release of the lock.
+ */
+static int
+do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct list_head *uf, bool downgrade)
+{
+ struct vm_area_struct *prev, *next = NULL;
+ struct maple_tree mt_detach;
+ int count = 0;
+ int error = -ENOMEM;
+ MA_STATE(mas_detach, &mt_detach, 0, 0);
+ mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
+ mt_set_external_lock(&mt_detach, &mm->mmap_lock);
+
+ if (mas_preallocate(mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+ mas->last = end - 1;
/*
* If we need to split any vma, do it now to save pain later.
*
@@ -2785,8 +2318,9 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
* unmapped vm_area_struct will remain in use: so lower split_vma
* places tmp vma above, and higher split_vma places tmp vma below.
*/
+
+ /* Does it split the first one? */
if (start > vma->vm_start) {
- int error;
/*
* Make sure that map_count on return from munmap() will
@@ -2794,22 +2328,61 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
* its limit temporarily, to help free resources as expected.
*/
if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
- return -ENOMEM;
+ goto map_count_exceeded;
+ /*
+ * mas_pause() is not needed since mas->index needs to be set
+ * differently than vma->vm_end anyways.
+	 * differently from vma->vm_end anyway.
error = __split_vma(mm, vma, start, 0);
if (error)
- return error;
- prev = vma;
+ goto start_split_failed;
+
+ mas_set(mas, start);
+ vma = mas_walk(mas);
}
- /* Does it split the last one? */
- last = find_vma(mm, end);
- if (last && end > last->vm_start) {
- int error = __split_vma(mm, last, end, 1);
+ prev = mas_prev(mas, 0);
+ if (unlikely((!prev)))
+ mas_set(mas, start);
+
+ /*
+ * Detach a range of VMAs from the mm. Using next as a temp variable as
+ * it is always overwritten.
+ */
+ mas_for_each(mas, next, end - 1) {
+ /* Does it split the end? */
+ if (next->vm_end > end) {
+ struct vm_area_struct *split;
+
+ error = __split_vma(mm, next, end, 1);
+ if (error)
+ goto end_split_failed;
+
+ mas_set(mas, end);
+ split = mas_prev(mas, 0);
+ error = munmap_sidetree(split, &mas_detach);
+ if (error)
+ goto munmap_sidetree_failed;
+
+ count++;
+ if (vma == next)
+ vma = split;
+ break;
+ }
+ error = munmap_sidetree(next, &mas_detach);
if (error)
- return error;
+ goto munmap_sidetree_failed;
+
+ count++;
+#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
+ BUG_ON(next->vm_start < start);
+ BUG_ON(next->vm_start > end);
+#endif
}
- vma = vma_next(mm, prev);
+
+ if (!next)
+ next = mas_next(mas, ULONG_MAX);
if (unlikely(uf)) {
/*
@@ -2821,30 +2394,372 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
* split, despite we could. This is unlikely enough
* failure that it's not worth optimizing it for.
*/
- int error = userfaultfd_unmap_prep(vma, start, end, uf);
+ error = userfaultfd_unmap_prep(mm, start, end, uf);
+
if (error)
- return error;
+ goto userfaultfd_error;
+ }
+
+ /* Point of no return */
+ mas_set_range(mas, start, end - 1);
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+ /* Make sure no VMAs are about to be lost. */
+ {
+ MA_STATE(test, &mt_detach, start, end - 1);
+ struct vm_area_struct *vma_mas, *vma_test;
+ int test_count = 0;
+
+ rcu_read_lock();
+ vma_test = mas_find(&test, end - 1);
+ mas_for_each(mas, vma_mas, end - 1) {
+ BUG_ON(vma_mas != vma_test);
+ test_count++;
+ vma_test = mas_next(&test, end - 1);
+ }
+ rcu_read_unlock();
+ BUG_ON(count != test_count);
+ mas_set_range(mas, start, end - 1);
+ }
+#endif
+ mas_store_prealloc(mas, NULL);
+ mm->map_count -= count;
+ /*
+ * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
+ * VM_GROWSUP VMA. Such VMAs can change their size under
+ * down_read(mmap_lock) and collide with the VMA we are about to unmap.
+ */
+ if (downgrade) {
+ if (next && (next->vm_flags & VM_GROWSDOWN))
+ downgrade = false;
+ else if (prev && (prev->vm_flags & VM_GROWSUP))
+ downgrade = false;
+ else
+ mmap_write_downgrade(mm);
}
- /* Detach vmas from rbtree */
- if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
- downgrade = false;
+ unmap_region(mm, &mt_detach, vma, prev, next, start, end);
+ /* Statistics and freeing VMAs */
+ mas_set(&mas_detach, start);
+ remove_mt(mm, &mas_detach);
+ __mt_destroy(&mt_detach);
- if (downgrade)
- mmap_write_downgrade(mm);
- unmap_region(mm, vma, prev, start, end);
+ validate_mm(mm);
+ return downgrade ? 1 : 0;
- /* Fix up all other VM information */
- remove_vma_list(mm, vma);
+userfaultfd_error:
+munmap_sidetree_failed:
+end_split_failed:
+ __mt_destroy(&mt_detach);
+start_split_failed:
+map_count_exceeded:
+ mas_destroy(mas);
+ return error;
+}
- return downgrade ? 1 : 0;
+/*
+ * do_mas_munmap() - munmap a given range.
+ * @mas: The maple state
+ * @mm: The mm_struct
+ * @start: The start address to munmap
+ * @len: The length of the range to munmap
+ * @uf: The userfaultfd list_head
+ * @downgrade: set to true if the user wants to attempt to write_downgrade the
+ * mmap_lock
+ *
+ * This function takes a @mas that is either pointing to the previous VMA or set
+ * to MA_START and sets it up to remove the mapping(s). The @len will be
+ * aligned and any arch_unmap work will be performed.
+ *
+ * Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise.
+ */
+int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm,
+ unsigned long start, size_t len, struct list_head *uf,
+ bool downgrade)
+{
+ unsigned long end;
+ struct vm_area_struct *vma;
+
+ if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
+ return -EINVAL;
+
+ end = start + PAGE_ALIGN(len);
+ if (end == start)
+ return -EINVAL;
+
+ /* arch_unmap() might do unmaps itself. */
+ arch_unmap(mm, start, end);
+
+ /* Find the first overlapping VMA */
+ vma = mas_find(mas, end - 1);
+ if (!vma)
+ return 0;
+
+ return do_mas_align_munmap(mas, vma, mm, start, end, uf, downgrade);
}
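Internally, do_mas_align_munmap() parks the detached VMAs in a second, on-stack maple tree and only then clears the whole range in mm->mm_mt with one store, so the externally visible tree is updated exactly once. A condensed sketch of that flow; VMA splitting, statistics and error handling are omitted:

	struct maple_tree mt_detach;
	MA_STATE(mas_detach, &mt_detach, 0, 0);

	mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
	mt_set_external_lock(&mt_detach, &mm->mmap_lock);

	/* 1. Move every affected VMA into the side tree. */
	mas_for_each(mas, next, end - 1) {
		mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1);
		mas_store_gfp(&mas_detach, next, GFP_KERNEL);
	}

	/* 2. One write wipes [start, end) out of the real tree. */
	mas_set_range(mas, start, end - 1);
	mas_store_prealloc(mas, NULL);

	/* 3. Tear down page tables and free the detached VMAs. */
	unmap_region(mm, &mt_detach, vma, prev, next, start, end);
	mas_set(&mas_detach, start);
	remove_mt(mm, &mas_detach);
	__mt_destroy(&mt_detach);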
+/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls.
+ * @mm: The mm_struct
+ * @start: The start address to munmap
+ * @len: The length to be munmapped.
+ * @uf: The userfaultfd list_head
+ */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
struct list_head *uf)
{
- return __do_munmap(mm, start, len, uf, false);
+ MA_STATE(mas, &mm->mm_mt, start, start);
+
+ return do_mas_munmap(&mas, mm, start, len, uf, false);
+}
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+ struct list_head *uf)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
+ struct vm_area_struct *next, *prev, *merge;
+ pgoff_t pglen = len >> PAGE_SHIFT;
+ unsigned long charged = 0;
+ unsigned long end = addr + len;
+ unsigned long merge_start = addr, merge_end = end;
+ pgoff_t vm_pgoff;
+ int error;
+ MA_STATE(mas, &mm->mm_mt, addr, end - 1);
+
+ /* Check against address space limit. */
+ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
+ unsigned long nr_pages;
+
+ /*
+ * MAP_FIXED may remove pages of mappings that intersects with
+ * requested mapping. Account for the pages it would unmap.
+ */
+ nr_pages = count_vma_pages_range(mm, addr, end);
+
+ if (!may_expand_vm(mm, vm_flags,
+ (len >> PAGE_SHIFT) - nr_pages))
+ return -ENOMEM;
+ }
+
+ /* Unmap any existing mapping in the area */
+ if (do_mas_munmap(&mas, mm, addr, len, uf, false))
+ return -ENOMEM;
+
+ /*
+ * Private writable mapping: check memory availability
+ */
+ if (accountable_mapping(file, vm_flags)) {
+ charged = len >> PAGE_SHIFT;
+ if (security_vm_enough_memory_mm(mm, charged))
+ return -ENOMEM;
+ vm_flags |= VM_ACCOUNT;
+ }
+
+ next = mas_next(&mas, ULONG_MAX);
+ prev = mas_prev(&mas, 0);
+ if (vm_flags & VM_SPECIAL)
+ goto cannot_expand;
+
+ /* Attempt to expand an old mapping */
+ /* Check next */
+ if (next && next->vm_start == end && !vma_policy(next) &&
+ can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
+ NULL_VM_UFFD_CTX, NULL)) {
+ merge_end = next->vm_end;
+ vma = next;
+ vm_pgoff = next->vm_pgoff - pglen;
+ }
+
+ /* Check prev */
+ if (prev && prev->vm_end == addr && !vma_policy(prev) &&
+ (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
+ pgoff, vma->vm_userfaultfd_ctx, NULL) :
+ can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
+ NULL_VM_UFFD_CTX, NULL))) {
+ merge_start = prev->vm_start;
+ vma = prev;
+ vm_pgoff = prev->vm_pgoff;
+ }
+
+
+ /* Actually expand, if possible */
+ if (vma &&
+ !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) {
+ khugepaged_enter_vma(vma, vm_flags);
+ goto expanded;
+ }
+
+ mas.index = addr;
+ mas.last = end - 1;
+cannot_expand:
+ /*
+ * Determine the object being mapped and call the appropriate
+ * specific mapper. the address has already been validated, but
+ * not unmapped, but the maps are removed from the list.
+ */
+ vma = vm_area_alloc(mm);
+ if (!vma) {
+ error = -ENOMEM;
+ goto unacct_error;
+ }
+
+ vma->vm_start = addr;
+ vma->vm_end = end;
+ vma->vm_flags = vm_flags;
+ vma->vm_page_prot = vm_get_page_prot(vm_flags);
+ vma->vm_pgoff = pgoff;
+
+ if (file) {
+ if (vm_flags & VM_SHARED) {
+ error = mapping_map_writable(file->f_mapping);
+ if (error)
+ goto free_vma;
+ }
+
+ vma->vm_file = get_file(file);
+ error = call_mmap(file, vma);
+ if (error)
+ goto unmap_and_free_vma;
+
+ /*
+ * Expansion is handled above, merging is handled below.
+ * Drivers should not alter the address of the VMA.
+ */
+ if (WARN_ON((addr != vma->vm_start))) {
+ error = -EINVAL;
+ goto close_and_free_vma;
+ }
+ mas_reset(&mas);
+
+ /*
+ * If vm_flags changed after call_mmap(), we should try merge
+ * vma again as we may succeed this time.
+ */
+ if (unlikely(vm_flags != vma->vm_flags && prev)) {
+ merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
+ NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
+ if (merge) {
+ /*
+ * ->mmap() can change vma->vm_file and fput
+ * the original file. So fput the vma->vm_file
+ * here or we would add an extra fput for file
+ * and cause general protection fault
+ * ultimately.
+ */
+ fput(vma->vm_file);
+ vm_area_free(vma);
+ vma = merge;
+ /* Update vm_flags to pick up the change. */
+ vm_flags = vma->vm_flags;
+ goto unmap_writable;
+ }
+ }
+
+ vm_flags = vma->vm_flags;
+ } else if (vm_flags & VM_SHARED) {
+ error = shmem_zero_setup(vma);
+ if (error)
+ goto free_vma;
+ } else {
+ vma_set_anonymous(vma);
+ }
+
+ /* Allow architectures to sanity-check the vm_flags */
+ if (!arch_validate_flags(vma->vm_flags)) {
+ error = -EINVAL;
+ if (file)
+ goto close_and_free_vma;
+ else if (vma->vm_file)
+ goto unmap_and_free_vma;
+ else
+ goto free_vma;
+ }
+
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ error = -ENOMEM;
+ if (file)
+ goto close_and_free_vma;
+ else if (vma->vm_file)
+ goto unmap_and_free_vma;
+ else
+ goto free_vma;
+ }
+
+ if (vma->vm_file)
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+
+ vma_mas_store(vma, &mas);
+ mm->map_count++;
+ if (vma->vm_file) {
+ if (vma->vm_flags & VM_SHARED)
+ mapping_allow_writable(vma->vm_file->f_mapping);
+
+ flush_dcache_mmap_lock(vma->vm_file->f_mapping);
+ vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap);
+ flush_dcache_mmap_unlock(vma->vm_file->f_mapping);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ }
+
+ /*
+	 * vma_merge() calls khugepaged_enter_vma() too; the call below
+	 * covers the non-merge case.
+ */
+ khugepaged_enter_vma(vma, vma->vm_flags);
+
+ /* Once vma denies write, undo our temporary denial count */
+unmap_writable:
+ if (file && vm_flags & VM_SHARED)
+ mapping_unmap_writable(file->f_mapping);
+ file = vma->vm_file;
+expanded:
+ perf_event_mmap(vma);
+
+ vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
+ if (vm_flags & VM_LOCKED) {
+ if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
+ is_vm_hugetlb_page(vma) ||
+ vma == get_gate_vma(current->mm))
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ else
+ mm->locked_vm += (len >> PAGE_SHIFT);
+ }
+
+ if (file)
+ uprobe_mmap(vma);
+
+ /*
+ * New (or expanded) vma always get soft dirty status.
+ * Otherwise user-space soft-dirty page tracker won't
+ * be able to distinguish situation when vma area unmapped,
+ * then new mapped in-place (which must be aimed as
+ * a completely new data area).
+ */
+ vma->vm_flags |= VM_SOFTDIRTY;
+
+ vma_set_page_prot(vma);
+
+ validate_mm(mm);
+ return addr;
+
+close_and_free_vma:
+ if (vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
+unmap_and_free_vma:
+ fput(vma->vm_file);
+ vma->vm_file = NULL;
+
+ /* Undo any partial mapping done by a device driver. */
+ unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end);
+ if (file && (vm_flags & VM_SHARED))
+ mapping_unmap_writable(file->f_mapping);
+free_vma:
+ vm_area_free(vma);
+unacct_error:
+ if (charged)
+ vm_unacct_memory(charged);
+ validate_mm(mm);
+ return error;
}
static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
@@ -2852,11 +2767,12 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
int ret;
struct mm_struct *mm = current->mm;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, start, start);
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = __do_munmap(mm, start, len, &uf, downgrade);
+ ret = do_mas_munmap(&mas, mm, start, len, &uf, downgrade);
/*
* Returning 1 indicates mmap_lock is downgraded.
* But 1 is not legal return value of vm_munmap() and munmap(), reset
@@ -2922,11 +2838,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
goto out;
if (start + size > vma->vm_end) {
- struct vm_area_struct *next;
+ VMA_ITERATOR(vmi, mm, vma->vm_end);
+ struct vm_area_struct *next, *prev = vma;
- for (next = vma->vm_next; next; next = next->vm_next) {
+ for_each_vma_range(vmi, next, start + size) {
/* hole between vmas ? */
- if (next->vm_start != next->vm_prev->vm_end)
+ if (next->vm_start != prev->vm_end)
goto out;
if (next->vm_file != vma->vm_file)
@@ -2937,6 +2854,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (start + size <= next->vm_end)
break;
+
+ prev = next;
}
if (!next)
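The loop above is written with the new VMA_ITERATOR/for_each_vma_range() helpers, which replace the vm_next chain and make the caller carry the previous VMA itself. A generic sketch of the idiom; do_work() is a placeholder:

	VMA_ITERATOR(vmi, mm, start);
	struct vm_area_struct *vma, *prev = NULL;

	for_each_vma_range(vmi, vma, end) {
		if (prev && vma->vm_start != prev->vm_end)
			break;		/* hole between VMAs */
		do_work(vma);
		prev = vma;
	}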
@@ -2966,37 +2885,53 @@ out:
}
/*
- * this is really a simplified "do_mmap". it only handles
- * anonymous maps. eventually we may be able to do some
- * brk-specific accounting here.
+ * do_brk_munmap() - Unmap a partial vma.
+ * @mas: The maple tree state.
+ * @vma: The vma to be modified
+ * @newbrk: the start of the address to unmap
+ * @oldbrk: The end of the address to unmap
+ * @uf: The userfaultfd list_head
+ *
+ * Returns: 1 on success.
+ * Unmaps a partial VMA mapping.  Does not handle alignment; downgrades the
+ * lock if possible.
*/
-static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long newbrk, unsigned long oldbrk,
+ struct list_head *uf)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
- struct rb_node **rb_link, *rb_parent;
- pgoff_t pgoff = addr >> PAGE_SHIFT;
- int error;
- unsigned long mapped_addr;
-
- /* Until we need other flags, refuse anything except VM_EXEC. */
- if ((flags & (~VM_EXEC)) != 0)
- return -EINVAL;
- flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
-
- mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
- if (IS_ERR_VALUE(mapped_addr))
- return mapped_addr;
+ struct mm_struct *mm = vma->vm_mm;
+ int ret;
- error = mlock_future_check(mm, mm->def_flags, len);
- if (error)
- return error;
+ arch_unmap(mm, newbrk, oldbrk);
+ ret = do_mas_align_munmap(mas, vma, mm, newbrk, oldbrk, uf, true);
+ validate_mm_mt(mm);
+ return ret;
+}
- /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
- if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
- return -ENOMEM;
+/*
+ * do_brk_flags() - Increase the brk vma if the flags match.
+ * @mas: The maple tree state.
+ * @addr: The start address
+ * @len: The length of the increase
+ * @vma: The vma
+ * @flags: The VMA Flags
+ *
+ * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
+ * do not match then create a new anonymous VMA. Eventually we may be able to
+ * do some brk-specific accounting here.
+ */
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long len, unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
- /* Check against address space limits *after* clearing old maps... */
+ validate_mm_mt(mm);
+ /*
+ * Check against address space limits by the changed size
+ * Note: This happens *after* clearing old mappings in some code paths.
+ */
+ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
return -ENOMEM;
@@ -3006,28 +2941,50 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
- /* Can we just expand an old private anonymous mapping? */
- vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
- if (vma)
- goto out;
-
/*
- * create a vma struct for an anonymous mapping
+	 * Expand the existing vma if possible; note that singular lists do not
+ * occur after forking, so the expand will only happen on new VMAs.
*/
- vma = vm_area_alloc(mm);
- if (!vma) {
- vm_unacct_memory(len >> PAGE_SHIFT);
- return -ENOMEM;
+ if (vma && vma->vm_end == addr && !vma_policy(vma) &&
+ can_vma_merge_after(vma, flags, NULL, NULL,
+ addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
+ mas_set_range(mas, vma->vm_start, addr + len - 1);
+ if (mas_preallocate(mas, vma, GFP_KERNEL))
+ goto unacct_fail;
+
+ vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
+ if (vma->anon_vma) {
+ anon_vma_lock_write(vma->anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ }
+ vma->vm_end = addr + len;
+ vma->vm_flags |= VM_SOFTDIRTY;
+ mas_store_prealloc(mas, vma);
+
+ if (vma->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
+ }
+ khugepaged_enter_vma(vma, flags);
+ goto out;
}
+ /* create a vma struct for an anonymous mapping */
+ vma = vm_area_alloc(mm);
+ if (!vma)
+ goto unacct_fail;
+
vma_set_anonymous(vma);
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_pgoff = pgoff;
+ vma->vm_pgoff = addr >> PAGE_SHIFT;
vma->vm_flags = flags;
vma->vm_page_prot = vm_get_page_prot(flags);
- vma_link(mm, vma, prev, rb_link, rb_parent);
+ mas_set_range(mas, vma->vm_start, addr + len - 1);
+ if (mas_store_gfp(mas, vma, GFP_KERNEL))
+ goto mas_store_fail;
+
+ mm->map_count++;
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
@@ -3035,16 +2992,25 @@ out:
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
vma->vm_flags |= VM_SOFTDIRTY;
+ validate_mm(mm);
return 0;
+
+mas_store_fail:
+ vm_area_free(vma);
+unacct_fail:
+ vm_unacct_memory(len >> PAGE_SHIFT);
+ return -ENOMEM;
}
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
unsigned long len;
int ret;
bool populate;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
len = PAGE_ALIGN(request);
if (len < request)
@@ -3055,13 +3021,31 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = do_brk_flags(addr, len, flags, &uf);
+ /* Until we need other flags, refuse anything except VM_EXEC. */
+	if ((flags & (~VM_EXEC)) != 0) {
+		ret = -EINVAL;
+		goto limits_failed;	/* do not return with mmap_lock held */
+	}
+
+ ret = check_brk_limits(addr, len);
+ if (ret)
+ goto limits_failed;
+
+ ret = do_mas_munmap(&mas, mm, addr, len, &uf, 0);
+ if (ret)
+ goto munmap_failed;
+
+ vma = mas_prev(&mas, 0);
+ ret = do_brk_flags(&mas, vma, addr, len, flags);
populate = ((mm->def_flags & VM_LOCKED) != 0);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate && !ret)
mm_populate(addr, len);
return ret;
+
+munmap_failed:
+limits_failed:
+ mmap_write_unlock(mm);
+ return ret;
}
EXPORT_SYMBOL(vm_brk_flags);
@@ -3077,34 +3061,19 @@ void exit_mmap(struct mm_struct *mm)
struct mmu_gather tlb;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ int count = 0;
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);
- if (unlikely(mm_is_oom_victim(mm))) {
- /*
- * Manually reap the mm to free as much memory as possible.
- * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
- * this mm from further consideration. Taking mm->mmap_lock for
- * write after setting MMF_OOM_SKIP will guarantee that the oom
- * reaper will not run on this mm again after mmap_lock is
- * dropped.
- *
- * Nothing can be holding mm->mmap_lock here and the above call
- * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
- * __oom_reap_task_mm() will not block.
- */
- (void)__oom_reap_task_mm(mm);
- set_bit(MMF_OOM_SKIP, &mm->flags);
- }
-
- mmap_write_lock(mm);
+ mmap_read_lock(mm);
arch_exit_mmap(mm);
- vma = mm->mmap;
+ vma = mas_find(&mas, ULONG_MAX);
if (!vma) {
/* Can happen if dup_mmap() received an OOM */
- mmap_write_unlock(mm);
+ mmap_read_unlock(mm);
return;
}
@@ -3112,19 +3081,37 @@ void exit_mmap(struct mm_struct *mm)
flush_cache_mm(mm);
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
- /* Use -1 here to ensure all VMAs in the mm are unmapped */
- unmap_vmas(&tlb, vma, 0, -1);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
+ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
+ unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX);
+ mmap_read_unlock(mm);
+
+ /*
+ * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
+ * because the memory has been already freed.
+ */
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ mmap_write_lock(mm);
+ free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
+ USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
- /* Walk the list again, actually closing and freeing it. */
- while (vma) {
+ /*
+ * Walk the list again, actually closing and freeing it, with preemption
+ * enabled, without holding any MM locks besides the unreachable
+ * mmap_write_lock.
+ */
+ do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
- vma = remove_vma(vma);
+ remove_vma(vma);
+ count++;
cond_resched();
- }
- mm->mmap = NULL;
+ } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
+
+ BUG_ON(count != mm->map_count);
+
+ trace_exit_mmap(mm);
+ __mt_destroy(&mm->mm_mt);
mmap_write_unlock(mm);
vm_unacct_memory(nr_accounted);
}
@@ -3135,14 +3122,14 @@ void exit_mmap(struct mm_struct *mm)
*/
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct *prev;
- struct rb_node **rb_link, *rb_parent;
+ unsigned long charged = vma_pages(vma);
- if (find_vma_links(mm, vma->vm_start, vma->vm_end,
- &prev, &rb_link, &rb_parent))
+
+ if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
return -ENOMEM;
+
if ((vma->vm_flags & VM_ACCOUNT) &&
- security_vm_enough_memory_mm(mm, vma_pages(vma)))
+ security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
/*
@@ -3162,7 +3149,11 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
- vma_link(mm, vma, prev, rb_link, rb_parent);
+ if (vma_link(mm, vma)) {
+ vm_unacct_memory(charged);
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -3178,9 +3169,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
unsigned long vma_start = vma->vm_start;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma, *prev;
- struct rb_node **rb_link, *rb_parent;
bool faulted_in_anon_vma = true;
+ validate_mm_mt(mm);
/*
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
@@ -3190,8 +3181,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
faulted_in_anon_vma = false;
}
- if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
+ new_vma = find_vma_prev(mm, addr, &prev);
+ if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
+
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
@@ -3232,16 +3225,27 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
- vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ if (vma_link(mm, new_vma))
+ goto out_vma_link;
*need_rmap_locks = false;
}
+ validate_mm_mt(mm);
return new_vma;
+out_vma_link:
+ if (new_vma->vm_ops && new_vma->vm_ops->close)
+ new_vma->vm_ops->close(new_vma);
+
+ if (new_vma->vm_file)
+ fput(new_vma->vm_file);
+
+ unlink_anon_vmas(new_vma);
out_free_mempol:
mpol_put(vma_policy(new_vma));
out_free_vma:
vm_area_free(new_vma);
out:
+ validate_mm_mt(mm);
return NULL;
}
@@ -3378,6 +3382,7 @@ static struct vm_area_struct *__install_special_mapping(
int ret;
struct vm_area_struct *vma;
+ validate_mm_mt(mm);
vma = vm_area_alloc(mm);
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
@@ -3400,10 +3405,12 @@ static struct vm_area_struct *__install_special_mapping(
perf_event_mmap(vma);
+ validate_mm_mt(mm);
return vma;
out:
vm_area_free(vma);
+ validate_mm_mt(mm);
return ERR_PTR(ret);
}
@@ -3528,12 +3535,13 @@ int mm_take_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
mmap_assert_write_locked(mm);
mutex_lock(&mm_all_locks_mutex);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
@@ -3541,7 +3549,8 @@ int mm_take_all_locks(struct mm_struct *mm)
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_set(&mas, 0);
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
@@ -3549,7 +3558,8 @@ int mm_take_all_locks(struct mm_struct *mm)
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_set(&mas, 0);
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
if (vma->anon_vma)
@@ -3608,11 +3618,12 @@ void mm_drop_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
mmap_assert_write_locked(mm);
BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (vma->anon_vma)
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
vm_unlock_anon_vma(avc->anon_vma);
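mm_take_all_locks() and mm_drop_all_locks() show the mechanical conversion applied throughout the series: the mm->mmap/vm_next list walk becomes a maple-tree iteration that is rewound with mas_set() before each extra pass. A side-by-side sketch; do_pass() stands in for the per-VMA work:

	/* Before: linked-list walk, restartable by re-reading mm->mmap. */
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		do_pass(vma);

	/* After: maple-tree walk, rewound explicitly between passes. */
	MA_STATE(mas, &mm->mm_mt, 0, 0);

	mas_for_each(&mas, vma, ULONG_MAX)
		do_pass(vma);

	mas_set(&mas, 0);
	mas_for_each(&mas, vma, ULONG_MAX)
		do_pass(vma);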
@@ -3733,13 +3744,9 @@ static int reserve_mem_notifier(struct notifier_block *nb,
return NOTIFY_OK;
}
-static struct notifier_block reserve_mem_nb = {
- .notifier_call = reserve_mem_notifier,
-};
-
static int __meminit init_reserve_notifier(void)
{
- if (register_hotmemory_notifier(&reserve_mem_nb))
+ if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI))
pr_err("Failed registering memory add/remove notifier for admin reserve\n");
return 0;
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index a71924bd38c0..2b93cf6ac9ae 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -8,6 +8,7 @@
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/swap.h>
+#include <linux/rmap.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
@@ -18,6 +19,10 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
{
struct mmu_gather_batch *batch;
+ /* Limit batching if we have delayed rmaps pending */
+ if (tlb->delayed_rmap && tlb->active != &tlb->local)
+ return false;
+
batch = tlb->active;
if (batch->next) {
tlb->active = batch->next;
@@ -42,12 +47,46 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
return true;
}
+#ifdef CONFIG_SMP
+static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
+{
+ for (int i = 0; i < batch->nr; i++) {
+ struct encoded_page *enc = batch->encoded_pages[i];
+
+ if (encoded_page_flags(enc)) {
+ struct page *page = encoded_page_ptr(enc);
+ page_remove_rmap(page, vma, false);
+ }
+ }
+}
+
+/**
+ * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
+ * @tlb: the current mmu_gather
+ *
+ * Note that because of how tlb_next_batch() above works, we will
+ * never start multiple new batches with pending delayed rmaps, so
+ * we only need to walk through the current active batch and the
+ * original local one.
+ */
+void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+ if (!tlb->delayed_rmap)
+ return;
+
+ tlb_flush_rmap_batch(&tlb->local, vma);
+ if (tlb->active != &tlb->local)
+ tlb_flush_rmap_batch(tlb->active, vma);
+ tlb->delayed_rmap = 0;
+}
+#endif
+
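
The mmu_gather batch now stores struct encoded_page pointers so that a per-entry flag ("this page still needs its rmap removed after the TLB flush") travels with the pointer instead of growing the array. Below is a compilable user-space model of that low-bit trick; the names encode/decode_ptr/decode_flag are illustrative only and are not the kernel's API.

        #include <assert.h>
        #include <stdint.h>

        struct page;    /* opaque here; only the pointer value matters */

        /* Pack a pointer plus one flag bit into a single word, using an alignment bit. */
        static inline uintptr_t encode(struct page *p, unsigned int flag)
        {
                assert(((uintptr_t)p & 1) == 0);        /* pointer must be at least 2-byte aligned */
                return (uintptr_t)p | (flag & 1);
        }

        static inline struct page *decode_ptr(uintptr_t e)
        {
                return (struct page *)(e & ~(uintptr_t)1);
        }

        static inline unsigned int decode_flag(uintptr_t e)
        {
                return e & 1;
        }

        int main(void)
        {
                struct page *fake = (struct page *)0x1000;      /* any suitably aligned value */
                uintptr_t e = encode(fake, 1);

                return decode_ptr(e) == fake && decode_flag(e) == 1 ? 0 : 1;
        }

tlb_flush_rmap_batch() above is the consumer of the real version of this: it walks the batch and, for every entry whose flag is set, performs the page_remove_rmap() that was deferred until after the TLB flush.
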
static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
struct mmu_gather_batch *batch;
for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
- struct page **pages = batch->pages;
+ struct encoded_page **pages = batch->encoded_pages;
do {
/*
@@ -76,7 +115,7 @@ static void tlb_batch_list_free(struct mmu_gather *tlb)
tlb->local.next = NULL;
}
-bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
+bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size)
{
struct mmu_gather_batch *batch;
@@ -91,13 +130,13 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
* Add the page and check if we are full. If so
* force a flush.
*/
- batch->pages[batch->nr++] = page;
+ batch->encoded_pages[batch->nr++] = page;
if (batch->nr == batch->max) {
if (!tlb_next_batch(tlb))
return true;
batch = tlb->active;
}
- VM_BUG_ON_PAGE(batch->nr > batch->max, page);
+ VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page));
return false;
}
@@ -152,7 +191,7 @@ static void tlb_remove_table_smp_sync(void *arg)
/* Simply deliver the interrupt */
}
-static void tlb_remove_table_sync_one(void)
+void tlb_remove_table_sync_one(void)
{
/*
* This isn't an RCU grace period and hence the page-tables cannot be
@@ -176,8 +215,6 @@ static void tlb_remove_table_free(struct mmu_table_batch *batch)
#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
-static void tlb_remove_table_sync_one(void) { }
-
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
__tlb_remove_table_free(batch);
@@ -276,6 +313,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
tlb->active = &tlb->local;
tlb->batch_count = 0;
#endif
+ tlb->delayed_rmap = 0;
tlb_table_init(tlb);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 0ae7571e35ab..68e1511be12d 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -88,6 +88,8 @@ void lruvec_init(struct lruvec *lruvec)
* Poison its list head, so that any operations on it would crash.
*/
list_del(&lruvec->lists[LRU_UNEVICTABLE]);
+
+ lru_gen_init_lruvec(lruvec);
}
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index bc6bddd156ca..908df12caa26 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -31,6 +31,7 @@
#include <linux/pgtable.h>
#include <linux/sched/sysctl.h>
#include <linux/userfaultfd_k.h>
+#include <linux/memory-tiers.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
@@ -38,14 +39,16 @@
#include "internal.h"
-static inline bool can_change_pte_writable(struct vm_area_struct *vma,
- unsigned long addr, pte_t pte)
+bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
{
struct page *page;
- VM_BUG_ON(!(vma->vm_flags & VM_WRITE) || pte_write(pte));
+ if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
+ return false;
- if (pte_protnone(pte) || !pte_dirty(pte))
+ /* Don't touch entries that are not even readable. */
+ if (pte_protnone(pte))
return false;
/* Do we need write faults for softdirty tracking? */
@@ -58,17 +61,23 @@ static inline bool can_change_pte_writable(struct vm_area_struct *vma,
if (!(vma->vm_flags & VM_SHARED)) {
/*
- * We can only special-case on exclusive anonymous pages,
- * because we know that our write-fault handler similarly would
- * map them writable without any additional checks while holding
- * the PT lock.
+ * Writable MAP_PRIVATE mapping: We can only special-case on
+ * exclusive anonymous pages, because we know that our
+ * write-fault handler similarly would map them writable without
+ * any additional checks while holding the PT lock.
*/
page = vm_normal_page(vma, addr, pte);
- if (!page || !PageAnon(page) || !PageAnonExclusive(page))
- return false;
+ return page && PageAnon(page) && PageAnonExclusive(page);
}
- return true;
+ /*
+ * Writable MAP_SHARED mapping: "clean" might indicate that the FS still
+ * needs a real write-fault for writenotify
+ * (see vma_wants_writenotify()). If "dirty", the assumption is that the
+ * FS was already notified and we can simply mark the PTE writable
+ * just like the write-fault handler would do.
+ */
+ return pte_dirty(pte);
}
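
Condensed, the rewritten helper decides "may we map this PTE writable right away?" from a handful of booleans. The sketch below restates that decision as a standalone predicate; the struct and field names are placeholders for this illustration, not kernel types.

        #include <stdbool.h>

        struct pte_view {
                bool prot_none;                 /* not even readable (e.g. NUMA hinting)  */
                bool needs_softdirty_fault;     /* softdirty tracking still wants a fault */
                bool shared_mapping;            /* VM_SHARED                              */
                bool anon_exclusive;            /* exclusive anonymous page               */
                bool dirty;                     /* pte_dirty()                            */
        };

        /* Rough model of can_change_pte_writable(); mirrors its structure only. */
        static bool may_mark_writable(const struct pte_view *p)
        {
                if (p->prot_none)
                        return false;
                if (p->needs_softdirty_fault)
                        return false;
                if (!p->shared_mapping)         /* MAP_PRIVATE: only exclusive anon pages */
                        return p->anon_exclusive;
                return p->dirty;                /* MAP_SHARED: clean still owes writenotify */
        }
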
static unsigned long change_pte_range(struct mmu_gather *tlb,
@@ -112,7 +121,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
oldpte = *pte;
if (pte_present(oldpte)) {
pte_t ptent;
- bool preserve_write = prot_numa && pte_write(oldpte);
/*
* Avoid trapping faults against the zero or KSM
@@ -121,6 +129,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
if (prot_numa) {
struct page *page;
int nid;
+ bool toptier;
/* Avoid TLB flush if possible */
if (pte_protnone(oldpte))
@@ -150,20 +159,23 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
nid = page_to_nid(page);
if (target_node == nid)
continue;
+ toptier = node_is_toptier(nid);
/*
* Skip scanning top tier node if normal numa
* balancing is disabled
*/
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
- node_is_toptier(nid))
+ toptier)
continue;
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ !toptier)
+ xchg_page_access_time(page,
+ jiffies_to_msecs(jiffies));
}
oldpte = ptep_modify_prot_start(vma, addr, pte);
ptent = pte_modify(oldpte, newprot);
- if (preserve_write)
- ptent = pte_mk_savedwrite(ptent);
if (uffd_wp) {
ptent = pte_wrprotect(ptent);
@@ -285,7 +297,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
*/
static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
{
- pmd_t pmdval = pmd_read_atomic(pmd);
+ pmd_t pmdval = pmdp_get_lockless(pmd);
/* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -543,8 +555,8 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
unsigned long oldflags = vma->vm_flags;
long nrpages = (end - start) >> PAGE_SHIFT;
+ unsigned int mm_cp_flags = 0;
unsigned long charged = 0;
- bool try_change_writable;
pgoff_t pgoff;
int error;
@@ -622,20 +634,11 @@ success:
* held in write mode.
*/
vma->vm_flags = newflags;
- /*
- * We want to check manually if we can change individual PTEs writable
- * if we can't do that automatically for all PTEs in a mapping. For
- * private mappings, that's always the case when we have write
- * permissions as we properly have to handle COW.
- */
- if (vma->vm_flags & VM_SHARED)
- try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot);
- else
- try_change_writable = !!(vma->vm_flags & VM_WRITE);
+ if (vma_wants_manual_pte_write_upgrade(vma))
+ mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
vma_set_page_prot(vma);
- change_protection(tlb, vma, start, end, vma->vm_page_prot,
- try_change_writable ? MM_CP_TRY_CHANGE_WRITABLE : 0);
+ change_protection(tlb, vma, start, end, vma->vm_page_prot, mm_cp_flags);
/*
* Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
@@ -669,6 +672,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
(prot & PROT_READ);
struct mmu_gather tlb;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
start = untagged_addr(start);
@@ -700,7 +704,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
goto out;
- vma = find_vma(current->mm, start);
+ mas_set(&mas, start);
+ vma = mas_find(&mas, ULONG_MAX);
error = -ENOMEM;
if (!vma)
goto out;
@@ -726,7 +731,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if (start > vma->vm_start)
prev = vma;
else
- prev = vma->vm_prev;
+ prev = mas_prev(&mas, 0);
tlb_gather_mmu(&tlb, current->mm);
for (nstart = start ; ; ) {
@@ -745,8 +750,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
* If a permission is not passed to mprotect(), it must be
* cleared from the VMA.
*/
- mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
- VM_FLAGS_CLEAR;
+ mask_off_old_flags = VM_ACCESS_FLAGS | VM_FLAGS_CLEAR;
new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
newflags = calc_vm_prot_bits(prot, new_vma_pkey);
@@ -789,7 +793,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if (nstart >= end)
break;
- vma = prev->vm_next;
+ vma = find_vma(current->mm, prev->vm_end);
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
break;
diff --git a/mm/mremap.c b/mm/mremap.c
index b522cd0259a0..fe587c5d6591 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -9,6 +9,7 @@
*/
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
@@ -23,6 +24,7 @@
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/userfaultfd_k.h>
+#include <linux/mempolicy.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
@@ -716,7 +718,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (excess) {
vma->vm_flags |= VM_ACCOUNT;
if (split)
- vma->vm_next->vm_flags |= VM_ACCOUNT;
+ find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT;
}
return new_addr;
@@ -866,9 +868,10 @@ out:
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
unsigned long end = vma->vm_end + delta;
+
if (end < vma->vm_end) /* overflow */
return 0;
- if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
+ if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
return 0;
if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
0, MAP_FIXED) & ~PAGE_MASK)
@@ -975,20 +978,23 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Always allow a shrinking remap: that just unmaps
* the unnecessary pages..
- * __do_munmap does all the needed commit accounting, and
+ * do_mas_munmap does all the needed commit accounting, and
* downgrades mmap_lock to read if so directed.
*/
if (old_len >= new_len) {
int retval;
+ MA_STATE(mas, &mm->mm_mt, addr + new_len, addr + new_len);
- retval = __do_munmap(mm, addr+new_len, old_len - new_len,
- &uf_unmap, true);
- if (retval < 0 && old_len != new_len) {
- ret = retval;
- goto out;
+ retval = do_mas_munmap(&mas, mm, addr + new_len,
+ old_len - new_len, &uf_unmap, true);
/* Returning 1 indicates mmap_lock is downgraded to read. */
- } else if (retval == 1)
+ if (retval == 1) {
downgraded = true;
+ } else if (retval < 0 && old_len != new_len) {
+ ret = retval;
+ goto out;
+ }
+
ret = addr;
goto out;
}
@@ -1008,6 +1014,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/* can we just expand the current mapping? */
if (vma_expandable(vma, new_len - old_len)) {
long pages = (new_len - old_len) >> PAGE_SHIFT;
+ unsigned long extension_start = addr + old_len;
+ unsigned long extension_end = addr + new_len;
+ pgoff_t extension_pgoff = vma->vm_pgoff +
+ ((extension_start - vma->vm_start) >> PAGE_SHIFT);
if (vma->vm_flags & VM_ACCOUNT) {
if (security_vm_enough_memory_mm(mm, pages)) {
@@ -1016,8 +1026,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}
}
- if (vma_adjust(vma, vma->vm_start, addr + new_len,
- vma->vm_pgoff, NULL)) {
+ /*
+ * vma_merge() is called on the extension we are adding to the already
+ * existing vma. vma_merge() will merge this extension with the already
+ * existing vma (the expand operation itself) and possibly also with the
+ * next vma if it becomes adjacent to the expanded vma and otherwise
+ * compatible.
+ */
+ vma = vma_merge(mm, vma, extension_start, extension_end,
+ vma->vm_flags, vma->anon_vma, vma->vm_file,
+ extension_pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+ if (!vma) {
vm_unacct_memory(pages);
ret = -ENOMEM;
goto out;
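
The in-place expansion now hands vma_merge() only the extension range, so the extension's pgoff must be offset from the VMA's own pgoff. A self-contained worked example of that arithmetic; the addresses, lengths and PAGE_SHIFT are chosen purely for the numbers.

        #include <stdio.h>

        #define PAGE_SHIFT 12UL

        int main(void)
        {
                unsigned long vm_start = 0x40000000UL;  /* existing VMA start          */
                unsigned long vm_pgoff = 0x100UL;       /* its file offset, in pages   */
                unsigned long addr     = 0x40000000UL;  /* mremap old address          */
                unsigned long old_len  = 0x4000UL;      /* 4 pages                     */
                unsigned long new_len  = 0x6000UL;      /* grow to 6 pages             */

                unsigned long extension_start = addr + old_len;
                unsigned long extension_end   = addr + new_len;
                unsigned long extension_pgoff = vm_pgoff +
                                ((extension_start - vm_start) >> PAGE_SHIFT);

                /* Prints: extension 0x40004000-0x40006000, pgoff 0x104 */
                printf("extension %#lx-%#lx, pgoff %#lx\n",
                       extension_start, extension_end, extension_pgoff);
                return 0;
        }
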
diff --git a/mm/msync.c b/mm/msync.c
index 137d1c104f3e..ac4c9bfea2e7 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -104,7 +104,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
error = 0;
goto out_unlock;
}
- vma = vma->vm_next;
+ vma = find_vma(mm, vma->vm_end);
}
}
out_unlock:
diff --git a/mm/nommu.c b/mm/nommu.c
index e819cbc21b39..214c70e1d059 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -19,7 +19,6 @@
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
-#include <linux/vmacache.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/file.h>
@@ -545,26 +544,27 @@ static void put_nommu_region(struct vm_region *region)
__put_nommu_region(region);
}
-/*
- * add a VMA into a process's mm_struct in the appropriate place in the list
- * and tree and add to the address space's page tree also if not an anonymous
- * page
- * - should be called with mm->mmap_lock held writelocked
- */
-static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
+void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas)
{
- struct vm_area_struct *pvma, *prev;
- struct address_space *mapping;
- struct rb_node **p, *parent, *rb_prev;
+ mas_set_range(mas, vma->vm_start, vma->vm_end - 1);
+ mas_store_prealloc(mas, vma);
+}
- BUG_ON(!vma->vm_region);
+void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas)
+{
+ mas->index = vma->vm_start;
+ mas->last = vma->vm_end - 1;
+ mas_store_prealloc(mas, NULL);
+}
+static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm)
+{
mm->map_count++;
vma->vm_mm = mm;
/* add the VMA to the mapping */
if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
+ struct address_space *mapping = vma->vm_file->f_mapping;
i_mmap_lock_write(mapping);
flush_dcache_mmap_lock(mapping);
@@ -572,67 +572,51 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
flush_dcache_mmap_unlock(mapping);
i_mmap_unlock_write(mapping);
}
+}
- /* add the VMA to the tree */
- parent = rb_prev = NULL;
- p = &mm->mm_rb.rb_node;
- while (*p) {
- parent = *p;
- pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
-
- /* sort by: start addr, end addr, VMA struct addr in that order
- * (the latter is necessary as we may get identical VMAs) */
- if (vma->vm_start < pvma->vm_start)
- p = &(*p)->rb_left;
- else if (vma->vm_start > pvma->vm_start) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else if (vma->vm_end < pvma->vm_end)
- p = &(*p)->rb_left;
- else if (vma->vm_end > pvma->vm_end) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else if (vma < pvma)
- p = &(*p)->rb_left;
- else if (vma > pvma) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else
- BUG();
- }
-
- rb_link_node(&vma->vm_rb, parent, p);
- rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+/*
+ * mas_add_vma_to_mm() - Maple state variant of add_vma_to_mm().
+ * @mas: The maple state with preallocations.
+ * @mm: The mm_struct
+ * @vma: The vma to add
+ *
+ */
+static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ BUG_ON(!vma->vm_region);
- /* add VMA to the VMA list also */
- prev = NULL;
- if (rb_prev)
- prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
+ setup_vma_to_mm(vma, mm);
- __vma_link_list(mm, vma, prev);
+ /* add the VMA to the tree */
+ vma_mas_store(vma, mas);
}
/*
- * delete a VMA from its owning mm_struct and address space
+ * add a VMA into a process's mm_struct in the appropriate place in the list
+ * and tree and add to the address space's page tree also if not an anonymous
+ * page
+ * - should be called with mm->mmap_lock held writelocked
*/
-static void delete_vma_from_mm(struct vm_area_struct *vma)
-{
- int i;
- struct address_space *mapping;
- struct mm_struct *mm = vma->vm_mm;
- struct task_struct *curr = current;
-
- mm->map_count--;
- for (i = 0; i < VMACACHE_SIZE; i++) {
- /* if the vma is cached, invalidate the entire cache */
- if (curr->vmacache.vmas[i] == vma) {
- vmacache_invalidate(mm);
- break;
- }
+static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end);
+
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ return -ENOMEM;
}
+ mas_add_vma_to_mm(&mas, mm, vma);
+ return 0;
+}
+static void cleanup_vma_from_mm(struct vm_area_struct *vma)
+{
+ vma->vm_mm->map_count--;
/* remove the VMA from the mapping */
if (vma->vm_file) {
+ struct address_space *mapping;
mapping = vma->vm_file->f_mapping;
i_mmap_lock_write(mapping);
@@ -641,11 +625,24 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
flush_dcache_mmap_unlock(mapping);
i_mmap_unlock_write(mapping);
}
+}
+/*
+ * delete a VMA from its owning mm_struct and address space
+ */
+static int delete_vma_from_mm(struct vm_area_struct *vma)
+{
+ MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0);
- /* remove from the MM's tree and list */
- rb_erase(&vma->vm_rb, &mm->mm_rb);
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ return -ENOMEM;
+ }
+ cleanup_vma_from_mm(vma);
- __vma_unlink_list(mm, vma);
+ /* remove from the MM's tree and list */
+ vma_mas_remove(vma, &mas);
+ return 0;
}
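
Both nommu paths now preallocate maple tree nodes before touching any state, so the later store cannot fail halfway through an update. The general shape of that pattern, sketched with only the helpers visible in this diff:

        MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end);

        if (mas_preallocate(&mas, vma, GFP_KERNEL)) /* reserve nodes while failure is still cheap */
                return -ENOMEM;

        /* ... update map_count, file mappings, etc.; no tree allocations from here on ... */

        vma_mas_store(vma, &mas);                   /* consumes the preallocated nodes */

mas_destroy(), seen in the do_mmap() error path further down, releases any preallocated nodes that end up unused.
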
/*
@@ -661,31 +658,26 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
vm_area_free(vma);
}
+struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+ unsigned long start_addr,
+ unsigned long end_addr)
+{
+ unsigned long index = start_addr;
+
+ mmap_assert_locked(mm);
+ return mt_find(&mm->mm_mt, &index, end_addr - 1);
+}
+EXPORT_SYMBOL(find_vma_intersection);
+
/*
* look up the first VMA in which addr resides, NULL if none
* - should be called with mm->mmap_lock at least held readlocked
*/
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma;
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
- /* check the cache first */
- vma = vmacache_find(mm, addr);
- if (likely(vma))
- return vma;
-
- /* trawl the list (there may be multiple mappings in which addr
- * resides) */
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->vm_start > addr)
- return NULL;
- if (vma->vm_end > addr) {
- vmacache_update(addr, vma);
- return vma;
- }
- }
-
- return NULL;
+ return mas_walk(&mas);
}
EXPORT_SYMBOL(find_vma);
@@ -717,26 +709,17 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
{
struct vm_area_struct *vma;
unsigned long end = addr + len;
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
- /* check the cache first */
- vma = vmacache_find_exact(mm, addr, end);
- if (vma)
- return vma;
-
- /* trawl the list (there may be multiple mappings in which addr
- * resides) */
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->vm_start < addr)
- continue;
- if (vma->vm_start > addr)
- return NULL;
- if (vma->vm_end == end) {
- vmacache_update(addr, vma);
- return vma;
- }
- }
+ vma = mas_walk(&mas);
+ if (!vma)
+ return NULL;
+ if (vma->vm_start != addr)
+ return NULL;
+ if (vma->vm_end != end)
+ return NULL;
- return NULL;
+ return vma;
}
/*
@@ -1069,6 +1052,7 @@ unsigned long do_mmap(struct file *file,
vm_flags_t vm_flags;
unsigned long capabilities, result;
int ret;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
*populate = 0;
@@ -1087,6 +1071,7 @@ unsigned long do_mmap(struct file *file,
* now know into VMA flags */
vm_flags = determine_vm_flags(file, prot, flags, capabilities);
+
/* we're going to need to record the mapping */
region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
if (!region)
@@ -1096,6 +1081,9 @@ unsigned long do_mmap(struct file *file,
if (!vma)
goto error_getting_vma;
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ goto error_maple_preallocate;
+
region->vm_usage = 1;
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;
@@ -1236,7 +1224,7 @@ unsigned long do_mmap(struct file *file,
current->mm->total_vm += len >> PAGE_SHIFT;
share:
- add_vma_to_mm(current->mm, vma);
+ mas_add_vma_to_mm(&mas, current->mm, vma);
/* we flush the region from the icache only when the first executable
* mapping of it is made */
@@ -1262,6 +1250,7 @@ error:
sharing_violation:
up_write(&nommu_region_sem);
+ mas_destroy(&mas);
pr_warn("Attempt to share mismatched mappings\n");
ret = -EINVAL;
goto error;
@@ -1278,6 +1267,14 @@ error_getting_region:
len, current->pid);
show_free_areas(0, NULL);
return -ENOMEM;
+
+error_maple_preallocate:
+ kmem_cache_free(vm_region_jar, region);
+ vm_area_free(vma);
+ pr_warn("Allocation of vma tree for process %d failed\n", current->pid);
+ show_free_areas(0, NULL);
+ return -ENOMEM;
+
}
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
@@ -1343,6 +1340,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *new;
struct vm_region *region;
unsigned long npages;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end);
/* we're only permitted to split anonymous regions (these should have
* only a single usage on the region) */
@@ -1357,9 +1355,13 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
return -ENOMEM;
new = vm_area_dup(vma);
- if (!new) {
- kmem_cache_free(vm_region_jar, region);
- return -ENOMEM;
+ if (!new)
+ goto err_vma_dup;
+
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ goto err_mas_preallocate;
}
/* most fields are the same, copy all, and then fixup */
@@ -1378,7 +1380,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
- delete_vma_from_mm(vma);
down_write(&nommu_region_sem);
delete_nommu_region(vma->vm_region);
if (new_below) {
@@ -1391,9 +1392,19 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
add_nommu_region(vma->vm_region);
add_nommu_region(new->vm_region);
up_write(&nommu_region_sem);
- add_vma_to_mm(mm, vma);
- add_vma_to_mm(mm, new);
+
+ setup_vma_to_mm(vma, mm);
+ setup_vma_to_mm(new, mm);
+ mas_set_range(&mas, vma->vm_start, vma->vm_end - 1);
+ mas_store(&mas, vma);
+ vma_mas_store(new, &mas);
return 0;
+
+err_mas_preallocate:
+ vm_area_free(new);
+err_vma_dup:
+ kmem_cache_free(vm_region_jar, region);
+ return -ENOMEM;
}
/*
@@ -1408,12 +1419,14 @@ static int shrink_vma(struct mm_struct *mm,
/* adjust the VMA's pointers, which may reposition it in the MM's tree
* and list */
- delete_vma_from_mm(vma);
+ if (delete_vma_from_mm(vma))
+ return -ENOMEM;
if (from > vma->vm_start)
vma->vm_end = from;
else
vma->vm_start = to;
- add_vma_to_mm(mm, vma);
+ if (add_vma_to_mm(mm, vma))
+ return -ENOMEM;
/* cut the backing region down to size */
region = vma->vm_region;
@@ -1441,9 +1454,10 @@ static int shrink_vma(struct mm_struct *mm,
*/
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf)
{
+ MA_STATE(mas, &mm->mm_mt, start, start);
struct vm_area_struct *vma;
unsigned long end;
- int ret;
+ int ret = 0;
len = PAGE_ALIGN(len);
if (len == 0)
@@ -1452,7 +1466,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
end = start + len;
/* find the first potentially overlapping VMA */
- vma = find_vma(mm, start);
+ vma = mas_find(&mas, end - 1);
if (!vma) {
static int limit;
if (limit < 5) {
@@ -1471,7 +1485,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
return -EINVAL;
if (end == vma->vm_end)
goto erase_whole_vma;
- vma = vma->vm_next;
+ vma = mas_next(&mas, end - 1);
} while (vma);
return -EINVAL;
} else {
@@ -1493,9 +1507,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
}
erase_whole_vma:
- delete_vma_from_mm(vma);
+ if (delete_vma_from_mm(vma))
+ ret = -ENOMEM;
delete_vma(mm, vma);
- return 0;
+ return ret;
}
int vm_munmap(unsigned long addr, size_t len)
@@ -1520,6 +1535,7 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
*/
void exit_mmap(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
if (!mm)
@@ -1527,12 +1543,18 @@ void exit_mmap(struct mm_struct *mm)
mm->total_vm = 0;
- while ((vma = mm->mmap)) {
- mm->mmap = vma->vm_next;
- delete_vma_from_mm(vma);
+ /*
+	 * Lock the mm so that locking asserts do not complain, even though
+	 * this is the only user of the mm.
+ */
+ mmap_write_lock(mm);
+ for_each_vma(vmi, vma) {
+ cleanup_vma_from_mm(vma);
delete_vma(mm, vma);
cond_resched();
}
+ __mt_destroy(&mm->mm_mt);
+ mmap_write_unlock(mm);
}
int vm_brk(unsigned long addr, unsigned long len)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3c6cf9e3cd66..1276e49b31b0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -461,7 +461,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
if (is_memcg_oom(oc))
mem_cgroup_print_oom_meminfo(oc->memcg);
else {
- show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
+ __show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask));
if (should_dump_unreclaim_slab())
dump_unreclaimable_slab();
}
@@ -509,10 +509,11 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
-bool __oom_reap_task_mm(struct mm_struct *mm)
+static bool __oom_reap_task_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma;
bool ret = true;
+ VMA_ITERATOR(vmi, mm, 0);
/*
* Tell all users of get_user/copy_from_user etc... that the content
@@ -522,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
*/
set_bit(MMF_UNSTABLE, &mm->flags);
- for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
continue;
@@ -764,10 +765,8 @@ static void mark_oom_victim(struct task_struct *tsk)
return;
/* oom_mm is bound to the signal struct life time. */
- if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
+ if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
mmgrab(tsk->signal->oom_mm);
- set_bit(MMF_OOM_VICTIM, &mm->flags);
- }
/*
* Make sure that the task is woken up from uninterruptible sleep
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 032a7bf8d259..ad608ef2a243 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -13,6 +13,7 @@
*/
#include <linux/kernel.h>
+#include <linux/math64.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
@@ -197,7 +198,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
min *= this_bw;
min = div64_ul(min, tot_bw);
}
- if (max < 100) {
+ if (max < 100 * BDI_RATIO_SCALE) {
max *= this_bw;
max = div64_ul(max, tot_bw);
}
@@ -650,11 +651,49 @@ void wb_domain_exit(struct wb_domain *dom)
*/
static unsigned int bdi_min_ratio;
-int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
+static int bdi_check_pages_limit(unsigned long pages)
+{
+ unsigned long max_dirty_pages = global_dirtyable_memory();
+
+ if (pages > max_dirty_pages)
+ return -EINVAL;
+
+ return 0;
+}
+
+static unsigned long bdi_ratio_from_pages(unsigned long pages)
+{
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ unsigned long ratio;
+
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh);
+
+ return ratio;
+}
+
+static u64 bdi_get_bytes(unsigned int ratio)
+{
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ u64 bytes;
+
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ bytes = (dirty_thresh * PAGE_SIZE * ratio) / BDI_RATIO_SCALE / 100;
+
+ return bytes;
+}
+
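
bdi min_ratio/max_ratio are now kept in a finer unit (percent multiplied by BDI_RATIO_SCALE), which is what makes the new byte-based interfaces expressible as ratios. Below is a standalone round trip through the two conversions above; the scale factor and dirty threshold are assumed values used only for illustration.

        #include <stdio.h>

        #define PAGE_SIZE        4096ULL
        #define BDI_RATIO_SCALE  10000ULL       /* assumed value, for illustration only */

        int main(void)
        {
                unsigned long long dirty_thresh = 1000000;      /* pages, e.g. from global_dirty_limits() */
                unsigned long long min_bytes    = 1ULL << 30;   /* request: 1 GiB */

                /* bdi_ratio_from_pages(): bytes -> pages -> scaled ratio */
                unsigned long long pages = min_bytes / PAGE_SIZE;               /* 262144 pages */
                unsigned long long ratio = pages * 100 * BDI_RATIO_SCALE / dirty_thresh;

                /* bdi_get_bytes(): scaled ratio -> bytes, against the same threshold */
                unsigned long long bytes = dirty_thresh * PAGE_SIZE * ratio / BDI_RATIO_SCALE / 100;

                /* Prints: ratio=262144 (26.2144%), bytes back=1073741824 */
                printf("ratio=%llu (%.4f%%), bytes back=%llu\n",
                       ratio, (double)ratio / BDI_RATIO_SCALE, bytes);
                return 0;
        }
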
+static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
unsigned int delta;
int ret = 0;
+ if (min_ratio > 100 * BDI_RATIO_SCALE)
+ return -EINVAL;
+ min_ratio *= BDI_RATIO_SCALE;
+
spin_lock_bh(&bdi_lock);
if (min_ratio > bdi->max_ratio) {
ret = -EINVAL;
@@ -665,7 +704,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
bdi->min_ratio = min_ratio;
} else {
delta = min_ratio - bdi->min_ratio;
- if (bdi_min_ratio + delta < 100) {
+ if (bdi_min_ratio + delta < 100 * BDI_RATIO_SCALE) {
bdi_min_ratio += delta;
bdi->min_ratio = min_ratio;
} else {
@@ -678,11 +717,11 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
return ret;
}
-int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
+static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio)
{
int ret = 0;
- if (max_ratio > 100)
+ if (max_ratio > 100 * BDI_RATIO_SCALE)
return -EINVAL;
spin_lock_bh(&bdi_lock);
@@ -696,8 +735,81 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
return ret;
}
+
+int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio)
+{
+ return __bdi_set_min_ratio(bdi, min_ratio);
+}
+
+int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio)
+{
+ return __bdi_set_max_ratio(bdi, max_ratio);
+}
+
+int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
+{
+ return __bdi_set_min_ratio(bdi, min_ratio * BDI_RATIO_SCALE);
+}
+
+int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio)
+{
+ return __bdi_set_max_ratio(bdi, max_ratio * BDI_RATIO_SCALE);
+}
EXPORT_SYMBOL(bdi_set_max_ratio);
+u64 bdi_get_min_bytes(struct backing_dev_info *bdi)
+{
+ return bdi_get_bytes(bdi->min_ratio);
+}
+
+int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes)
+{
+ int ret;
+ unsigned long pages = min_bytes >> PAGE_SHIFT;
+ unsigned long min_ratio;
+
+ ret = bdi_check_pages_limit(pages);
+ if (ret)
+ return ret;
+
+ min_ratio = bdi_ratio_from_pages(pages);
+ return __bdi_set_min_ratio(bdi, min_ratio);
+}
+
+u64 bdi_get_max_bytes(struct backing_dev_info *bdi)
+{
+ return bdi_get_bytes(bdi->max_ratio);
+}
+
+int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes)
+{
+ int ret;
+ unsigned long pages = max_bytes >> PAGE_SHIFT;
+ unsigned long max_ratio;
+
+ ret = bdi_check_pages_limit(pages);
+ if (ret)
+ return ret;
+
+ max_ratio = bdi_ratio_from_pages(pages);
+ return __bdi_set_max_ratio(bdi, max_ratio);
+}
+
+int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit)
+{
+ if (strict_limit > 1)
+ return -EINVAL;
+
+ spin_lock_bh(&bdi_lock);
+ if (strict_limit)
+ bdi->capabilities |= BDI_CAP_STRICTLIMIT;
+ else
+ bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;
+ spin_unlock_bh(&bdi_lock);
+
+ return 0;
+}
+
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
unsigned long bg_thresh)
{
@@ -760,15 +872,15 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
&numerator, &denominator);
- wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
+ wb_thresh = (thresh * (100 * BDI_RATIO_SCALE - bdi_min_ratio)) / (100 * BDI_RATIO_SCALE);
wb_thresh *= numerator;
wb_thresh = div64_ul(wb_thresh, denominator);
wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
- wb_thresh += (thresh * wb_min_ratio) / 100;
- if (wb_thresh > (thresh * wb_max_ratio) / 100)
- wb_thresh = thresh * wb_max_ratio / 100;
+ wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE);
+ if (wb_thresh > (thresh * wb_max_ratio) / (100 * BDI_RATIO_SCALE))
+ wb_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
return wb_thresh;
}
@@ -1933,6 +2045,7 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
wb_put(wb);
return ret;
}
+EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited_flags);
/**
* balance_dirty_pages_ratelimited - balance dirty memory state.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e5486d47406e..0745aedebb37 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -27,6 +27,7 @@
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
+#include <linux/kmsan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
@@ -169,21 +170,12 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
_ret; \
})
-#define pcpu_spin_lock_irqsave(type, member, ptr, flags) \
+#define pcpu_spin_trylock(type, member, ptr) \
({ \
type *_ret; \
pcpu_task_pin(); \
_ret = this_cpu_ptr(ptr); \
- spin_lock_irqsave(&_ret->member, flags); \
- _ret; \
-})
-
-#define pcpu_spin_trylock_irqsave(type, member, ptr, flags) \
-({ \
- type *_ret; \
- pcpu_task_pin(); \
- _ret = this_cpu_ptr(ptr); \
- if (!spin_trylock_irqsave(&_ret->member, flags)) { \
+ if (!spin_trylock(&_ret->member)) { \
pcpu_task_unpin(); \
_ret = NULL; \
} \
@@ -196,27 +188,16 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
pcpu_task_unpin(); \
})
-#define pcpu_spin_unlock_irqrestore(member, ptr, flags) \
-({ \
- spin_unlock_irqrestore(&ptr->member, flags); \
- pcpu_task_unpin(); \
-})
-
/* struct per_cpu_pages specific helpers. */
#define pcp_spin_lock(ptr) \
pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
-#define pcp_spin_lock_irqsave(ptr, flags) \
- pcpu_spin_lock_irqsave(struct per_cpu_pages, lock, ptr, flags)
-
-#define pcp_spin_trylock_irqsave(ptr, flags) \
- pcpu_spin_trylock_irqsave(struct per_cpu_pages, lock, ptr, flags)
+#define pcp_spin_trylock(ptr) \
+ pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
#define pcp_spin_unlock(ptr) \
pcpu_spin_unlock(lock, ptr)
-#define pcp_spin_unlock_irqrestore(ptr, flags) \
- pcpu_spin_unlock_irqrestore(lock, ptr, flags)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -482,6 +463,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
static unsigned long prev_end_pfn, nr_initialised;
+ if (early_page_ext_enabled())
+ return false;
/*
* prev_end_pfn static that contains the end of previous zone
* No need to protect because called very early in boot before smp_init.
@@ -542,7 +525,7 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
#ifdef CONFIG_SPARSEMEM
pfn &= (PAGES_PER_SECTION-1);
#else
- pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
+ pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
#endif /* CONFIG_SPARSEMEM */
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
@@ -795,6 +778,7 @@ static void prep_compound_head(struct page *page, unsigned int order)
set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
set_compound_order(page, order);
atomic_set(compound_mapcount_ptr(page), -1);
+ atomic_set(subpages_mapcount_ptr(page), 0);
atomic_set(compound_pincount_ptr(page), 0);
}
@@ -804,6 +788,7 @@ static void prep_compound_tail(struct page *head, int tail_idx)
p->mapping = TAIL_MAPPING;
set_compound_head(p, head);
+ set_page_private(p, 0);
}
void prep_compound_page(struct page *page, unsigned int order)
@@ -870,7 +855,8 @@ static inline bool set_page_guard(struct zone *zone, struct page *page,
INIT_LIST_HEAD(&page->buddy_list);
set_page_private(page, order);
/* Guard pages are not available for any usage */
- __mod_zone_freepage_state(zone, -(1 << order), migratetype);
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, -(1 << order), migratetype);
return true;
}
@@ -900,7 +886,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
* order of appearance. So we need to first gather the full picture of what was
* enabled, and then make decisions.
*/
-void init_mem_debugging_and_hardening(void)
+void __init init_mem_debugging_and_hardening(void)
{
bool page_poisoning_requested = false;
@@ -935,6 +921,10 @@ void init_mem_debugging_and_hardening(void)
else
static_branch_disable(&init_on_free);
+ if (IS_ENABLED(CONFIG_KMSAN) &&
+ (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
+ pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
+
#ifdef CONFIG_DEBUG_PAGEALLOC
if (!debug_pagealloc_enabled())
return;
@@ -1105,7 +1095,7 @@ static inline void __free_one_page(struct page *page,
int migratetype, fpi_t fpi_flags)
{
struct capture_control *capc = task_capc(zone);
- unsigned long buddy_pfn;
+ unsigned long buddy_pfn = 0;
unsigned long combined_pfn;
struct page *buddy;
bool to_tail;
@@ -1283,20 +1273,20 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
return bad_reason;
}
-static void check_free_page_bad(struct page *page)
+static void free_page_is_bad_report(struct page *page)
{
bad_page(page,
page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
}
-static inline int check_free_page(struct page *page)
+static inline bool free_page_is_bad(struct page *page)
{
if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
- return 0;
+ return false;
/* Something has gone sideways, find it */
- check_free_page_bad(page);
- return 1;
+ free_page_is_bad_report(page);
+ return true;
}
static int free_tail_pages_check(struct page *head_page, struct page *page)
@@ -1315,11 +1305,19 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
}
switch (page - head_page) {
case 1:
- /* the first tail page: ->mapping may be compound_mapcount() */
- if (unlikely(compound_mapcount(page))) {
+ /* the first tail page: these may be in place of ->mapping */
+ if (unlikely(head_compound_mapcount(head_page))) {
bad_page(page, "nonzero compound_mapcount");
goto out;
}
+ if (unlikely(atomic_read(subpages_mapcount_ptr(head_page)))) {
+ bad_page(page, "nonzero subpages_mapcount");
+ goto out;
+ }
+ if (unlikely(head_compound_pincount(head_page))) {
+ bad_page(page, "nonzero compound_pincount");
+ goto out;
+ }
break;
case 2:
/*
@@ -1398,6 +1396,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
VM_BUG_ON_PAGE(PageTail(page), page);
trace_mm_page_free(page, order);
+ kmsan_free_page(page, order);
if (unlikely(PageHWPoison(page)) && !order) {
/*
@@ -1421,14 +1420,12 @@ static __always_inline bool free_pages_prepare(struct page *page,
VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
- if (compound) {
- ClearPageDoubleMap(page);
+ if (compound)
ClearPageHasHWPoisoned(page);
- }
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_pages_check(page, page + i);
- if (unlikely(check_free_page(page + i))) {
+ if (unlikely(free_page_is_bad(page + i))) {
bad++;
continue;
}
@@ -1439,8 +1436,8 @@ static __always_inline bool free_pages_prepare(struct page *page,
page->mapping = NULL;
if (memcg_kmem_enabled() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
- if (check_free)
- bad += check_free_page(page);
+ if (check_free && free_page_is_bad(page))
+ bad++;
if (bad)
return false;
@@ -1499,10 +1496,11 @@ static bool free_pcp_prepare(struct page *page, unsigned int order)
return free_pages_prepare(page, order, true, FPI_NONE);
}
+/* return true if this page has an inappropriate state */
static bool bulkfree_pcp_prepare(struct page *page)
{
if (debug_pagealloc_enabled_static())
- return check_free_page(page);
+ return free_page_is_bad(page);
else
return false;
}
@@ -1523,7 +1521,7 @@ static bool free_pcp_prepare(struct page *page, unsigned int order)
static bool bulkfree_pcp_prepare(struct page *page)
{
- return check_free_page(page);
+ return free_page_is_bad(page);
}
#endif /* CONFIG_DEBUG_VM */
@@ -1536,6 +1534,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
struct per_cpu_pages *pcp,
int pindex)
{
+ unsigned long flags;
int min_pindex = 0;
int max_pindex = NR_PCP_LISTS - 1;
unsigned int order;
@@ -1551,8 +1550,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* Ensure requested pindex is drained first. */
pindex = pindex - 1;
- /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
- spin_lock(&zone->lock);
+ spin_lock_irqsave(&zone->lock, flags);
isolated_pageblocks = has_isolate_pageblock(zone);
while (count > 0) {
@@ -1575,7 +1573,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
order = pindex_to_order(pindex);
nr_pages = 1 << order;
- BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH));
do {
int mt;
@@ -1601,7 +1598,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
} while (count > 0 && !list_empty(list));
}
- spin_unlock(&zone->lock);
+ spin_unlock_irqrestore(&zone->lock, flags);
}
static void free_one_page(struct zone *zone,
@@ -1705,6 +1702,11 @@ static void __free_pages_ok(struct page *page, unsigned int order,
if (!free_pages_prepare(page, order, true, fpi_flags))
return;
+ /*
+	 * get_pfnblock_migratetype() is called without spin_lock_irqsave() here
+	 * so that it does not run under zone->lock, which shortens the lock
+	 * hold time.
+ */
migratetype = get_pfnblock_migratetype(page, pfn);
spin_lock_irqsave(&zone->lock, flags);
@@ -1804,6 +1806,10 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
{
if (early_page_uninitialised(pfn))
return;
+ if (!kmsan_memblock_free_pages(page, order)) {
+ /* KMSAN will take care of these pages. */
+ return;
+ }
__free_pages_core(page, order);
}
@@ -1855,7 +1861,7 @@ void set_zone_contiguous(struct zone *zone)
unsigned long block_start_pfn = zone->zone_start_pfn;
unsigned long block_end_pfn;
- block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(block_start_pfn);
for (; block_start_pfn < zone_end_pfn(zone);
block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
@@ -1890,15 +1896,14 @@ static void __init deferred_free_range(unsigned long pfn,
page = pfn_to_page(pfn);
/* Free a large naturally-aligned chunk if possible */
- if (nr_pages == pageblock_nr_pages &&
- (pfn & (pageblock_nr_pages - 1)) == 0) {
+ if (nr_pages == pageblock_nr_pages && pageblock_aligned(pfn)) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
__free_pages_core(page, pageblock_order);
return;
}
for (i = 0; i < nr_pages; i++, page++, pfn++) {
- if ((pfn & (pageblock_nr_pages - 1)) == 0)
+ if (pageblock_aligned(pfn))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
__free_pages_core(page, 0);
}
@@ -1917,16 +1922,12 @@ static inline void __init pgdat_init_report_one_done(void)
/*
* Returns true if page needs to be initialized or freed to buddy allocator.
*
- * First we check if pfn is valid on architectures where it is possible to have
- * holes within pageblock_nr_pages. On systems where it is not possible, this
- * function is optimized out.
- *
- * Then, we check if a current large page is valid by only checking the validity
+ * We check if a current large page is valid by only checking the validity
* of the head pfn.
*/
static inline bool __init deferred_pfn_valid(unsigned long pfn)
{
- if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
+ if (pageblock_aligned(pfn) && !pfn_valid(pfn))
return false;
return true;
}
@@ -1938,14 +1939,13 @@ static inline bool __init deferred_pfn_valid(unsigned long pfn)
static void __init deferred_free_pages(unsigned long pfn,
unsigned long end_pfn)
{
- unsigned long nr_pgmask = pageblock_nr_pages - 1;
unsigned long nr_free = 0;
for (; pfn < end_pfn; pfn++) {
if (!deferred_pfn_valid(pfn)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 0;
- } else if (!(pfn & nr_pgmask)) {
+ } else if (pageblock_aligned(pfn)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 1;
} else {
@@ -1965,7 +1965,6 @@ static unsigned long __init deferred_init_pages(struct zone *zone,
unsigned long pfn,
unsigned long end_pfn)
{
- unsigned long nr_pgmask = pageblock_nr_pages - 1;
int nid = zone_to_nid(zone);
unsigned long nr_pages = 0;
int zid = zone_idx(zone);
@@ -1975,7 +1974,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone,
if (!deferred_pfn_valid(pfn)) {
page = NULL;
continue;
- } else if (!page || !(pfn & nr_pgmask)) {
+ } else if (!page || pageblock_aligned(pfn)) {
page = pfn_to_page(pfn);
} else {
page++;
@@ -2651,8 +2650,8 @@ int move_freepages_block(struct zone *zone, struct page *page,
*num_movable = 0;
pfn = page_to_pfn(page);
- start_pfn = pfn & ~(pageblock_nr_pages - 1);
- end_pfn = start_pfn + pageblock_nr_pages - 1;
+ start_pfn = pageblock_start_pfn(pfn);
+ end_pfn = pageblock_end_pfn(pfn) - 1;
/* Do not cross zone boundaries */
if (!zone_spans_pfn(zone, start_pfn))
@@ -3010,7 +3009,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
* i.e. orders < pageblock_order. If there are no local zones free,
* the zonelists will be reiterated without ALLOC_NOFRAGMENT.
*/
- if (alloc_flags & ALLOC_NOFRAGMENT)
+ if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
min_order = pageblock_order;
/*
@@ -3118,10 +3117,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
+ unsigned long flags;
int i, allocated = 0;
- /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
- spin_lock(&zone->lock);
+ spin_lock_irqsave(&zone->lock, flags);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
alloc_flags);
@@ -3155,7 +3154,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* pages added to the pcp list.
*/
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
- spin_unlock(&zone->lock);
+ spin_unlock_irqrestore(&zone->lock, flags);
return allocated;
}
@@ -3172,16 +3171,9 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0) {
- unsigned long flags;
-
- /*
- * free_pcppages_bulk expects IRQs disabled for zone->lock
- * so even though pcp->lock is not intended to be IRQ-safe,
- * it's needed in this context.
- */
- spin_lock_irqsave(&pcp->lock, flags);
+ spin_lock(&pcp->lock);
free_pcppages_bulk(zone, to_drain, pcp, 0);
- spin_unlock_irqrestore(&pcp->lock, flags);
+ spin_unlock(&pcp->lock);
}
}
#endif
@@ -3195,12 +3187,9 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count) {
- unsigned long flags;
-
- /* See drain_zone_pages on why this is disabling IRQs */
- spin_lock_irqsave(&pcp->lock, flags);
+ spin_lock(&pcp->lock);
free_pcppages_bulk(zone, pcp->count, pcp, 0);
- spin_unlock_irqrestore(&pcp->lock, flags);
+ spin_unlock(&pcp->lock);
}
}
@@ -3440,7 +3429,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
int pindex;
bool free_high;
- __count_vm_event(PGFREE);
+ __count_vm_events(PGFREE, 1 << order);
pindex = order_to_pindex(migratetype, order);
list_add(&page->pcp_list, &pcp->lists[pindex]);
pcp->count += 1 << order;
@@ -3466,7 +3455,6 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
*/
void free_unref_page(struct page *page, unsigned int order)
{
- unsigned long flags;
unsigned long __maybe_unused UP_flags;
struct per_cpu_pages *pcp;
struct zone *zone;
@@ -3494,10 +3482,10 @@ void free_unref_page(struct page *page, unsigned int order)
zone = page_zone(page);
pcp_trylock_prepare(UP_flags);
- pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
free_unref_page_commit(zone, pcp, page, migratetype, order);
- pcp_spin_unlock_irqrestore(pcp, flags);
+ pcp_spin_unlock(pcp);
} else {
free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
}
@@ -3509,10 +3497,10 @@ void free_unref_page(struct page *page, unsigned int order)
*/
void free_unref_page_list(struct list_head *list)
{
+ unsigned long __maybe_unused UP_flags;
struct page *page, *next;
struct per_cpu_pages *pcp = NULL;
struct zone *locked_zone = NULL;
- unsigned long flags;
int batch_count = 0;
int migratetype;
@@ -3539,39 +3527,54 @@ void free_unref_page_list(struct list_head *list)
list_for_each_entry_safe(page, next, list, lru) {
struct zone *zone = page_zone(page);
- /* Different zone, different pcp lock. */
- if (zone != locked_zone) {
- if (pcp)
- pcp_spin_unlock_irqrestore(pcp, flags);
+ list_del(&page->lru);
+ migratetype = get_pcppage_migratetype(page);
+
+ /*
+ * Either different zone requiring a different pcp lock or
+ * excessive lock hold times when freeing a large list of
+ * pages.
+ */
+ if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) {
+ if (pcp) {
+ pcp_spin_unlock(pcp);
+ pcp_trylock_finish(UP_flags);
+ }
+ batch_count = 0;
+
+ /*
+ * trylock is necessary as pages may be getting freed
+ * from IRQ or SoftIRQ context after an IO completion.
+ */
+ pcp_trylock_prepare(UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+ if (unlikely(!pcp)) {
+ pcp_trylock_finish(UP_flags);
+ free_one_page(zone, page, page_to_pfn(page),
+ 0, migratetype, FPI_NONE);
+ locked_zone = NULL;
+ continue;
+ }
locked_zone = zone;
- pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
}
/*
* Non-isolated types over MIGRATE_PCPTYPES get added
* to the MIGRATE_MOVABLE pcp list.
*/
- migratetype = get_pcppage_migratetype(page);
if (unlikely(migratetype >= MIGRATE_PCPTYPES))
migratetype = MIGRATE_MOVABLE;
trace_mm_page_free_batched(page);
free_unref_page_commit(zone, pcp, page, migratetype, 0);
-
- /*
- * Guard against excessive IRQ disabled times when we get
- * a large list of pages to free.
- */
- if (++batch_count == SWAP_CLUSTER_MAX) {
- pcp_spin_unlock_irqrestore(pcp, flags);
- batch_count = 0;
- pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
- }
+ batch_count++;
}
- if (pcp)
- pcp_spin_unlock_irqrestore(pcp, flags);
+ if (pcp) {
+ pcp_spin_unlock(pcp);
+ pcp_trylock_finish(UP_flags);
+ }
}
/*
@@ -3598,16 +3601,11 @@ EXPORT_SYMBOL_GPL(split_page);
int __isolate_free_page(struct page *page, unsigned int order)
{
- unsigned long watermark;
- struct zone *zone;
- int mt;
-
- BUG_ON(!PageBuddy(page));
-
- zone = page_zone(page);
- mt = get_pageblock_migratetype(page);
+ struct zone *zone = page_zone(page);
+ int mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt)) {
+ unsigned long watermark;
/*
* Obey watermarks as if the page was being allocated. We can
* emulate a high-order watermark check with a raised order-0
@@ -3621,8 +3619,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
__mod_zone_freepage_state(zone, -(1UL << order), mt);
}
- /* Remove page from free list */
-
del_page_from_free_list(page, zone, order);
/*
@@ -3643,7 +3639,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
}
}
-
return 1UL << order;
}
@@ -3670,8 +3665,6 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt)
/*
* Update NUMA hit/miss statistics
- *
- * Must be called with interrupts disabled.
*/
static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
long nr_account)
@@ -3777,21 +3770,16 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int migratetype,
- unsigned int alloc_flags)
+ int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
struct page *page;
- unsigned long flags;
unsigned long __maybe_unused UP_flags;
- /*
- * spin_trylock may fail due to a parallel drain. In the future, the
- * trylock will also protect against IRQ reentrancy.
- */
+ /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
pcp_trylock_prepare(UP_flags);
- pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (!pcp) {
pcp_trylock_finish(UP_flags);
return NULL;
@@ -3805,18 +3793,27 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
pcp->free_factor >>= 1;
list = &pcp->lists[order_to_pindex(migratetype, order)];
page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
- pcp_spin_unlock_irqrestore(pcp, flags);
+ pcp_spin_unlock(pcp);
pcp_trylock_finish(UP_flags);
if (page) {
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
}
return page;
}
/*
- * Allocate a page from the given zone. Use pcplists for order-0 allocations.
+ * Allocate a page from the given zone.
+ * Use pcplists for THP or "cheap" high-order allocations.
*/
+
+/*
+ * Do not instrument rmqueue() with KMSAN. This function may call
+ * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
+ * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
+ * may call rmqueue() again, which will result in a deadlock.
+ */
+__no_sanitize_memory
static inline
struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
@@ -3839,7 +3836,7 @@ struct page *rmqueue(struct zone *preferred_zone,
if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
migratetype != MIGRATE_MOVABLE) {
page = rmqueue_pcplist(preferred_zone, zone, order,
- gfp_flags, migratetype, alloc_flags);
+ migratetype, alloc_flags);
if (likely(page))
goto out;
}
@@ -3882,6 +3879,8 @@ __setup("fail_page_alloc=", setup_fail_page_alloc);
static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
+ int flags = 0;
+
if (order < fail_page_alloc.min_order)
return false;
if (gfp_mask & __GFP_NOFAIL)
@@ -3892,10 +3891,11 @@ static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
(gfp_mask & __GFP_DIRECT_RECLAIM))
return false;
+ /* See comment in __should_failslab() */
if (gfp_mask & __GFP_NOWARN)
- fail_page_alloc.attr.no_warn = true;
+ flags |= FAULT_NOWARN;
- return should_fail(&fail_page_alloc.attr, 1 << order);
+ return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
}
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
@@ -4329,7 +4329,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
filter &= ~SHOW_MEM_FILTER_NODES;
- show_mem(filter, nodemask);
+ __show_mem(filter, nodemask, gfp_zone(gfp_mask));
}
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
@@ -4708,6 +4708,30 @@ void fs_reclaim_release(gfp_t gfp_mask)
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
+/*
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
+ * have been rebuilt so that the allocation can be retried. The reader side does
+ * not lock and retries the allocation if the zonelist changes. The writer side
+ * is protected by the embedded spin_lock.
+ */
+static DEFINE_SEQLOCK(zonelist_update_seq);
+
+static unsigned int zonelist_iter_begin(void)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqbegin(&zonelist_update_seq);
+
+ return 0;
+}
+
+static unsigned int check_retry_zonelist(unsigned int seq)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqretry(&zonelist_update_seq, seq);
+
+ return seq;
+}
+
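
zonelist_iter_begin()/check_retry_zonelist() follow the classic seqlock read-retry pattern: readers sample a sequence value, do their work locklessly, and redo it if a writer has rebuilt the zonelists in the meantime. A minimal single-threaded model of that retry loop (plain integers instead of a seqlock_t; illustrative only):

        #include <stdio.h>

        static unsigned int zonelist_seq;       /* bumped by the "writer" on every rebuild */

        static unsigned int iter_begin(void)             { return zonelist_seq; }
        static int          retry_needed(unsigned int s) { return s != zonelist_seq; }

        int main(void)
        {
                unsigned int cookie;
                int attempts = 0;

                do {
                        cookie = iter_begin();
                        attempts++;
                        /* ... try the allocation against the current zonelists ... */
                        if (attempts == 1)
                                zonelist_seq++; /* simulate a concurrent hotplug rebuild */
                } while (retry_needed(cookie));

                printf("succeeded after %d attempts\n", attempts);      /* prints 2 */
                return 0;
        }

The real implementation uses a seqlock so that writers also serialize among themselves and readers observe a consistent snapshot; the model only shows the retry shape.
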
/* Perform direct synchronous page reclaim */
static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -5001,6 +5025,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int compaction_retries;
int no_progress_loops;
unsigned int cpuset_mems_cookie;
+ unsigned int zonelist_iter_cookie;
int reserve_flags;
/*
@@ -5011,11 +5036,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
-retry_cpuset:
+restart:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist_iter_cookie = zonelist_iter_begin();
/*
* The fast path uses conservative alloc_flags to succeed only until
@@ -5121,7 +5147,8 @@ retry:
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
- alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags);
+ alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) |
+ (alloc_flags & ALLOC_KSWAPD);
/*
* Reset the nodemask and zonelist iterators if memory policies can be
@@ -5187,9 +5214,13 @@ retry:
goto retry;
- /* Deal with possible cpuset update races before we start OOM killing */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+	 * an unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
@@ -5209,9 +5240,13 @@ retry:
}
nopage:
- /* Deal with possible cpuset update races before we fail */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+	 * an unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
@@ -5238,7 +5273,7 @@ nopage:
* so that we can identify them and convert them to something
* else.
*/
- WARN_ON_ONCE_GFP(order > PAGE_ALLOC_COSTLY_ORDER, gfp_mask);
+ WARN_ON_ONCE_GFP(costly_order, gfp_mask);
/*
* Help non-failing allocations by giving them access to memory
@@ -5329,7 +5364,6 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
struct page **page_array)
{
struct page *page;
- unsigned long flags;
unsigned long __maybe_unused UP_flags;
struct zone *zone;
struct zoneref *z;
@@ -5411,9 +5445,9 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
if (unlikely(!zone))
goto failed;
- /* Is a parallel drain in progress? */
+ /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
pcp_trylock_prepare(UP_flags);
- pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (!pcp)
goto failed_irq;
@@ -5432,7 +5466,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
if (unlikely(!page)) {
/* Try and allocate at least one page */
if (!nr_account) {
- pcp_spin_unlock_irqrestore(pcp, flags);
+ pcp_spin_unlock(pcp);
goto failed_irq;
}
break;
@@ -5447,7 +5481,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
nr_populated++;
}
- pcp_spin_unlock_irqrestore(pcp, flags);
+ pcp_spin_unlock(pcp);
pcp_trylock_finish(UP_flags);
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
@@ -5535,6 +5569,7 @@ out:
}
trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
+ kmsan_alloc_page(page, order, alloc_gfp);
return page;
}
@@ -5706,6 +5741,18 @@ refill:
/* reset page count bias and offset to start of new frag */
nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
offset = size - fragsz;
+ if (unlikely(offset < 0)) {
+ /*
+ * The caller is trying to allocate a fragment
+ * with fragsz > PAGE_SIZE but the cache isn't big
+ * enough to satisfy the request; this may
+ * happen in low memory conditions.
+ * We don't release the cache page because
+ * it could make memory pressure worse,
+ * so we simply return NULL here.
+ */
+ return NULL;
+ }
}
nc->pagecnt_bias--;
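For concreteness, hypothetical numbers (not taken from the patch) showing when the new underflow check fires:

	int size = 4096;	/* under memory pressure the cache was refilled
				 * with a single 4 KiB page instead of 32 KiB  */
	int fragsz = 8192;	/* caller asks for more than one page          */
	int offset = size - fragsz;	/* -4096 < 0, so page_frag_alloc()
					 * now returns NULL instead of handing
					 * out a fragment that overruns the page */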
@@ -5732,14 +5779,18 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
{
if (addr) {
- unsigned long alloc_end = addr + (PAGE_SIZE << order);
- unsigned long used = addr + PAGE_ALIGN(size);
+ unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE);
+ struct page *page = virt_to_page((void *)addr);
+ struct page *last = page + nr;
- split_page(virt_to_page((void *)addr), order);
- while (used < alloc_end) {
- free_page(used);
- used += PAGE_SIZE;
- }
+ split_page_owner(page, 1 << order);
+ split_page_memcg(page, 1 << order);
+ while (page < --last)
+ set_page_refcounted(last);
+
+ last = page + (1UL << order);
+ for (page += nr; page < last; page++)
+ __free_pages_ok(page, 0, FPI_TO_TAIL);
}
return (void *)addr;
}
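As a worked illustration of the rewritten helper (sizes are hypothetical), a caller asking alloc_pages_exact() for five pages gets an order-3 block, keeps the first five pages, and has the remaining three returned to the tail of the free lists:

	void *buf = alloc_pages_exact(5 * PAGE_SIZE, GFP_KERNEL);
	/*
	 * nr = DIV_ROUND_UP(5 * PAGE_SIZE, PAGE_SIZE) = 5 pages are kept;
	 * the order-3 allocation spans 8 pages, so pages 5..7 are handed
	 * back one by one via __free_pages_ok(..., FPI_TO_TAIL) rather
	 * than through free_page() as before.
	 */
	free_pages_exact(buf, 5 * PAGE_SIZE);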
@@ -6011,6 +6062,15 @@ static void show_migration_types(unsigned char type)
printk(KERN_CONT "(%s) ", tmp);
}
+static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx)
+{
+ int zone_idx;
+ for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++)
+ if (zone_managed_pages(pgdat->node_zones + zone_idx))
+ return true;
+ return false;
+}
+
/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
@@ -6020,7 +6080,7 @@ static void show_migration_types(unsigned char type)
* SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
* cpuset.
*/
-void show_free_areas(unsigned int filter, nodemask_t *nodemask)
+void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
{
unsigned long free_pcp = 0;
int cpu, nid;
@@ -6028,6 +6088,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
pg_data_t *pgdat;
for_each_populated_zone(zone) {
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
@@ -6039,7 +6101,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
" unevictable:%lu dirty:%lu writeback:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
- " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+ " mapped:%lu shmem:%lu pagetables:%lu\n"
+ " sec_pagetables:%lu bounce:%lu\n"
" kernel_misc_reclaimable:%lu\n"
" free:%lu free_pcp:%lu free_cma:%lu\n",
global_node_page_state(NR_ACTIVE_ANON),
@@ -6056,6 +6119,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
global_node_page_state(NR_PAGETABLE),
+ global_node_page_state(NR_SECONDARY_PAGETABLE),
global_zone_page_state(NR_BOUNCE),
global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
global_zone_page_state(NR_FREE_PAGES),
@@ -6065,6 +6129,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
for_each_online_pgdat(pgdat) {
if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
continue;
+ if (!node_has_managed_zones(pgdat, max_zone_idx))
+ continue;
printk("Node %d"
" active_anon:%lukB"
@@ -6089,6 +6155,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" shadow_call_stack:%lukB"
#endif
" pagetables:%lukB"
+ " sec_pagetables:%lukB"
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
@@ -6114,6 +6181,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
node_page_state(pgdat, NR_KERNEL_SCS_KB),
#endif
K(node_page_state(pgdat, NR_PAGETABLE)),
+ K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
"yes" : "no");
}
@@ -6121,6 +6189,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
for_each_populated_zone(zone) {
int i;
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
@@ -6182,6 +6252,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
unsigned long nr[MAX_ORDER], flags, total = 0;
unsigned char types[MAX_ORDER];
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
show_node(zone);
@@ -6507,16 +6579,15 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
#define BOOT_PAGESET_BATCH 1
static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
-DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
+static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
static void __build_all_zonelists(void *data)
{
int nid;
int __maybe_unused cpu;
pg_data_t *self = data;
- static DEFINE_SPINLOCK(lock);
- spin_lock(&lock);
+ write_seqlock(&zonelist_update_seq);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
@@ -6553,7 +6624,7 @@ static void __build_all_zonelists(void *data)
#endif
}
- spin_unlock(&lock);
+ write_sequnlock(&zonelist_update_seq);
}
static noinline void __init
@@ -6704,7 +6775,7 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
* such that unmovable allocations won't be scattered all
* over the place during system boot.
*/
- if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
+ if (pageblock_aligned(pfn)) {
set_pageblock_migratetype(page, migratetype);
cond_resched();
}
@@ -6747,10 +6818,18 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
* Please note that MEMINIT_HOTPLUG path doesn't clear memmap
* because this is done early in section_activate()
*/
- if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
+ if (pageblock_aligned(pfn)) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
+
+ /*
+ * ZONE_DEVICE pages are released directly to the driver page allocator
+ * which will set the page count to 1 when allocating the page.
+ */
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+ pgmap->type == MEMORY_DEVICE_COHERENT)
+ set_page_count(page, 0);
}
/*
@@ -6786,13 +6865,11 @@ static void __ref memmap_init_compound(struct page *head,
set_page_count(page, 0);
/*
- * The first tail page stores compound_mapcount_ptr() and
- * compound_order() and the second tail page stores
- * compound_pincount_ptr(). Call prep_compound_head() after
- * the first and second tail pages have been initialized to
- * not have the data overwritten.
+ * The first tail page stores important compound page info.
+ * Call prep_compound_head() after the first tail page has
+ * been initialized, to not have the data overwritten.
*/
- if (pfn == head_pfn + 2)
+ if (pfn == head_pfn + 1)
prep_compound_head(head, order);
}
}
@@ -6810,7 +6887,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
unsigned long start = jiffies;
int nid = pgdat->node_id;
- if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
+ if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE))
return;
/*
@@ -6879,9 +6956,8 @@ static void __init init_unavailable_range(unsigned long spfn,
u64 pgcnt = 0;
for (pfn = spfn; pfn < epfn; pfn++) {
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
- pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
- + pageblock_nr_pages - 1;
+ if (!pfn_valid(pageblock_start_pfn(pfn))) {
+ pfn = pageblock_end_pfn(pfn) - 1;
continue;
}
__init_single_page(pfn_to_page(pfn), pfn, zone, node);
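The pageblock helpers adopted above come from earlier patches in this series; assuming they keep their expected definitions, they are thin wrappers around the old open-coded alignment arithmetic:

	/* Expected expansions (see include/linux/pageblock-flags.h): */
	#define pageblock_align(pfn)		ALIGN((pfn), pageblock_nr_pages)
	#define pageblock_aligned(pfn)		IS_ALIGNED((pfn), pageblock_nr_pages)
	#define pageblock_start_pfn(pfn)	ALIGN_DOWN((pfn), pageblock_nr_pages)
	#define pageblock_end_pfn(pfn)		ALIGN((pfn) + 1, pageblock_nr_pages)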
@@ -6986,7 +7062,7 @@ static int zone_batchsize(struct zone *zone)
* size is striking a balance between allocation latency
* and zone lock contention.
*/
- batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE);
+ batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
batch /= 4; /* We effectively *= 4 below */
if (batch < 1)
batch = 1;
@@ -7171,6 +7247,17 @@ void __meminit setup_zone_pageset(struct zone *zone)
}
/*
+ * The zone indicated has a new number of managed_pages; batch sizes and percpu
+ * page high values need to be recalculated.
+ */
+static void zone_pcp_update(struct zone *zone, int cpu_online)
+{
+ mutex_lock(&pcp_batch_high_lock);
+ zone_set_pageset_high_and_batch(zone, cpu_online);
+ mutex_unlock(&pcp_batch_high_lock);
+}
+
+/*
* Allocate per cpu pagesets and initialize them.
* Before this call only boot pagesets were available.
*/
@@ -7614,6 +7701,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
int i;
pgdat_resize_init(pgdat);
+ pgdat_kswapd_lock_init(pgdat);
pgdat_init_split_queue(pgdat);
pgdat_init_kcompactd(pgdat);
@@ -7908,17 +7996,6 @@ unsigned long __init node_map_pfn_alignment(void)
return ~accl_mask + 1;
}
-/**
- * find_min_pfn_with_active_regions - Find the minimum PFN registered
- *
- * Return: the minimum PFN based on information provided via
- * memblock_set_node().
- */
-unsigned long __init find_min_pfn_with_active_regions(void)
-{
- return PHYS_PFN(memblock_start_of_DRAM());
-}
-
/*
* early_calculate_totalpages()
* Sum pages in active regions for movable zone.
@@ -8211,7 +8288,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
memset(arch_zone_highest_possible_pfn, 0,
sizeof(arch_zone_highest_possible_pfn));
- start_pfn = find_min_pfn_with_active_regions();
+ start_pfn = PHYS_PFN(memblock_start_of_DRAM());
descending = arch_has_descending_max_zone_pfns();
for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -8460,8 +8537,8 @@ void __init mem_init_print_info(void)
#endif
")\n",
K(nr_free_pages()), K(physpages),
- codesize >> 10, datasize >> 10, rosize >> 10,
- (init_data_size + init_code_size) >> 10, bss_size >> 10,
+ codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
+ (init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
K(physpages - totalram_pages() - totalcma_pages),
K(totalcma_pages)
#ifdef CONFIG_HIGHMEM
@@ -8974,7 +9051,7 @@ void *__init alloc_large_system_hash(const char *tablename,
{
unsigned long long max = high_limit;
unsigned long log2qty, size;
- void *table = NULL;
+ void *table;
gfp_t gfp_flags;
bool virt;
bool huge;
@@ -8986,8 +9063,8 @@ void *__init alloc_large_system_hash(const char *tablename,
numentries -= arch_reserved_kernel_pages();
/* It isn't necessary when PAGE_SIZE >= 1MB */
- if (PAGE_SHIFT < 20)
- numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
+ if (PAGE_SIZE < SZ_1M)
+ numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
#if __BITS_PER_LONG > 32
if (!high_limit) {
@@ -9412,17 +9489,6 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages)
EXPORT_SYMBOL(free_contig_range);
/*
- * The zone indicated has a new number of managed_pages; batch sizes and percpu
- * page high values need to be recalculated.
- */
-void zone_pcp_update(struct zone *zone, int cpu_online)
-{
- mutex_lock(&pcp_batch_high_lock);
- zone_set_pageset_high_and_batch(zone, cpu_online);
- mutex_unlock(&pcp_batch_high_lock);
-}
-
-/*
* Effectively disable pcplists for the zone by setting the high limit to 0
* and draining all cpus. A concurrent page freeing on another CPU that's about
* to put the page on pcplist will either finish before the drain and the page
@@ -9454,9 +9520,11 @@ void zone_pcp_reset(struct zone *zone)
drain_zonestat(zone, pzstats);
}
free_percpu(zone->per_cpu_pageset);
- free_percpu(zone->per_cpu_zonestats);
zone->per_cpu_pageset = &boot_pageset;
- zone->per_cpu_zonestats = &boot_zonestats;
+ if (zone->per_cpu_zonestats != &boot_zonestats) {
+ free_percpu(zone->per_cpu_zonestats);
+ zone->per_cpu_zonestats = &boot_zonestats;
+ }
}
}
diff --git a/mm/page_counter.c b/mm/page_counter.c
index eb156ff5d603..db20d6452b71 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -17,24 +17,23 @@ static void propagate_protected_usage(struct page_counter *c,
unsigned long usage)
{
unsigned long protected, old_protected;
- unsigned long low, min;
long delta;
if (!c->parent)
return;
- min = READ_ONCE(c->min);
- if (min || atomic_long_read(&c->min_usage)) {
- protected = min(usage, min);
+ protected = min(usage, READ_ONCE(c->min));
+ old_protected = atomic_long_read(&c->min_usage);
+ if (protected != old_protected) {
old_protected = atomic_long_xchg(&c->min_usage, protected);
delta = protected - old_protected;
if (delta)
atomic_long_add(delta, &c->parent->children_min_usage);
}
- low = READ_ONCE(c->low);
- if (low || atomic_long_read(&c->low_usage)) {
- protected = min(usage, low);
+ protected = min(usage, READ_ONCE(c->low));
+ old_protected = atomic_long_read(&c->low_usage);
+ if (protected != old_protected) {
old_protected = atomic_long_xchg(&c->low_usage, protected);
delta = protected - old_protected;
if (delta)
@@ -193,7 +192,7 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
old = xchg(&counter->max, nr_pages);
- if (page_counter_read(counter) <= usage)
+ if (page_counter_read(counter) <= usage || nr_pages >= old)
return 0;
counter->max = old;
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 3dc715d7ac29..4ee522fd381c 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -9,6 +9,7 @@
#include <linux/page_owner.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
+#include <linux/rcupdate.h>
/*
* struct page extension
@@ -59,6 +60,10 @@
* can utilize this callback to initialize the state of it correctly.
*/
+#ifdef CONFIG_SPARSEMEM
+#define PAGE_EXT_INVALID (0x1)
+#endif
+
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
static bool need_page_idle(void)
{
@@ -84,6 +89,15 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
unsigned long page_ext_size = sizeof(struct page_ext);
static unsigned long total_usage;
+static struct page_ext *lookup_page_ext(const struct page *page);
+
+bool early_page_ext;
+static int __init setup_early_page_ext(char *str)
+{
+ early_page_ext = true;
+ return 0;
+}
+early_param("early_page_ext", setup_early_page_ext);
static bool __init invoke_need_callbacks(void)
{
@@ -125,6 +139,48 @@ static inline struct page_ext *get_entry(void *base, unsigned long index)
return base + page_ext_size * index;
}
+/**
+ * page_ext_get() - Get the extended information for a page.
+ * @page: The page we're interested in.
+ *
+ * Ensures that the page_ext will remain valid until page_ext_put()
+ * is called.
+ *
+ * Return: NULL if no page_ext exists for this page.
+ * Context: Any context. Caller may not sleep until they have called
+ * page_ext_put().
+ */
+struct page_ext *page_ext_get(struct page *page)
+{
+ struct page_ext *page_ext;
+
+ rcu_read_lock();
+ page_ext = lookup_page_ext(page);
+ if (!page_ext) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ return page_ext;
+}
+
+/**
+ * page_ext_put() - Working with page extended information is done.
+ * @page_ext: Page extended information received from page_ext_get().
+ *
+ * The page extended information of the page may not be valid after this
+ * function is called.
+ *
+ * Return: None.
+ * Context: Any context in which the corresponding page_ext_get() was called.
+ */
+void page_ext_put(struct page_ext *page_ext)
+{
+ if (unlikely(!page_ext))
+ return;
+
+ rcu_read_unlock();
+}
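A minimal usage sketch (not part of the patch) of the new pair; because page_ext_get() leaves the RCU read lock held, the caller must not sleep before calling page_ext_put(). The helper name and flag parameter are hypothetical.

	static bool page_has_ext_flag(struct page *page, int flag_nr)
	{
		struct page_ext *page_ext = page_ext_get(page);
		bool ret;

		if (!page_ext)
			return false;	/* no page_ext allocated for this page */

		ret = test_bit(flag_nr, &page_ext->flags);	/* no sleeping here */
		page_ext_put(page_ext);
		return ret;
	}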
#ifndef CONFIG_SPARSEMEM
@@ -133,12 +189,13 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
pgdat->node_page_ext = NULL;
}
-struct page_ext *lookup_page_ext(const struct page *page)
+static struct page_ext *lookup_page_ext(const struct page *page)
{
unsigned long pfn = page_to_pfn(page);
unsigned long index;
struct page_ext *base;
+ WARN_ON_ONCE(!rcu_read_lock_held());
base = NODE_DATA(page_to_nid(page))->node_page_ext;
/*
* The sanity checks the page allocator does upon freeing a
@@ -206,20 +263,27 @@ fail:
}
#else /* CONFIG_SPARSEMEM */
+static bool page_ext_invalid(struct page_ext *page_ext)
+{
+ return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
+}
-struct page_ext *lookup_page_ext(const struct page *page)
+static struct page_ext *lookup_page_ext(const struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
+ struct page_ext *page_ext = READ_ONCE(section->page_ext);
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
*/
- if (!section->page_ext)
+ if (page_ext_invalid(page_ext))
return NULL;
- return get_entry(section->page_ext, pfn);
+ return get_entry(page_ext, pfn);
}
static void *__meminit alloc_page_ext(size_t size, int nid)
@@ -298,9 +362,30 @@ static void __free_page_ext(unsigned long pfn)
ms = __pfn_to_section(pfn);
if (!ms || !ms->page_ext)
return;
- base = get_entry(ms->page_ext, pfn);
+
+ base = READ_ONCE(ms->page_ext);
+ /*
+ * page_ext here can be valid while doing the roll back
+ * operation in online_page_ext().
+ */
+ if (page_ext_invalid(base))
+ base = (void *)base - PAGE_EXT_INVALID;
+ WRITE_ONCE(ms->page_ext, NULL);
+
+ base = get_entry(base, pfn);
free_page_ext(base);
- ms->page_ext = NULL;
+}
+
+static void __invalidate_page_ext(unsigned long pfn)
+{
+ struct mem_section *ms;
+ void *val;
+
+ ms = __pfn_to_section(pfn);
+ if (!ms || !ms->page_ext)
+ return;
+ val = (void *)ms->page_ext + PAGE_EXT_INVALID;
+ WRITE_ONCE(ms->page_ext, val);
}
static int __meminit online_page_ext(unsigned long start_pfn,
@@ -336,13 +421,27 @@ static int __meminit online_page_ext(unsigned long start_pfn,
}
static int __meminit offline_page_ext(unsigned long start_pfn,
- unsigned long nr_pages, int nid)
+ unsigned long nr_pages)
{
unsigned long start, end, pfn;
start = SECTION_ALIGN_DOWN(start_pfn);
end = SECTION_ALIGN_UP(start_pfn + nr_pages);
+ /*
+ * Freeing of page_ext is done in 3 steps to avoid
+ * use-after-free of it:
+ * 1) Traverse all the sections and mark their page_ext
+ * as invalid.
+ * 2) Wait for all the existing users of page_ext who
+ * started before invalidation to finish.
+ * 3) Free the page_ext.
+ */
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+ __invalidate_page_ext(pfn);
+
+ synchronize_rcu();
+
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
__free_page_ext(pfn);
return 0;
@@ -362,11 +461,11 @@ static int __meminit page_ext_callback(struct notifier_block *self,
break;
case MEM_OFFLINE:
offline_page_ext(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
+ mn->nr_pages);
break;
case MEM_CANCEL_ONLINE:
offline_page_ext(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
+ mn->nr_pages);
break;
case MEM_GOING_OFFLINE:
break;
@@ -414,7 +513,7 @@ void __init page_ext_init(void)
cond_resched();
}
}
- hotplug_memory_notifier(page_ext_callback, 0);
+ hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI);
pr_info("allocated %ld bytes of page_ext\n", total_usage);
invoke_init_callbacks();
return;
diff --git a/mm/page_io.c b/mm/page_io.c
index 68318134dc92..3a5f921b932e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -28,7 +28,7 @@
#include <linux/delayacct.h>
#include "swap.h"
-void end_swap_bio_write(struct bio *bio)
+static void end_swap_bio_write(struct bio *bio)
{
struct page *page = bio_first_page_all(bio);
@@ -180,29 +180,30 @@ bad_bmap:
*/
int swap_writepage(struct page *page, struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
int ret = 0;
- if (try_to_free_swap(page)) {
- unlock_page(page);
+ if (folio_free_swap(folio)) {
+ folio_unlock(folio);
goto out;
}
/*
* Arch code may have to preserve more data than just the page
* contents, e.g. memory tags.
*/
- ret = arch_prepare_to_swap(page);
+ ret = arch_prepare_to_swap(&folio->page);
if (ret) {
- set_page_dirty(page);
- unlock_page(page);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
goto out;
}
- if (frontswap_store(page) == 0) {
- set_page_writeback(page);
- unlock_page(page);
- end_page_writeback(page);
+ if (frontswap_store(&folio->page) == 0) {
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+ folio_end_writeback(folio);
goto out;
}
- ret = __swap_writepage(page, wbc, end_swap_bio_write);
+ ret = __swap_writepage(&folio->page, wbc);
out:
return ret;
}
@@ -332,8 +333,7 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
return 0;
}
-int __swap_writepage(struct page *page, struct writeback_control *wbc,
- bio_end_io_t end_write_func)
+int __swap_writepage(struct page *page, struct writeback_control *wbc)
{
struct bio *bio;
int ret;
@@ -358,7 +358,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
GFP_NOIO);
bio->bi_iter.bi_sector = swap_page_sector(page);
- bio->bi_end_io = end_write_func;
+ bio->bi_end_io = end_swap_bio_write;
bio_add_page(bio, page, thp_size(page), 0);
bio_associate_blkg_from_page(bio, page);
@@ -376,7 +376,7 @@ void swap_write_unplug(struct swap_iocb *sio)
struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
int ret;
- iov_iter_bvec(&from, WRITE, sio->bvec, sio->pages, sio->len);
+ iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len);
ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
if (ret != -EIOCBQUEUED)
sio_write_complete(&sio->iocb, ret);
@@ -453,18 +453,21 @@ int swap_readpage(struct page *page, bool synchronous,
struct swap_info_struct *sis = page_swap_info(page);
bool workingset = PageWorkingset(page);
unsigned long pflags;
+ bool in_thrashing;
VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageUptodate(page), page);
/*
- * Count submission time as memory stall. When the device is congested,
- * or the submitting cgroup IO-throttled, submission can be a
- * significant part of overall IO time.
+ * Count submission time as memory stall and delay. When the device
+ * is congested, or the submitting cgroup IO-throttled, submission
+ * can be a significant part of overall IO time.
*/
- if (workingset)
+ if (workingset) {
+ delayacct_thrashing_start(&in_thrashing);
psi_memstall_enter(&pflags);
+ }
delayacct_swapin_start();
if (frontswap_load(page) == 0) {
@@ -513,8 +516,10 @@ int swap_readpage(struct page *page, bool synchronous,
bio_put(bio);
out:
- if (workingset)
+ if (workingset) {
+ delayacct_thrashing_end(&in_thrashing);
psi_memstall_leave(&pflags);
+ }
delayacct_swapin_end();
return ret;
}
@@ -525,7 +530,7 @@ void __swap_read_unplug(struct swap_iocb *sio)
struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
int ret;
- iov_iter_bvec(&from, READ, sio->bvec, sio->pages, sio->len);
+ iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len);
ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
if (ret != -EIOCBQUEUED)
sio_read_complete(&sio->iocb, ret);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 9d73dc38e3d7..47fbc1696466 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -37,8 +37,8 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
struct zone *zone = page_zone(page);
unsigned long pfn;
- VM_BUG_ON(ALIGN_DOWN(start_pfn, pageblock_nr_pages) !=
- ALIGN_DOWN(end_pfn - 1, pageblock_nr_pages));
+ VM_BUG_ON(pageblock_start_pfn(start_pfn) !=
+ pageblock_start_pfn(end_pfn - 1));
if (is_migrate_cma_page(page)) {
/*
@@ -172,7 +172,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
* to avoid redundant checks.
*/
check_unmovable_start = max(page_to_pfn(page), start_pfn);
- check_unmovable_end = min(ALIGN(page_to_pfn(page) + 1, pageblock_nr_pages),
+ check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)),
end_pfn);
unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
@@ -288,6 +288,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* @isolate_before: isolate the pageblock before the boundary_pfn
* @skip_isolation: the flag to skip the pageblock isolation in second
* isolate_single_pageblock()
+ * @migratetype: migrate type to set in error recovery.
*
* Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
* pageblock. When not all pageblocks within a page are isolated at the same
@@ -302,16 +303,16 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* the in-use page then splitting the free page.
*/
static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
- gfp_t gfp_flags, bool isolate_before, bool skip_isolation)
+ gfp_t gfp_flags, bool isolate_before, bool skip_isolation,
+ int migratetype)
{
- unsigned char saved_mt;
unsigned long start_pfn;
unsigned long isolate_pageblock;
unsigned long pfn;
struct zone *zone;
int ret;
- VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
+ VM_BUG_ON(!pageblock_aligned(boundary_pfn));
if (isolate_before)
isolate_pageblock = boundary_pfn - pageblock_nr_pages;
@@ -328,13 +329,13 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
start_pfn = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
zone->zone_start_pfn);
- saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
+ if (skip_isolation) {
+ int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
- if (skip_isolation)
- VM_BUG_ON(!is_migrate_isolate(saved_mt));
- else {
- ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
- isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+ VM_BUG_ON(!is_migrate_isolate(mt));
+ } else {
+ ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype,
+ flags, isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
if (ret)
return ret;
@@ -475,7 +476,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
failed:
/* restore the original migratetype */
if (!skip_isolation)
- unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
+ unset_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype);
return -EBUSY;
}
@@ -531,13 +532,14 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned long pfn;
struct page *page;
/* isolation is done at page block granularity */
- unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
- unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
+ unsigned long isolate_start = pageblock_start_pfn(start_pfn);
+ unsigned long isolate_end = pageblock_align(end_pfn);
int ret;
bool skip_isolation = false;
/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
- ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation);
+ ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false,
+ skip_isolation, migratetype);
if (ret)
return ret;
@@ -545,7 +547,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
skip_isolation = true;
/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
- ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation);
+ ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true,
+ skip_isolation, migratetype);
if (ret) {
unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
return ret;
@@ -576,9 +579,8 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
{
unsigned long pfn;
struct page *page;
- unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
- unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
-
+ unsigned long isolate_start = pageblock_start_pfn(start_pfn);
+ unsigned long isolate_end = pageblock_align(end_pfn);
for (pfn = isolate_start;
pfn < isolate_end;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index e4c6f3f1695b..2d27f532df4c 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -141,7 +141,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
struct page_owner *page_owner;
u64 free_ts_nsec = local_clock();
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
return;
@@ -153,6 +153,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
page_owner->free_ts_nsec = free_ts_nsec;
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
static inline void __set_page_owner_handle(struct page_ext *page_ext,
@@ -183,19 +184,21 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext,
noinline void __set_page_owner(struct page *page, unsigned short order,
gfp_t gfp_mask)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext;
depot_stack_handle_t handle;
+ handle = save_stack(gfp_mask);
+
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
return;
-
- handle = save_stack(gfp_mask);
__set_page_owner_handle(page_ext, handle, order, gfp_mask);
+ page_ext_put(page_ext);
}
void __set_page_owner_migrate_reason(struct page *page, int reason)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext = page_ext_get(page);
struct page_owner *page_owner;
if (unlikely(!page_ext))
@@ -203,12 +206,13 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
page_owner = get_page_owner(page_ext);
page_owner->last_migrate_reason = reason;
+ page_ext_put(page_ext);
}
void __split_page_owner(struct page *page, unsigned int nr)
{
int i;
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext = page_ext_get(page);
struct page_owner *page_owner;
if (unlikely(!page_ext))
@@ -219,17 +223,25 @@ void __split_page_owner(struct page *page, unsigned int nr)
page_owner->order = 0;
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
- struct page_ext *old_ext = lookup_page_ext(&old->page);
- struct page_ext *new_ext = lookup_page_ext(&newfolio->page);
+ struct page_ext *old_ext;
+ struct page_ext *new_ext;
struct page_owner *old_page_owner, *new_page_owner;
- if (unlikely(!old_ext || !new_ext))
+ old_ext = page_ext_get(&old->page);
+ if (unlikely(!old_ext))
return;
+ new_ext = page_ext_get(&newfolio->page);
+ if (unlikely(!new_ext)) {
+ page_ext_put(old_ext);
+ return;
+ }
+
old_page_owner = get_page_owner(old_ext);
new_page_owner = get_page_owner(new_ext);
new_page_owner->order = old_page_owner->order;
@@ -254,6 +266,8 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
*/
__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
__set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
+ page_ext_put(new_ext);
+ page_ext_put(old_ext);
}
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -283,7 +297,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
continue;
}
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(pfn);
block_end_pfn = min(block_end_pfn, end_pfn);
pageblock_mt = get_pageblock_migratetype(page);
@@ -307,12 +321,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
if (PageReserved(page))
continue;
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
continue;
if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
- continue;
+ goto ext_put_continue;
page_owner = get_page_owner(page_ext);
page_mt = gfp_migratetype(page_owner->gfp_mask);
@@ -323,9 +337,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
count[pageblock_mt]++;
pfn = block_end_pfn;
+ page_ext_put(page_ext);
break;
}
pfn += (1UL << page_owner->order) - 1;
+ext_put_continue:
+ page_ext_put(page_ext);
}
}
@@ -435,7 +452,7 @@ err:
void __dump_page_owner(const struct page *page)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext = page_ext_get((void *)page);
struct page_owner *page_owner;
depot_stack_handle_t handle;
gfp_t gfp_mask;
@@ -452,6 +469,7 @@ void __dump_page_owner(const struct page *page)
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
pr_alert("page_owner info is not present (never set?)\n");
+ page_ext_put(page_ext);
return;
}
@@ -482,6 +500,7 @@ void __dump_page_owner(const struct page *page)
if (page_owner->last_migrate_reason != -1)
pr_alert("page has been migrated, last migrate reason: %s\n",
migrate_reason_names[page_owner->last_migrate_reason]);
+ page_ext_put(page_ext);
}
static ssize_t
@@ -497,17 +516,25 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
return -EINVAL;
page = NULL;
- pfn = min_low_pfn + *ppos;
-
+ if (*ppos == 0)
+ pfn = min_low_pfn;
+ else
+ pfn = *ppos;
/* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
pfn++;
- drain_all_pages(NULL);
-
/* Find an allocated page */
for (; pfn < max_pfn; pfn++) {
/*
+ * This temporary copy of the page_owner data is required
+ * so that we avoid context switches (from copy_to_user()
+ * or GFP_KERNEL allocations) while holding the RCU read
+ * lock taken by page_ext_get().
+ */
+ struct page_owner page_owner_tmp;
+
+ /*
* If the new page is in a new MAX_ORDER_NR_PAGES area,
* validate the area as existing, skip it if not
*/
@@ -525,7 +552,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
continue;
}
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
continue;
@@ -534,14 +561,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
* because we don't hold the zone lock.
*/
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
- continue;
+ goto ext_put_continue;
/*
* Although we do have the info about past allocation of free
* pages, it's not relevant for current memory usage.
*/
if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
- continue;
+ goto ext_put_continue;
page_owner = get_page_owner(page_ext);
@@ -550,7 +577,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
* would inflate the stats.
*/
if (!IS_ALIGNED(pfn, 1 << page_owner->order))
- continue;
+ goto ext_put_continue;
/*
* Access to page_ext->handle isn't synchronous so we should
@@ -558,18 +585,37 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
*/
handle = READ_ONCE(page_owner->handle);
if (!handle)
- continue;
+ goto ext_put_continue;
/* Record the next PFN to read in the file offset */
- *ppos = (pfn - min_low_pfn) + 1;
+ *ppos = pfn + 1;
+ page_owner_tmp = *page_owner;
+ page_ext_put(page_ext);
return print_page_owner(buf, count, pfn, page,
- page_owner, handle);
+ &page_owner_tmp, handle);
+ext_put_continue:
+ page_ext_put(page_ext);
}
return 0;
}
+static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig)
+{
+ switch (orig) {
+ case SEEK_SET:
+ file->f_pos = offset;
+ break;
+ case SEEK_CUR:
+ file->f_pos += offset;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return file->f_pos;
+}
+
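Because *ppos now carries the PFN directly, user space can seek straight to a frame of interest. A hypothetical userspace snippet follows (the PFN value is an example, and the path assumes debugfs is mounted in the usual location):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[4096];
		ssize_t n;
		int fd = open("/sys/kernel/debug/page_owner", O_RDONLY);

		if (fd < 0)
			return 1;
		/* SEEK_SET: the file offset is interpreted as the starting PFN */
		if (lseek(fd, 0x80000, SEEK_SET) < 0)
			return 1;
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			fputs(buf, stdout);
		}
		close(fd);
		return 0;
	}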
static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
{
unsigned long pfn = zone->zone_start_pfn;
@@ -589,7 +635,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
continue;
}
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(pfn);
block_end_pfn = min(block_end_pfn, end_pfn);
for (; pfn < block_end_pfn; pfn++) {
@@ -617,18 +663,20 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
if (PageReserved(page))
continue;
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
continue;
/* Maybe overlapping zone */
if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
- continue;
+ goto ext_put_continue;
/* Found early allocated page */
__set_page_owner_handle(page_ext, early_handle,
0, 0);
count++;
+ext_put_continue:
+ page_ext_put(page_ext);
}
cond_resched();
}
@@ -660,6 +708,7 @@ static void init_early_allocated_pages(void)
static const struct file_operations proc_page_owner_operations = {
.read = read_page_owner,
+ .llseek = lseek_page_owner,
};
static int __init pageowner_init(void)
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 382958eef8a9..79a8554f024c 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -11,10 +11,42 @@
#include "page_reporting.h"
#include "internal.h"
-unsigned int page_reporting_order = MAX_ORDER;
-module_param(page_reporting_order, uint, 0644);
+/* Initialize to an unsupported value */
+unsigned int page_reporting_order = -1;
+
+static int page_order_update_notify(const char *val, const struct kernel_param *kp)
+{
+ /*
+ * If param is set beyond this limit, order is set to default
+ * pageblock_order value
+ */
+ return param_set_uint_minmax(val, kp, 0, MAX_ORDER-1);
+}
+
+static const struct kernel_param_ops page_reporting_param_ops = {
+ .set = &page_order_update_notify,
+ /*
+ * For the get op, use param_get_int instead of param_get_uint.
+ * This is to make sure that when unset the initialized value of
+ * -1 is shown correctly
+ */
+ .get = &param_get_int,
+};
+
+module_param_cb(page_reporting_order, &page_reporting_param_ops,
+ &page_reporting_order, 0644);
MODULE_PARM_DESC(page_reporting_order, "Set page reporting order");
+/*
+ * This symbol is also a kernel parameter. Export the page_reporting_order
+ * symbol so that other drivers can access it to control order values without
+ * having to introduce another configurable parameter. Only one driver can
+ * register with the page_reporting driver for the service, so we have just
+ * one control parameter for the use case (which can be accessed in both
+ * drivers).
+ */
+EXPORT_SYMBOL_GPL(page_reporting_order);
+
#define PAGE_REPORTING_DELAY (2 * HZ)
static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
@@ -330,10 +362,18 @@ int page_reporting_register(struct page_reporting_dev_info *prdev)
}
/*
- * Update the page reporting order if it's specified by driver.
- * Otherwise, it falls back to @pageblock_order.
+ * If the page_reporting_order value is not set, we check if
+ * an order is provided from the driver that is performing the
+ * registration. If that is not provided either, we default to
+ * pageblock_order.
*/
- page_reporting_order = prdev->order ? : pageblock_order;
+
+ if (page_reporting_order == -1) {
+ if (prdev->order > 0 && prdev->order <= MAX_ORDER)
+ page_reporting_order = prdev->order;
+ else
+ page_reporting_order = pageblock_order;
+ }
/* initialize state and work structures */
atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index e2062748791a..93e633c1d587 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -4,6 +4,7 @@
* Copyright (c) 2021, Google LLC.
* Pasha Tatashin <pasha.tatashin@soleen.com>
*/
+#include <linux/kstrtox.h>
#include <linux/mm.h>
#include <linux/page_table_check.h>
@@ -23,7 +24,7 @@ EXPORT_SYMBOL(page_table_check_disabled);
static int __init early_page_table_check_param(char *buf)
{
- return strtobool(buf, &__page_table_check_enabled);
+ return kstrtobool(buf, &__page_table_check_enabled);
}
early_param("page_table_check", early_page_table_check_param);
@@ -53,7 +54,7 @@ static struct page_table_check *get_page_table_check(struct page_ext *page_ext)
}
/*
- * An enty is removed from the page table, decrement the counters for that page
+ * An entry is removed from the page table, decrement the counters for that page
* verify that it is of correct type and counters do not become negative.
*/
static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
@@ -68,7 +69,7 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
return;
page = pfn_to_page(pfn);
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
anon = PageAnon(page);
for (i = 0; i < pgcnt; i++) {
@@ -83,10 +84,11 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
}
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
/*
- * A new enty is added to the page table, increment the counters for that page
+ * A new entry is added to the page table, increment the counters for that page
* verify that it is of correct type and is not being mapped with a different
* type to a different process.
*/
@@ -103,7 +105,7 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
return;
page = pfn_to_page(pfn);
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
anon = PageAnon(page);
for (i = 0; i < pgcnt; i++) {
@@ -118,6 +120,7 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
}
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
/*
@@ -126,9 +129,10 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
*/
void __page_table_check_zero(struct page *page, unsigned int order)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext;
unsigned long i;
+ page_ext = page_ext_get(page);
BUG_ON(!page_ext);
for (i = 0; i < (1ul << order); i++) {
struct page_table_check *ptc = get_page_table_check(page_ext);
@@ -137,6 +141,7 @@ void __page_table_check_zero(struct page *page, unsigned int order)
BUG_ON(atomic_read(&ptc->file_map_count));
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 8e9e574d535a..93e13fc17d3c 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -86,7 +86,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
!is_device_exclusive_entry(entry))
return false;
- pfn = swp_offset(entry);
+ pfn = swp_offset_pfn(entry);
} else if (is_swap_pte(*pvmw->pte)) {
swp_entry_t entry;
@@ -96,7 +96,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
!is_device_exclusive_entry(entry))
return false;
- pfn = swp_offset(entry);
+ pfn = swp_offset_pfn(entry);
} else {
if (!pte_present(*pvmw->pte))
return false;
@@ -221,7 +221,7 @@ restart:
return not_found(pvmw);
entry = pmd_to_swp_entry(pmde);
if (!is_migration_entry(entry) ||
- !check_pmd(swp_offset(entry), pvmw))
+ !check_pmd(swp_offset_pfn(entry), pvmw))
return not_found(pvmw);
return true;
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index fa7a3d21a751..7f1c9b274906 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -460,7 +460,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
} else { /* inside vma */
walk.vma = vma;
next = min(end, vma->vm_end);
- vma = vma->vm_next;
+ vma = find_vma(mm, vma->vm_end);
err = walk_page_test(start, next, &walk);
if (err > 0) {
@@ -482,7 +482,15 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
return err;
}
-/*
+/**
+ * walk_page_range_novma - walk a range of pagetables not backed by a vma
+ * @mm: mm_struct representing the target process of page table walk
+ * @start: start address of the virtual address range
+ * @end: end address of the virtual address range
+ * @ops: operation to call during the walk
+ * @pgd: pgd to walk if different from mm->pgd
+ * @private: private data for callbacks' usage
+ *
* Similar to walk_page_range() but can walk any page tables even if they are
* not backed by VMAs. Because 'unusual' entries may be walked this function
* will also not lock the PTEs for the pte_entry() callback. This is useful for
@@ -509,6 +517,26 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
return walk_pgd_range(start, end, &walk);
}
+int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private)
+{
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = vma->vm_mm,
+ .vma = vma,
+ .private = private,
+ };
+
+ if (start >= end || !walk.mm)
+ return -EINVAL;
+ if (start < vma->vm_start || end > vma->vm_end)
+ return -EINVAL;
+
+ mmap_assert_locked(walk.mm);
+ return __walk_page_range(start, end, &walk);
+}
+
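A minimal sketch (not from the patch) of how a caller might use the new walk_page_range_vma(); the callback and helper names are hypothetical.

	static int count_present_pte(pte_t *pte, unsigned long addr,
				     unsigned long next, struct mm_walk *walk)
	{
		unsigned long *nr = walk->private;

		if (pte_present(*pte))
			(*nr)++;
		return 0;
	}

	static const struct mm_walk_ops count_present_ops = {
		.pte_entry = count_present_pte,
	};

	/* Caller holds mmap_lock; [start, end) must lie within @vma. */
	static unsigned long count_present_in_vma(struct vm_area_struct *vma,
						  unsigned long start,
						  unsigned long end)
	{
		unsigned long nr = 0;

		walk_page_range_vma(vma, start, end, &count_present_ops, &nr);
		return nr;
	}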
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
void *private)
{
@@ -518,18 +546,11 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
.vma = vma,
.private = private,
};
- int err;
if (!walk.mm)
return -EINVAL;
mmap_assert_locked(walk.mm);
-
- err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
- if (err > 0)
- return 0;
- if (err < 0)
- return err;
return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 27697b2429c2..acd78da0493b 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -72,7 +72,6 @@
#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/err.h>
-#include <linux/lcm.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
@@ -174,9 +173,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext
struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
-/* chunks which need their map areas extended, protected by pcpu_lock */
-static LIST_HEAD(pcpu_map_extend_chunks);
-
/*
* The number of empty populated pages, protected by pcpu_lock.
* The reserved chunk doesn't contribute to the count.
@@ -834,13 +830,15 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
/*
* Update s_block.
- * block->first_free must be updated if the allocation takes its place.
- * If the allocation breaks the contig_hint, a scan is required to
- * restore this hint.
*/
if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
nr_empty_pages++;
+ /*
+ * block->first_free must be updated if the allocation takes its place.
+ * If the allocation breaks the contig_hint, a scan is required to
+ * restore this hint.
+ */
if (s_off == s_block->first_free)
s_block->first_free = find_next_zero_bit(
pcpu_index_alloc_map(chunk, s_index),
@@ -915,6 +913,12 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
}
}
+ /*
+ * If the allocation is not atomic, some blocks may not be
+ * populated with pages, while we account it here. The number
+ * of pages will be added back with pcpu_chunk_populated()
+ * when populating pages.
+ */
if (nr_empty_pages)
pcpu_update_empty_pages(chunk, -nr_empty_pages);
@@ -1342,7 +1346,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
int map_size)
{
struct pcpu_chunk *chunk;
- unsigned long aligned_addr, lcm_align;
+ unsigned long aligned_addr;
int start_offset, offset_bits, region_size, region_bits;
size_t alloc_size;
@@ -1350,14 +1354,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
aligned_addr = tmp_addr & PAGE_MASK;
start_offset = tmp_addr - aligned_addr;
-
- /*
- * Align the end of the region with the LCM of PAGE_SIZE and
- * PCPU_BITMAP_BLOCK_SIZE. One of these constants is a multiple of
- * the other.
- */
- lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
- region_size = ALIGN(start_offset + map_size, lcm_align);
+ region_size = ALIGN(start_offset + map_size, PAGE_SIZE);
/* allocate chunk */
alloc_size = struct_size(chunk, populated,
@@ -1820,16 +1817,12 @@ restart:
spin_unlock_irqrestore(&pcpu_lock, flags);
- /*
- * No space left. Create a new chunk. We don't want multiple
- * tasks to create chunks simultaneously. Serialize and create iff
- * there's still no empty chunk after grabbing the mutex.
- */
if (is_atomic) {
err = "atomic alloc failed, no space left";
goto fail;
}
+ /* No space left. Create a new chunk. */
if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
chunk = pcpu_create_chunk(pcpu_gfp);
if (!chunk) {
@@ -2146,9 +2139,9 @@ static void pcpu_reclaim_populated(void)
* other accessor is the free path which only returns area back to the
* allocator not touching the populated bitmap.
*/
- while (!list_empty(&pcpu_chunk_lists[pcpu_to_depopulate_slot])) {
- chunk = list_first_entry(&pcpu_chunk_lists[pcpu_to_depopulate_slot],
- struct pcpu_chunk, list);
+ while ((chunk = list_first_entry_or_null(
+ &pcpu_chunk_lists[pcpu_to_depopulate_slot],
+ struct pcpu_chunk, list))) {
WARN_ON(chunk->immutable);
/*
@@ -2166,7 +2159,7 @@ static void pcpu_reclaim_populated(void)
/* reintegrate chunk to prevent atomic alloc failures */
if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
reintegrate = true;
- goto end_chunk;
+ break;
}
/*
@@ -2202,7 +2195,6 @@ static void pcpu_reclaim_populated(void)
end = -1;
}
-end_chunk:
/* batch tlb flush per chunk to amortize cost */
if (freed_page_start < freed_page_end) {
spin_unlock_irq(&pcpu_lock);
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 4bcc11958089..78dfaf9e8990 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -263,7 +263,7 @@ static ssize_t process_vm_rw(pid_t pid,
struct iovec *iov_r;
struct iov_iter iter;
ssize_t rc;
- int dir = vm_write ? WRITE : READ;
+ int dir = vm_write ? ITER_SOURCE : ITER_DEST;
if (flags != 0)
return -EINVAL;
diff --git a/mm/readahead.c b/mm/readahead.c
index fdcd28cbd92d..b10f0cf81d80 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -122,6 +122,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>
+#include <linux/psi.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
@@ -152,6 +153,8 @@ static void read_pages(struct readahead_control *rac)
if (!readahead_count(rac))
return;
+ if (unlikely(rac->_workingset))
+ psi_memstall_enter(&rac->_pflags);
blk_start_plug(&plug);
if (aops->readahead) {
@@ -179,6 +182,9 @@ static void read_pages(struct readahead_control *rac)
}
blk_finish_plug(&plug);
+ if (unlikely(rac->_workingset))
+ psi_memstall_leave(&rac->_pflags);
+ rac->_workingset = false;
BUG_ON(readahead_count(rac));
}
@@ -252,6 +258,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
}
if (i == nr_to_read - lookahead_size)
folio_set_readahead(folio);
+ ractl->_workingset |= folio_test_workingset(folio);
ractl->_nr_pages++;
}
@@ -480,11 +487,14 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
if (index == mark)
folio_set_readahead(folio);
err = filemap_add_folio(ractl->mapping, folio, index, gfp);
- if (err)
+ if (err) {
folio_put(folio);
- else
- ractl->_nr_pages += 1UL << order;
- return err;
+ return err;
+ }
+
+ ractl->_nr_pages += 1UL << order;
+ ractl->_workingset |= folio_test_workingset(folio);
+ return 0;
}
void page_cache_ra_order(struct readahead_control *ractl,
@@ -826,6 +836,10 @@ void readahead_expand(struct readahead_control *ractl,
put_page(page);
return;
}
+ if (unlikely(PageWorkingset(page)) && !ractl->_workingset) {
+ ractl->_workingset = true;
+ psi_memstall_enter(&ractl->_pflags);
+ }
ractl->_nr_pages++;
if (ra) {
ra->size++;
diff --git a/mm/rmap.c b/mm/rmap.c
index 93d5a6f793d2..b616870a09be 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,10 +23,9 @@
* inode->i_rwsem (while writing or truncating, not reading or faulting)
* mm->mmap_lock
* mapping->invalidate_lock (in filemap_fault)
- * page->flags PG_locked (lock_page) * (see hugetlbfs below)
- * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ * page->flags PG_locked (lock_page)
+ * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
* mapping->i_mmap_rwsem
- * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
* swap_lock (in swap_duplicate, swap_info_get)
@@ -46,10 +45,11 @@
* ->tasklist_lock
* pte map lock
*
- * * hugetlbfs PageHuge() pages take locks in this order:
- * mapping->i_mmap_rwsem
- * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
- * page->flags PG_locked (lock_page)
+ * hugetlbfs PageHuge() pages take locks in this order:
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ * vma_lock (hugetlb specific lock for pmd_sharing)
+ * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
+ * page->flags PG_locked (lock_page)
*/
#include <linux/mm.h>
@@ -315,8 +315,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
enomem_failure:
/*
- * dst->anon_vma is dropped here otherwise its degree can be incorrectly
- * decremented in unlink_anon_vmas().
+ * dst->anon_vma is dropped here otherwise its num_active_vmas can
+ * be incorrectly decremented in unlink_anon_vmas().
* We can safely do this because callers of anon_vma_clone() don't care
* about dst->anon_vma if anon_vma_clone() failed.
*/
@@ -489,16 +489,16 @@ void __init anon_vma_init(void)
* if there is a mapcount, we can dereference the anon_vma after observing
* those.
*/
-struct anon_vma *page_get_anon_vma(struct page *page)
+struct anon_vma *folio_get_anon_vma(struct folio *folio)
{
struct anon_vma *anon_vma = NULL;
unsigned long anon_mapping;
rcu_read_lock();
- anon_mapping = (unsigned long)READ_ONCE(page->mapping);
+ anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
goto out;
- if (!page_mapped(page))
+ if (!folio_mapped(folio))
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
@@ -508,13 +508,13 @@ struct anon_vma *page_get_anon_vma(struct page *page)
}
/*
- * If this page is still mapped, then its anon_vma cannot have been
+ * If this folio is still mapped, then its anon_vma cannot have been
* freed. But if it has been unmapped, we have no security against the
* anon_vma structure being freed and reused (for another anon_vma:
* SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
* above cannot corrupt).
*/
- if (!page_mapped(page)) {
+ if (!folio_mapped(folio)) {
rcu_read_unlock();
put_anon_vma(anon_vma);
return NULL;
@@ -526,11 +526,11 @@ out:
}
/*
- * Similar to page_get_anon_vma() except it locks the anon_vma.
+ * Similar to folio_get_anon_vma() except it locks the anon_vma.
*
* Its a little more complex as it tries to keep the fast path to a single
* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
- * reference like with page_get_anon_vma() and then block on the mutex
+ * reference like with folio_get_anon_vma() and then block on the mutex
* on !rwc->try_lock case.
*/
struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
@@ -602,11 +602,6 @@ out:
return anon_vma;
}
-void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
-{
- anon_vma_unlock_read(anon_vma);
-}
-
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
* Flush TLB entries for recently unmapped pages from remote CPUs. It is
@@ -770,13 +765,17 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
return vma_address(page, vma);
}
+/*
+ * Returns the actual pmd_t* where we expect 'address' to be mapped from, or
+ * NULL if it doesn't exist. No guarantees / checks on what the pmd_t*
+ * represents.
+ */
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd = NULL;
- pmd_t pmde;
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -791,15 +790,6 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
goto out;
pmd = pmd_offset(pud, address);
- /*
- * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
- * without holding anon_vma lock for write. So when looking for a
- * genuine pmde (in which to find pte), test present and !THP together.
- */
- pmde = *pmd;
- barrier();
- if (!pmd_present(pmde) || pmd_trans_huge(pmde))
- pmd = NULL;
out:
return pmd;
}
@@ -833,6 +823,12 @@ static bool folio_referenced_one(struct folio *folio,
}
if (pvmw.pte) {
+ if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
+ !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
+ lru_gen_look_around(&pvmw);
+ referenced++;
+ }
+
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
/*
@@ -1089,6 +1085,29 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
return page_vma_mkclean_one(&pvmw);
}
+int total_compound_mapcount(struct page *head)
+{
+ int mapcount = head_compound_mapcount(head);
+ int nr_subpages;
+ int i;
+
+ /* In the common case, avoid the loop when no subpages mapped by PTE */
+ if (head_subpages_mapcount(head) == 0)
+ return mapcount;
+ /*
+ * Add all the PTE mappings of those subpages mapped by PTE.
+ * The loop could be limited to the first subpages_mapcount mapped
+ * subpages, but given how racy these counts are, it is not clear
+ * that stopping early would be worthwhile.
+ */
+ nr_subpages = thp_nr_pages(head);
+ for (i = 0; i < nr_subpages; i++)
+ mapcount += atomic_read(&head[i]._mapcount);
+
+ /* But each of those _mapcounts was based on -1 */
+ mapcount += nr_subpages;
+ return mapcount;
+}
+
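total_compound_mapcount() depends on the subpages_mapcount encoding used throughout this series: COMPOUND_MAPPED is added when the compound page gains its first PMD mapping (and subtracted when it loses the last one), while the low bits (the SUBPAGES_MAPPED mask) count how many subpages are individually mapped by PTE. A rough sketch of that decoding, with hypothetical helper names and assuming COMPOUND_MAPPED is defined elsewhere in the series as a value far above any possible subpage count:

/* Illustrative only: decode the subpages_mapcount field */
static inline bool head_is_pmd_mapped(struct page *head)
{
	return atomic_read(subpages_mapcount_ptr(head)) >= COMPOUND_MAPPED;
}

static inline int head_nr_pte_mapped(struct page *head)
{
	return atomic_read(subpages_mapcount_ptr(head)) & SUBPAGES_MAPPED;
}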
/**
* page_move_anon_rmap - move a page to our anon_vma
* @page: the page to move to our anon_vma
@@ -1101,22 +1120,20 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
*/
void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
{
- struct anon_vma *anon_vma = vma->anon_vma;
- struct page *subpage = page;
-
- page = compound_head(page);
+ void *anon_vma = vma->anon_vma;
+ struct folio *folio = page_folio(page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_VMA(!anon_vma, vma);
- anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ anon_vma += PAGE_MAPPING_ANON;
/*
* Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
* simultaneously, so a concurrent reader (eg folio_referenced()'s
* folio_test_anon()) will not see one without the other.
*/
- WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
- SetPageAnonExclusive(subpage);
+ WRITE_ONCE(folio->mapping, anon_vma);
+ SetPageAnonExclusive(page);
}
/**
@@ -1200,38 +1217,50 @@ static void __page_check_anon_rmap(struct page *page,
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address, rmap_t flags)
{
+ atomic_t *mapped;
+ int nr = 0, nr_pmdmapped = 0;
bool compound = flags & RMAP_COMPOUND;
- bool first;
+ bool first = true;
if (unlikely(PageKsm(page)))
lock_page_memcg(page);
- else
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (compound) {
- atomic_t *mapcount;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- mapcount = compound_mapcount_ptr(page);
- first = atomic_inc_and_test(mapcount);
- } else {
+ /* Is page being mapped by PTE? Is this its first map to be added? */
+ if (likely(!compound)) {
first = atomic_inc_and_test(&page->_mapcount);
+ nr = first;
+ if (first && PageCompound(page)) {
+ mapped = subpages_mapcount_ptr(compound_head(page));
+ nr = atomic_inc_return_relaxed(mapped);
+ nr = (nr < COMPOUND_MAPPED);
+ }
+ } else if (PageTransHuge(page)) {
+ /* That test is redundant: it's for safety or to optimize out */
+
+ first = atomic_inc_and_test(compound_mapcount_ptr(page));
+ if (first) {
+ mapped = subpages_mapcount_ptr(page);
+ nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped);
+ if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) {
+ nr_pmdmapped = thp_nr_pages(page);
+ nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED);
+ /* Raced ahead of a remove and another add? */
+ if (unlikely(nr < 0))
+ nr = 0;
+ } else {
+ /* Raced ahead of a remove of COMPOUND_MAPPED */
+ nr = 0;
+ }
+ }
}
+
VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
- if (first) {
- int nr = compound ? thp_nr_pages(page) : 1;
- /*
- * We use the irq-unsafe __{inc|mod}_zone_page_stat because
- * these counters are not modified in interrupt context, and
- * pte lock(a spinlock) is held, which implies preemption
- * disabled.
- */
- if (compound)
- __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
+ if (nr_pmdmapped)
+ __mod_lruvec_page_state(page, NR_ANON_THPS, nr_pmdmapped);
+ if (nr)
__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
- }
if (unlikely(PageKsm(page)))
unlock_page_memcg(page);
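A worked example of the accounting above, with assumed numbers (take COMPOUND_MAPPED as 0x800000 and a 512-page PMD-sized THP): if three subpages are already PTE-mapped, subpages_mapcount holds 3 and NR_ANON_MAPPED already counts 3 of the folio's pages. Adding the first PMD mapping makes atomic_add_return_relaxed() return 0x800003, below COMPOUND_MAPPED + COMPOUND_MAPPED, so nr_pmdmapped = 512 and nr = 512 - 3 = 509: exactly the pages not yet counted. PTE-mapping a fourth subpage afterwards makes atomic_inc_return_relaxed() return 0x800004, which fails the nr < COMPOUND_MAPPED test, so nr = 0 and nothing is double-counted.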
@@ -1262,22 +1291,24 @@ void page_add_anon_rmap(struct page *page,
void page_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
- const bool compound = PageCompound(page);
- int nr = compound ? thp_nr_pages(page) : 1;
+ int nr;
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
__SetPageSwapBacked(page);
- if (compound) {
+
+ if (likely(!PageCompound(page))) {
+ /* increment count (starts at -1) */
+ atomic_set(&page->_mapcount, 0);
+ nr = 1;
+ } else {
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/* increment count (starts at -1) */
atomic_set(compound_mapcount_ptr(page), 0);
- atomic_set(compound_pincount_ptr(page), 0);
-
+ atomic_set(subpages_mapcount_ptr(page), COMPOUND_MAPPED);
+ nr = thp_nr_pages(page);
__mod_lruvec_page_state(page, NR_ANON_THPS, nr);
- } else {
- /* increment count (starts at -1) */
- atomic_set(&page->_mapcount, 0);
}
+
__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
__page_set_anon_rmap(page, vma, address, 1);
}
@@ -1293,45 +1324,45 @@ void page_add_new_anon_rmap(struct page *page,
void page_add_file_rmap(struct page *page,
struct vm_area_struct *vma, bool compound)
{
- int i, nr = 0;
+ atomic_t *mapped;
+ int nr = 0, nr_pmdmapped = 0;
+ bool first;
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
lock_page_memcg(page);
- if (compound && PageTransHuge(page)) {
- int nr_pages = thp_nr_pages(page);
- for (i = 0; i < nr_pages; i++) {
- if (atomic_inc_and_test(&page[i]._mapcount))
- nr++;
+ /* Is page being mapped by PTE? Is this its first map to be added? */
+ if (likely(!compound)) {
+ first = atomic_inc_and_test(&page->_mapcount);
+ nr = first;
+ if (first && PageCompound(page)) {
+ mapped = subpages_mapcount_ptr(compound_head(page));
+ nr = atomic_inc_return_relaxed(mapped);
+ nr = (nr < COMPOUND_MAPPED);
}
- if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
- goto out;
-
- /*
- * It is racy to ClearPageDoubleMap in page_remove_file_rmap();
- * but page lock is held by all page_add_file_rmap() compound
- * callers, and SetPageDoubleMap below warns if !PageLocked:
- * so here is a place that DoubleMap can be safely cleared.
- */
- VM_WARN_ON_ONCE(!PageLocked(page));
- if (nr == nr_pages && PageDoubleMap(page))
- ClearPageDoubleMap(page);
-
- if (PageSwapBacked(page))
- __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
- nr_pages);
- else
- __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
- nr_pages);
- } else {
- if (PageTransCompound(page) && page_mapping(page)) {
- VM_WARN_ON_ONCE(!PageLocked(page));
- SetPageDoubleMap(compound_head(page));
+ } else if (PageTransHuge(page)) {
+ /* That test is redundant: it's for safety or to optimize out */
+
+ first = atomic_inc_and_test(compound_mapcount_ptr(page));
+ if (first) {
+ mapped = subpages_mapcount_ptr(page);
+ nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped);
+ if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) {
+ nr_pmdmapped = thp_nr_pages(page);
+ nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED);
+ /* Raced ahead of a remove and another add? */
+ if (unlikely(nr < 0))
+ nr = 0;
+ } else {
+ /* Raced ahead of a remove of COMPOUND_MAPPED */
+ nr = 0;
+ }
}
- if (atomic_inc_and_test(&page->_mapcount))
- nr++;
}
-out:
+
+ if (nr_pmdmapped)
+ __mod_lruvec_page_state(page, PageSwapBacked(page) ?
+ NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped);
if (nr)
__mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
unlock_page_memcg(page);
@@ -1339,132 +1370,87 @@ out:
mlock_vma_page(page, vma, compound);
}
-static void page_remove_file_rmap(struct page *page, bool compound)
+/**
+ * page_remove_rmap - take down pte mapping from a page
+ * @page: page to remove mapping from
+ * @vma: the vm area from which the mapping is removed
+ * @compound: uncharge the page as compound or small page
+ *
+ * The caller needs to hold the pte lock.
+ */
+void page_remove_rmap(struct page *page,
+ struct vm_area_struct *vma, bool compound)
{
- int i, nr = 0;
+ atomic_t *mapped;
+ int nr = 0, nr_pmdmapped = 0;
+ bool last;
VM_BUG_ON_PAGE(compound && !PageHead(page), page);
- /* Hugepages are not counted in NR_FILE_MAPPED for now. */
+ /* Hugetlb pages are not counted in NR_*MAPPED */
if (unlikely(PageHuge(page))) {
/* hugetlb pages are always mapped with pmds */
atomic_dec(compound_mapcount_ptr(page));
return;
}
- /* page still mapped by someone else? */
- if (compound && PageTransHuge(page)) {
- int nr_pages = thp_nr_pages(page);
+ lock_page_memcg(page);
- for (i = 0; i < nr_pages; i++) {
- if (atomic_add_negative(-1, &page[i]._mapcount))
- nr++;
+ /* Is page being unmapped by PTE? Is this its last map to be removed? */
+ if (likely(!compound)) {
+ last = atomic_add_negative(-1, &page->_mapcount);
+ nr = last;
+ if (last && PageCompound(page)) {
+ mapped = subpages_mapcount_ptr(compound_head(page));
+ nr = atomic_dec_return_relaxed(mapped);
+ nr = (nr < COMPOUND_MAPPED);
}
- if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
- goto out;
- if (PageSwapBacked(page))
- __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
- -nr_pages);
- else
- __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
- -nr_pages);
- } else {
- if (atomic_add_negative(-1, &page->_mapcount))
- nr++;
- }
-out:
- if (nr)
- __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
-}
-
-static void page_remove_anon_compound_rmap(struct page *page)
-{
- int i, nr;
-
- if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
- return;
-
- /* Hugepages are not counted in NR_ANON_PAGES for now. */
- if (unlikely(PageHuge(page)))
- return;
-
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
- return;
-
- __mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page));
-
- if (TestClearPageDoubleMap(page)) {
- /*
- * Subpages can be mapped with PTEs too. Check how many of
- * them are still mapped.
- */
- for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
- if (atomic_add_negative(-1, &page[i]._mapcount))
- nr++;
+ } else if (PageTransHuge(page)) {
+ /* That test is redundant: it's for safety or to optimize out */
+
+ last = atomic_add_negative(-1, compound_mapcount_ptr(page));
+ if (last) {
+ mapped = subpages_mapcount_ptr(page);
+ nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped);
+ if (likely(nr < COMPOUND_MAPPED)) {
+ nr_pmdmapped = thp_nr_pages(page);
+ nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED);
+ /* Raced ahead of another remove and an add? */
+ if (unlikely(nr < 0))
+ nr = 0;
+ } else {
+ /* An add of COMPOUND_MAPPED raced ahead */
+ nr = 0;
+ }
}
+ }
+ if (nr_pmdmapped) {
+ __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_THPS :
+ (PageSwapBacked(page) ? NR_SHMEM_PMDMAPPED :
+ NR_FILE_PMDMAPPED), -nr_pmdmapped);
+ }
+ if (nr) {
+ __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_MAPPED :
+ NR_FILE_MAPPED, -nr);
/*
- * Queue the page for deferred split if at least one small
+ * Queue anon THP for deferred split if at least one small
* page of the compound page is unmapped, but at least one
* small page is still mapped.
*/
- if (nr && nr < thp_nr_pages(page))
- deferred_split_huge_page(page);
- } else {
- nr = thp_nr_pages(page);
- }
-
- if (nr)
- __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
-}
-
-/**
- * page_remove_rmap - take down pte mapping from a page
- * @page: page to remove mapping from
- * @vma: the vm area from which the mapping is removed
- * @compound: uncharge the page as compound or small page
- *
- * The caller needs to hold the pte lock.
- */
-void page_remove_rmap(struct page *page,
- struct vm_area_struct *vma, bool compound)
-{
- lock_page_memcg(page);
-
- if (!PageAnon(page)) {
- page_remove_file_rmap(page, compound);
- goto out;
+ if (PageTransCompound(page) && PageAnon(page))
+ if (!compound || nr < nr_pmdmapped)
+ deferred_split_huge_page(compound_head(page));
}
- if (compound) {
- page_remove_anon_compound_rmap(page);
- goto out;
- }
-
- /* page still mapped by someone else? */
- if (!atomic_add_negative(-1, &page->_mapcount))
- goto out;
-
- /*
- * We use the irq-unsafe __{inc|mod}_zone_page_stat because
- * these counters are not modified in interrupt context, and
- * pte lock(a spinlock) is held, which implies preemption disabled.
- */
- __dec_lruvec_page_state(page, NR_ANON_MAPPED);
-
- if (PageTransCompound(page))
- deferred_split_huge_page(compound_head(page));
-
/*
- * It would be tidy to reset the PageAnon mapping here,
+ * It would be tidy to reset PageAnon mapping when fully unmapped,
* but that might overwrite a racing page_add_anon_rmap
* which increments mapcount after us but sets mapping
- * before us: so leave the reset to free_unref_page,
+ * before us: so leave the reset to free_pages_prepare,
* and remember that it's only reliable while mapped.
- * Leaving it set also helps swapoff to reinstate ptes
- * faster for those pages still in swapcache.
*/
-out:
+
unlock_page_memcg(page);
munlock_vma_page(page, vma, compound);
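The deferred-split condition above fires exactly when an anon THP becomes partially mapped. For example (numbers assumed, 512-page THP): removing its last PMD mapping while two subpages remain PTE-mapped gives nr = 512 - 2 = 510, which is non-zero and below nr_pmdmapped = 512, so the folio is queued for deferred split; removing that same PMD mapping with no remaining PTE mappings gives nr = 512, the condition is false, and the fully unmapped THP is simply left for normal freeing.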
@@ -1560,33 +1546,45 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* To call huge_pmd_unshare, i_mmap_rwsem must be
* held in write mode. Caller needs to explicitly
* do this outside rmap routines.
+ *
+ * We also must hold hugetlb vma_lock in write mode.
+ * Lock order dictates acquiring vma_lock BEFORE
+ * i_mmap_rwsem. We can only try lock here and fail
+ * if unsuccessful.
*/
- VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
- if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
- flush_tlb_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range(mm, range.start,
- range.end);
-
- /*
- * The ref count of the PMD page was dropped
- * which is part of the way map counting
- * is done for shared PMDs. Return 'true'
- * here. When there is no other sharing,
- * huge_pmd_unshare returns false and we will
- * unmap the actual page and drop map count
- * to zero.
- */
- page_vma_mapped_walk_done(&pvmw);
- break;
+ if (!anon) {
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+ if (!hugetlb_vma_trylock_write(vma)) {
+ page_vma_mapped_walk_done(&pvmw);
+ ret = false;
+ break;
+ }
+ if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+ hugetlb_vma_unlock_write(vma);
+ flush_tlb_range(vma,
+ range.start, range.end);
+ mmu_notifier_invalidate_range(mm,
+ range.start, range.end);
+ /*
+ * The ref count of the PMD page was
+ * dropped which is part of the way map
+ * counting is done for shared PMDs.
+ * Return 'true' here. When there is
+ * no other sharing, huge_pmd_unshare
+ * returns false and we will unmap the
+ * actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ hugetlb_vma_unlock_write(vma);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
} else {
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
- /*
- * Nuke the page table entry. When having to clear
- * PageAnonExclusive(), we always have to flush.
- */
- if (should_defer_flush(mm, flags) && !anon_exclusive) {
+ /* Nuke the page table entry. */
+ if (should_defer_flush(mm, flags)) {
/*
* We clear the PTE but do not flush so potentially
* a remote CPU could still be writing to the folio.
@@ -1717,6 +1715,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
page_vma_mapped_walk_done(&pvmw);
break;
}
+
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive &&
page_try_share_anon_rmap(subpage)) {
swap_free(entry);
@@ -1793,7 +1793,7 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
return vma_is_temporary_stack(vma);
}
-static int page_not_mapped(struct folio *folio)
+static int folio_not_mapped(struct folio *folio)
{
return !folio_mapped(folio);
}
@@ -1814,7 +1814,7 @@ void try_to_unmap(struct folio *folio, enum ttu_flags flags)
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
.arg = (void *)flags,
- .done = page_not_mapped,
+ .done = folio_not_mapped,
.anon_lock = folio_lock_anon_vma_read,
};
@@ -1936,26 +1936,41 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* To call huge_pmd_unshare, i_mmap_rwsem must be
* held in write mode. Caller needs to explicitly
* do this outside rmap routines.
+ *
+ * We also must hold hugetlb vma_lock in write mode.
+ * Lock order dictates acquiring vma_lock BEFORE
+ * i_mmap_rwsem. We can only try lock here and
+ * fail if unsuccessful.
*/
- VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
- if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
- flush_tlb_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range(mm, range.start,
- range.end);
-
- /*
- * The ref count of the PMD page was dropped
- * which is part of the way map counting
- * is done for shared PMDs. Return 'true'
- * here. When there is no other sharing,
- * huge_pmd_unshare returns false and we will
- * unmap the actual page and drop map count
- * to zero.
- */
- page_vma_mapped_walk_done(&pvmw);
- break;
+ if (!anon) {
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+ if (!hugetlb_vma_trylock_write(vma)) {
+ page_vma_mapped_walk_done(&pvmw);
+ ret = false;
+ break;
+ }
+ if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+ hugetlb_vma_unlock_write(vma);
+ flush_tlb_range(vma,
+ range.start, range.end);
+ mmu_notifier_invalidate_range(mm,
+ range.start, range.end);
+
+ /*
+ * The ref count of the PMD page was
+ * dropped which is part of the way map
+ * counting is done for shared PMDs.
+ * Return 'true' here. When there is
+ * no other sharing, huge_pmd_unshare
+ * returns false and we will unmap the
+ * actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ hugetlb_vma_unlock_write(vma);
}
-
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
} else {
@@ -2048,6 +2063,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
}
VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
!anon_exclusive, subpage);
+
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive &&
page_try_share_anon_rmap(subpage)) {
if (folio_test_hugetlb(folio))
@@ -2073,7 +2090,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
else
entry = make_readable_migration_entry(
page_to_pfn(subpage));
-
+ if (pte_young(pteval))
+ entry = make_migration_entry_young(entry);
+ if (pte_dirty(pteval))
+ entry = make_migration_entry_dirty(entry);
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
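Preserving the accessed and dirty bits in the migration entry lets the target PTE be reconstructed with the same state once migration completes. A sketch of the restore on the other side, assuming the matching is_migration_entry_young()/is_migration_entry_dirty() helpers from the same series (illustrative, not part of this hunk):

	if (is_migration_entry_young(entry))
		pte = pte_mkyoung(pte);
	if (is_migration_entry_dirty(entry))
		pte = pte_mkdirty(pte);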
@@ -2122,7 +2142,7 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
struct rmap_walk_control rwc = {
.rmap_one = try_to_migrate_one,
.arg = (void *)flags,
- .done = page_not_mapped,
+ .done = folio_not_mapped,
.anon_lock = folio_lock_anon_vma_read,
};
@@ -2269,7 +2289,7 @@ static bool folio_make_device_exclusive(struct folio *folio,
};
struct rmap_walk_control rwc = {
.rmap_one = page_make_device_exclusive_one,
- .done = page_not_mapped,
+ .done = folio_not_mapped,
.anon_lock = folio_lock_anon_vma_read,
.arg = &args,
};
@@ -2541,9 +2561,9 @@ void hugepage_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ /* increment count (starts at -1) */
atomic_set(compound_mapcount_ptr(page), 0);
- atomic_set(compound_pincount_ptr(page), 0);
-
+ ClearHPageRestoreReserve(page);
__page_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
index 2613371945b7..6d783436951f 100644
--- a/mm/rodata_test.c
+++ b/mm/rodata_test.c
@@ -9,13 +9,13 @@
#include <linux/rodata_test.h>
#include <linux/uaccess.h>
+#include <linux/mm.h>
#include <asm/sections.h>
static const int rodata_test_data = 0xC3;
void rodata_test(void)
{
- unsigned long start, end;
int zero = 0;
/* test 1: read the value */
@@ -39,13 +39,11 @@ void rodata_test(void)
}
/* test 4: check if the rodata section is PAGE_SIZE aligned */
- start = (unsigned long)__start_rodata;
- end = (unsigned long)__end_rodata;
- if (start & (PAGE_SIZE - 1)) {
+ if (!PAGE_ALIGNED(__start_rodata)) {
pr_err("start of .rodata is not page size aligned\n");
return;
}
- if (end & (PAGE_SIZE - 1)) {
+ if (!PAGE_ALIGNED(__end_rodata)) {
pr_err("end of .rodata is not page size aligned\n");
return;
}
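PAGE_ALIGNED() folds the open-coded mask tests into one macro; it is defined in include/linux/mm.h along the lines of:

#define PAGE_ALIGNED(addr)	IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)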
diff --git a/mm/secretmem.c b/mm/secretmem.c
index e3e9590c6fb3..04c3ac9448a1 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -276,20 +276,18 @@ static struct file_system_type secretmem_fs = {
.kill_sb = kill_anon_super,
};
-static int secretmem_init(void)
+static int __init secretmem_init(void)
{
- int ret = 0;
-
if (!secretmem_enable)
- return ret;
+ return 0;
secretmem_mnt = kern_mount(&secretmem_fs);
if (IS_ERR(secretmem_mnt))
- ret = PTR_ERR(secretmem_mnt);
+ return PTR_ERR(secretmem_mnt);
/* prevent secretmem mappings from ever getting PROT_EXEC */
secretmem_mnt->mnt_flags |= MNT_NOEXEC;
- return ret;
+ return 0;
}
fs_initcall(secretmem_init);
diff --git a/mm/shmem.c b/mm/shmem.c
index 42e5888bf84d..c301487be5fb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -38,6 +38,7 @@
#include <linux/hugetlb.h>
#include <linux/fs_parser.h>
#include <linux/swapfile.h>
+#include <linux/iversion.h>
#include "swap.h"
static struct vfsmount *shm_mnt;
@@ -139,17 +140,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio **foliop, enum sgp_type sgp,
gfp_t gfp, struct vm_area_struct *vma,
vm_fault_t *fault_type);
-static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp,
- gfp_t gfp, struct vm_area_struct *vma,
- struct vm_fault *vmf, vm_fault_t *fault_type);
-
-int shmem_getpage(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp)
-{
- return shmem_getpage_gfp(inode, index, pagep, sgp,
- mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
-}
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
@@ -190,7 +180,7 @@ static inline int shmem_reacct_size(unsigned long flags,
/*
* ... whereas tmpfs objects are accounted incrementally as
* pages are allocated, in order to allow large sparse files.
- * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
+ * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
* so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
*/
static inline int shmem_acct_block(unsigned long flags, long pages)
@@ -247,11 +237,17 @@ static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
+static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;
+bool vma_is_anon_shmem(struct vm_area_struct *vma)
+{
+ return vma->vm_ops == &shmem_anon_vm_ops;
+}
+
bool vma_is_shmem(struct vm_area_struct *vma)
{
- return vma->vm_ops == &shmem_vm_ops;
+ return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
}
static LIST_HEAD(shmem_swaplist);
@@ -472,20 +468,22 @@ static bool shmem_confirm_swap(struct address_space *mapping,
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
-bool shmem_is_huge(struct vm_area_struct *vma,
- struct inode *inode, pgoff_t index)
+bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
+ pgoff_t index, bool shmem_huge_force)
{
loff_t i_size;
if (!S_ISREG(inode->i_mode))
return false;
- if (shmem_huge == SHMEM_HUGE_DENY)
- return false;
if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) ||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
return false;
+ if (shmem_huge_force)
+ return true;
if (shmem_huge == SHMEM_HUGE_FORCE)
return true;
+ if (shmem_huge == SHMEM_HUGE_DENY)
+ return false;
switch (SHMEM_SB(inode->i_sb)->huge) {
case SHMEM_HUGE_ALWAYS:
@@ -629,7 +627,7 @@ next:
goto move_back;
}
- ret = split_huge_page(&folio->page);
+ ret = split_folio(folio);
folio_unlock(folio);
folio_put(folio);
@@ -680,8 +678,8 @@ static long shmem_unused_huge_count(struct super_block *sb,
#define shmem_huge SHMEM_HUGE_DENY
-bool shmem_is_huge(struct vm_area_struct *vma,
- struct inode *inode, pgoff_t index)
+bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
+ pgoff_t index, bool shmem_huge_force)
{
return false;
}
@@ -763,23 +761,22 @@ error:
}
/*
- * Like delete_from_page_cache, but substitutes swap for page.
+ * Like delete_from_page_cache, but substitutes swap for @folio.
*/
-static void shmem_delete_from_page_cache(struct page *page, void *radswap)
+static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
+ long nr = folio_nr_pages(folio);
int error;
- VM_BUG_ON_PAGE(PageCompound(page), page);
-
xa_lock_irq(&mapping->i_pages);
- error = shmem_replace_entry(mapping, page->index, page, radswap);
- page->mapping = NULL;
- mapping->nrpages--;
- __dec_lruvec_page_state(page, NR_FILE_PAGES);
- __dec_lruvec_page_state(page, NR_SHMEM);
+ error = shmem_replace_entry(mapping, folio->index, folio, radswap);
+ folio->mapping = NULL;
+ mapping->nrpages -= nr;
+ __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
+ __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
xa_unlock_irq(&mapping->i_pages);
- put_page(page);
+ folio_put(folio);
BUG_ON(error);
}
@@ -886,10 +883,9 @@ void shmem_unlock_mapping(struct address_space *mapping)
static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
{
struct folio *folio;
- struct page *page;
/*
- * At first avoid shmem_getpage(,,,SGP_READ): that fails
+ * At first avoid shmem_get_folio(,,,SGP_READ): that fails
* beyond i_size, and reports fallocated pages as holes.
*/
folio = __filemap_get_folio(inode->i_mapping, index,
@@ -900,9 +896,9 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
* But read a page back from swap if any of it is within i_size
* (although in some cases this is just a waste of time).
*/
- page = NULL;
- shmem_getpage(inode, index, &page, SGP_READ);
- return page ? page_folio(page) : NULL;
+ folio = NULL;
+ shmem_get_folio(inode, index, &folio, SGP_READ);
+ return folio;
}
/*
@@ -932,21 +928,18 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
folio_batch_init(&fbatch);
index = start;
- while (index < end && find_lock_entries(mapping, index, end - 1,
+ while (index < end && find_lock_entries(mapping, &index, end - 1,
&fbatch, indices)) {
for (i = 0; i < folio_batch_count(&fbatch); i++) {
folio = fbatch.folios[i];
- index = indices[i];
-
if (xa_is_value(folio)) {
if (unfalloc)
continue;
nr_swaps_freed += !shmem_free_swap(mapping,
- index, folio);
+ indices[i], folio);
continue;
}
- index += folio_nr_pages(folio) - 1;
if (!unfalloc || !folio_test_uptodate(folio))
truncate_inode_folio(mapping, folio);
@@ -955,9 +948,17 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
cond_resched();
- index++;
}
+ /*
+ * When undoing a failed fallocate, we want none of the partial folio
+ * zeroing and splitting below, but do want to truncate the whole
+ * folio when !uptodate indicates that it was added by this fallocate,
+ * even when [lstart, lend] covers only a part of the folio.
+ */
+ if (unfalloc)
+ goto whole_folios;
+
same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
if (folio) {
@@ -983,11 +984,13 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
folio_put(folio);
}
+whole_folios:
+
index = start;
while (index < end) {
cond_resched();
- if (!find_get_entries(mapping, index, end - 1, &fbatch,
+ if (!find_get_entries(mapping, &index, end - 1, &fbatch,
indices)) {
/* If all gone or hole-punch or unfalloc, we're done */
if (index == start || end != -1)
@@ -999,13 +1002,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
for (i = 0; i < folio_batch_count(&fbatch); i++) {
folio = fbatch.folios[i];
- index = indices[i];
if (xa_is_value(folio)) {
if (unfalloc)
continue;
- if (shmem_free_swap(mapping, index, folio)) {
+ if (shmem_free_swap(mapping, indices[i], folio)) {
/* Swap was replaced by page: retry */
- index--;
+ index = indices[i];
break;
}
nr_swaps_freed++;
@@ -1018,19 +1020,17 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (folio_mapping(folio) != mapping) {
/* Page was replaced by swap: retry */
folio_unlock(folio);
- index--;
+ index = indices[i];
break;
}
VM_BUG_ON_FOLIO(folio_test_writeback(folio),
folio);
truncate_inode_folio(mapping, folio);
}
- index = folio->index + folio_nr_pages(folio) - 1;
folio_unlock(folio);
}
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
- index++;
}
spin_lock_irq(&info->lock);
@@ -1043,6 +1043,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
shmem_undo_range(inode, lstart, lend, false);
inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode_inc_iversion(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -1069,7 +1070,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns,
STATX_ATTR_NODUMP);
generic_fillattr(&init_user_ns, inode, stat);
- if (shmem_is_huge(NULL, inode, 0))
+ if (shmem_is_huge(NULL, inode, 0, false))
stat->blksize = HPAGE_PMD_SIZE;
if (request_mask & STATX_BTIME) {
@@ -1087,6 +1088,8 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
struct inode *inode = d_inode(dentry);
struct shmem_inode_info *info = SHMEM_I(inode);
int error;
+ bool update_mtime = false;
+ bool update_ctime = true;
error = setattr_prepare(&init_user_ns, dentry, attr);
if (error)
@@ -1107,7 +1110,9 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
if (error)
return error;
i_size_write(inode, newsize);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ update_mtime = true;
+ } else {
+ update_ctime = false;
}
if (newsize <= oldsize) {
loff_t holebegin = round_up(newsize, PAGE_SIZE);
@@ -1126,7 +1131,13 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
setattr_copy(&init_user_ns, inode, attr);
if (attr->ia_valid & ATTR_MODE)
- error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+ error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
+ if (!error && update_ctime) {
+ inode->i_ctime = current_time(inode);
+ if (update_mtime)
+ inode->i_mtime = inode->i_ctime;
+ inode_inc_iversion(inode);
+ }
return error;
}
@@ -1328,17 +1339,18 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
* "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
* and its shmem_writeback() needs them to be split when swapping.
*/
- if (PageTransCompound(page)) {
+ if (folio_test_large(folio)) {
/* Ensure the subpages are still dirty */
- SetPageDirty(page);
+ folio_test_set_dirty(folio);
if (split_huge_page(page) < 0)
goto redirty;
- ClearPageDirty(page);
+ folio = page_folio(page);
+ folio_clear_dirty(folio);
}
- BUG_ON(!PageLocked(page));
- mapping = page->mapping;
- index = page->index;
+ BUG_ON(!folio_test_locked(folio));
+ mapping = folio->mapping;
+ index = folio->index;
inode = mapping->host;
info = SHMEM_I(inode);
if (info->flags & VM_LOCKED)
@@ -1361,15 +1373,15 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
/*
* This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
* value into swapfile.c, the only way we can correctly account for a
- * fallocated page arriving here is now to initialize it and write it.
+ * fallocated folio arriving here is now to initialize it and write it.
*
- * That's okay for a page already fallocated earlier, but if we have
+ * That's okay for a folio already fallocated earlier, but if we have
* not yet completed the fallocation, then (a) we want to keep track
- * of this page in case we have to undo it, and (b) it may not be a
+ * of this folio in case we have to undo it, and (b) it may not be a
* good idea to continue anyway, once we're pushing into swap. So
- * reactivate the page, and let shmem_fallocate() quit when too many.
+ * reactivate the folio, and let shmem_fallocate() quit when too many.
*/
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
if (inode->i_private) {
struct shmem_falloc *shmem_falloc;
spin_lock(&inode->i_lock);
@@ -1385,9 +1397,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (shmem_falloc)
goto redirty;
}
- clear_highpage(page);
- flush_dcache_page(page);
- SetPageUptodate(page);
+ folio_zero_range(folio, 0, folio_size(folio));
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
}
swap = folio_alloc_swap(folio);
@@ -1396,7 +1408,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
- * if it's not already there. Do it now before the page is
+ * if it's not already there. Do it now before the folio is
* moved to swap cache, when its pagelock no longer protects
* the inode from eviction. But don't unlock the mutex until
* we've incremented swapped, because shmem_unuse_inode() will
@@ -1406,7 +1418,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (list_empty(&info->swaplist))
list_add(&info->swaplist, &shmem_swaplist);
- if (add_to_swap_cache(page, swap,
+ if (add_to_swap_cache(folio, swap,
__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
NULL) == 0) {
spin_lock_irq(&info->lock);
@@ -1415,21 +1427,21 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
spin_unlock_irq(&info->lock);
swap_shmem_alloc(swap);
- shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
+ shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
mutex_unlock(&shmem_swaplist_mutex);
- BUG_ON(page_mapped(page));
- swap_writepage(page, wbc);
+ BUG_ON(folio_mapped(folio));
+ swap_writepage(&folio->page, wbc);
return 0;
}
mutex_unlock(&shmem_swaplist_mutex);
- put_swap_page(page, swap);
+ put_swap_folio(folio, swap);
redirty:
- set_page_dirty(page);
+ folio_mark_dirty(folio);
if (wbc->for_reclaim)
- return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
- unlock_page(page);
+ return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
+ folio_unlock(folio);
return 0;
}
@@ -1486,7 +1498,7 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
mpol_cond_put(vma->vm_policy);
}
-static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
@@ -1499,7 +1511,9 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
page = swap_cluster_readahead(swap, gfp, &vmf);
shmem_pseudo_vma_destroy(&pvma);
- return page;
+ if (!page)
+ return NULL;
+ return page_folio(page);
}
/*
@@ -1560,12 +1574,6 @@ static struct folio *shmem_alloc_folio(gfp_t gfp,
return folio;
}
-static struct page *shmem_alloc_page(gfp_t gfp,
- struct shmem_inode_info *info, pgoff_t index)
-{
- return &shmem_alloc_folio(gfp, info, index)->page;
-}
-
static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
pgoff_t index, bool huge)
{
@@ -1599,7 +1607,7 @@ failed:
/*
* When a page is moved from swapcache to shmem filecache (either by the
- * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
+ * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
* shmem_unuse_inode()), it may have been read in earlier from swap, in
* ignorance of the mapping it belongs to. If that mapping has special
* constraints (like the gma500 GEM driver, which requires RAM below 4GB),
@@ -1614,54 +1622,52 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
return folio_zonenum(folio) > gfp_zone(gfp);
}
-static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
- struct page *oldpage, *newpage;
struct folio *old, *new;
struct address_space *swap_mapping;
swp_entry_t entry;
pgoff_t swap_index;
int error;
- oldpage = *pagep;
- entry.val = page_private(oldpage);
+ old = *foliop;
+ entry = folio_swap_entry(old);
swap_index = swp_offset(entry);
- swap_mapping = page_mapping(oldpage);
+ swap_mapping = swap_address_space(entry);
/*
* We have arrived here because our zones are constrained, so don't
* limit chance of success by further cpuset and node constraints.
*/
gfp &= ~GFP_CONSTRAINT_MASK;
- newpage = shmem_alloc_page(gfp, info, index);
- if (!newpage)
+ VM_BUG_ON_FOLIO(folio_test_large(old), old);
+ new = shmem_alloc_folio(gfp, info, index);
+ if (!new)
return -ENOMEM;
- get_page(newpage);
- copy_highpage(newpage, oldpage);
- flush_dcache_page(newpage);
+ folio_get(new);
+ folio_copy(new, old);
+ flush_dcache_folio(new);
- __SetPageLocked(newpage);
- __SetPageSwapBacked(newpage);
- SetPageUptodate(newpage);
- set_page_private(newpage, entry.val);
- SetPageSwapCache(newpage);
+ __folio_set_locked(new);
+ __folio_set_swapbacked(new);
+ folio_mark_uptodate(new);
+ folio_set_swap_entry(new, entry);
+ folio_set_swapcache(new);
/*
* Our caller will very soon move newpage out of swapcache, but it's
* a nice clean interface for us to replace oldpage by newpage there.
*/
xa_lock_irq(&swap_mapping->i_pages);
- error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
+ error = shmem_replace_entry(swap_mapping, swap_index, old, new);
if (!error) {
- old = page_folio(oldpage);
- new = page_folio(newpage);
mem_cgroup_migrate(old, new);
- __inc_lruvec_page_state(newpage, NR_FILE_PAGES);
- __inc_lruvec_page_state(newpage, NR_SHMEM);
- __dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
- __dec_lruvec_page_state(oldpage, NR_SHMEM);
+ __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
+ __lruvec_stat_mod_folio(new, NR_SHMEM, 1);
+ __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
+ __lruvec_stat_mod_folio(old, NR_SHMEM, -1);
}
xa_unlock_irq(&swap_mapping->i_pages);
@@ -1671,18 +1677,17 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
* both PageSwapCache and page_private after getting page lock;
* but be defensive. Reverse old to newpage for clear and free.
*/
- oldpage = newpage;
+ old = new;
} else {
- lru_cache_add(newpage);
- *pagep = newpage;
+ folio_add_lru(new);
+ *foliop = new;
}
- ClearPageSwapCache(oldpage);
- set_page_private(oldpage, 0);
+ folio_clear_swapcache(old);
+ old->private = NULL;
- unlock_page(oldpage);
- put_page(oldpage);
- put_page(oldpage);
+ folio_unlock(old);
+ folio_put_refs(old, 2);
return error;
}
@@ -1694,7 +1699,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
swp_entry_t swapin_error;
void *old;
- swapin_error = make_swapin_error_entry(&folio->page);
+ swapin_error = make_swapin_error_entry();
old = xa_cmpxchg_irq(&mapping->i_pages, index,
swp_to_radix_entry(swap),
swp_to_radix_entry(swapin_error), 0);
@@ -1730,7 +1735,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
- struct page *page;
struct folio *folio = NULL;
swp_entry_t swap;
int error;
@@ -1743,8 +1747,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
return -EIO;
/* Look it up and read it in.. */
- page = lookup_swap_cache(swap, NULL, 0);
- if (!page) {
+ folio = swap_cache_get_folio(swap, NULL, 0);
+ if (!folio) {
/* Or update major stats only when swapin succeeds?? */
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
@@ -1752,13 +1756,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
count_memcg_event_mm(charge_mm, PGMAJFAULT);
}
/* Here we actually start the io */
- page = shmem_swapin(swap, gfp, info, index);
- if (!page) {
+ folio = shmem_swapin(swap, gfp, info, index);
+ if (!folio) {
error = -ENOMEM;
goto failed;
}
}
- folio = page_folio(page);
/* We have to do this with folio locked to prevent races */
folio_lock(folio);
@@ -1781,8 +1784,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
arch_swap_restore(swap, folio);
if (shmem_should_replace_folio(folio, gfp)) {
- error = shmem_replace_page(&page, gfp, info, index);
- folio = page_folio(page);
+ error = shmem_replace_folio(&folio, gfp, info, index);
if (error)
goto failed;
}
@@ -1822,7 +1824,7 @@ unlock:
}
/*
- * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
+ * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
*
* If we allocate a new one we do not mark it dirty. That's up to the
* vm. If we swap it in we mark it dirty since we also free the swap
@@ -1831,17 +1833,17 @@ unlock:
* vma, vmf, and fault_type are only supplied by shmem_fault:
* otherwise they are NULL.
*/
-static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp, gfp_t gfp,
- struct vm_area_struct *vma, struct vm_fault *vmf,
- vm_fault_t *fault_type)
+static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
+ struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
+ struct vm_area_struct *vma, struct vm_fault *vmf,
+ vm_fault_t *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo;
struct mm_struct *charge_mm;
struct folio *folio;
- pgoff_t hindex = index;
+ pgoff_t hindex;
gfp_t huge_gfp;
int error;
int once = 0;
@@ -1874,17 +1876,16 @@ repeat:
if (error == -EEXIST)
goto repeat;
- *pagep = &folio->page;
+ *foliop = folio;
return error;
}
if (folio) {
- hindex = folio->index;
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
if (folio_test_uptodate(folio))
goto out;
- /* fallocated page */
+ /* fallocated folio */
if (sgp != SGP_READ)
goto clear;
folio_unlock(folio);
@@ -1892,10 +1893,10 @@ repeat:
}
/*
- * SGP_READ: succeed on hole, with NULL page, letting caller zero.
- * SGP_NOALLOC: fail on hole, with NULL page, letting caller fail.
+ * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
+ * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
*/
- *pagep = NULL;
+ *foliop = NULL;
if (sgp == SGP_READ)
return 0;
if (sgp == SGP_NOALLOC)
@@ -1910,7 +1911,7 @@ repeat:
return 0;
}
- if (!shmem_is_huge(vma, inode, index))
+ if (!shmem_is_huge(vma, inode, index, false))
goto alloc_nohuge;
huge_gfp = vma_thp_gfp_mask(vma);
@@ -1928,7 +1929,7 @@ alloc_nohuge:
if (error != -ENOSPC)
goto unlock;
/*
- * Try to reclaim some space by splitting a huge page
+ * Try to reclaim some space by splitting a large folio
* beyond i_size on the filesystem.
*/
while (retry--) {
@@ -1964,9 +1965,9 @@ alloc_nohuge:
if (folio_test_pmd_mappable(folio) &&
DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
- hindex + HPAGE_PMD_NR - 1) {
+ folio_next_index(folio) - 1) {
/*
- * Part of the huge page is beyond i_size: subject
+ * Part of the large folio is beyond i_size: subject
* to shrink under memory pressure.
*/
spin_lock(&sbinfo->shrinklist_lock);
@@ -1983,14 +1984,14 @@ alloc_nohuge:
}
/*
- * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+ * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
*/
if (sgp == SGP_FALLOC)
sgp = SGP_WRITE;
clear:
/*
- * Let SGP_WRITE caller clear ends if write does not fill page;
- * but SGP_FALLOC on a page fallocated earlier must initialize
+ * Let SGP_WRITE caller clear ends if write does not fill folio;
+ * but SGP_FALLOC on a folio fallocated earlier must initialize
* it now, lest undo on failure cancel our earlier guarantee.
*/
if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
@@ -2016,7 +2017,7 @@ clear:
goto unlock;
}
out:
- *pagep = folio_page(folio, index - hindex);
+ *foliop = folio;
return 0;
/*
@@ -2046,6 +2047,13 @@ unlock:
return error;
}
+int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
+ enum sgp_type sgp)
+{
+ return shmem_get_folio_gfp(inode, index, foliop, sgp,
+ mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
+}
+
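shmem_get_folio() is now the externally visible lookup helper replacing shmem_getpage(). Callers that still need a struct page convert along the lines of the shmem_write_begin() and shmem_file_read_iter() changes later in this diff; a condensed sketch:

	struct folio *folio;
	struct page *page;
	int error;

	error = shmem_get_folio(inode, index, &folio, SGP_READ);
	if (error)
		return error;
	/* SGP_READ may return a NULL folio for a hole */
	page = folio ? folio_file_page(folio, index) : NULL;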
/*
* This is like autoremove_wake_function, but it removes the wait queue
* entry unconditionally - even if something else had already woken the
@@ -2063,6 +2071,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
+ struct folio *folio = NULL;
int err;
vm_fault_t ret = VM_FAULT_LOCKED;
@@ -2125,10 +2134,12 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
spin_unlock(&inode->i_lock);
}
- err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE,
+ err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
gfp, vma, vmf, &ret);
if (err)
return vmf_error(err);
+ if (folio)
+ vmf->page = folio_file_page(folio, vmf->pgoff);
return ret;
}
@@ -2269,7 +2280,8 @@ out_nomem:
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
- struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+ struct inode *inode = file_inode(file);
+ struct shmem_inode_info *info = SHMEM_I(inode);
int ret;
ret = seal_check_future_write(info->seals, vma);
@@ -2280,7 +2292,11 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_flags |= VM_MTE_ALLOWED;
file_accessed(file);
- vma->vm_ops = &shmem_vm_ops;
+ /* This is anonymous shared memory if it is unlinked at the time of mmap */
+ if (inode->i_nlink)
+ vma->vm_ops = &shmem_vm_ops;
+ else
+ vma->vm_ops = &shmem_anon_vm_ops;
return 0;
}
@@ -2330,7 +2346,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir,
inode_init_owner(&init_user_ns, inode, dir, mode);
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
- inode->i_generation = prandom_u32();
+ inode->i_generation = get_random_u32();
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
@@ -2398,7 +2414,6 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
void *page_kaddr;
struct folio *folio;
- struct page *page;
int ret;
pgoff_t max_off;
@@ -2417,53 +2432,70 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
if (!*pagep) {
ret = -ENOMEM;
- page = shmem_alloc_page(gfp, info, pgoff);
- if (!page)
+ folio = shmem_alloc_folio(gfp, info, pgoff);
+ if (!folio)
goto out_unacct_blocks;
if (!zeropage) { /* COPY */
- page_kaddr = kmap_atomic(page);
+ page_kaddr = kmap_local_folio(folio, 0);
+ /*
+ * The read mmap_lock is held here. Despite the
+ * mmap_lock being read recursive a deadlock is still
+ * possible if a writer has taken a lock. For example:
+ *
+ * process A thread 1 takes read lock on own mmap_lock
+ * process A thread 2 calls mmap, blocks taking write lock
+ * process B thread 1 takes page fault, read lock on own mmap lock
+ * process B thread 2 calls mmap, blocks taking write lock
+ * process A thread 1 blocks taking read lock on process B
+ * process B thread 1 blocks taking read lock on process A
+ *
+ * Disable page faults to prevent potential deadlock
+ * and retry the copy outside the mmap_lock.
+ */
+ pagefault_disable();
ret = copy_from_user(page_kaddr,
(const void __user *)src_addr,
PAGE_SIZE);
- kunmap_atomic(page_kaddr);
+ pagefault_enable();
+ kunmap_local(page_kaddr);
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
- *pagep = page;
+ *pagep = &folio->page;
ret = -ENOENT;
/* don't free the page */
goto out_unacct_blocks;
}
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
} else { /* ZEROPAGE */
- clear_user_highpage(page, dst_addr);
+ clear_user_highpage(&folio->page, dst_addr);
}
} else {
- page = *pagep;
+ folio = page_folio(*pagep);
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
*pagep = NULL;
}
- VM_BUG_ON(PageLocked(page));
- VM_BUG_ON(PageSwapBacked(page));
- __SetPageLocked(page);
- __SetPageSwapBacked(page);
- __SetPageUptodate(page);
+ VM_BUG_ON(folio_test_locked(folio));
+ VM_BUG_ON(folio_test_swapbacked(folio));
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+ __folio_mark_uptodate(folio);
ret = -EFAULT;
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(pgoff >= max_off))
goto out_release;
- folio = page_folio(page);
ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
gfp & GFP_RECLAIM_MASK, dst_mm);
if (ret)
goto out_release;
ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- page, true, wp_copy);
+ &folio->page, true, wp_copy);
if (ret)
goto out_delete_from_cache;
@@ -2473,13 +2505,13 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
- unlock_page(page);
+ folio_unlock(folio);
return 0;
out_delete_from_cache:
- delete_from_page_cache(page);
+ filemap_remove_folio(folio);
out_release:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
out_unacct_blocks:
shmem_inode_unacct_blocks(inode, 1);
return ret;
@@ -2498,6 +2530,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t index = pos >> PAGE_SHIFT;
+ struct folio *folio;
int ret = 0;
/* i_rwsem is held by caller */
@@ -2509,14 +2542,15 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
return -EPERM;
}
- ret = shmem_getpage(inode, index, pagep, SGP_WRITE);
+ ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
if (ret)
return ret;
+ *pagep = folio_file_page(folio, index);
if (PageHWPoison(*pagep)) {
- unlock_page(*pagep);
- put_page(*pagep);
+ folio_unlock(folio);
+ folio_put(folio);
*pagep = NULL;
return -EIO;
}
@@ -2575,6 +2609,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
offset = *ppos & ~PAGE_MASK;
for (;;) {
+ struct folio *folio = NULL;
struct page *page = NULL;
pgoff_t end_index;
unsigned long nr, ret;
@@ -2589,17 +2624,18 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
break;
}
- error = shmem_getpage(inode, index, &page, SGP_READ);
+ error = shmem_get_folio(inode, index, &folio, SGP_READ);
if (error) {
if (error == -EINVAL)
error = 0;
break;
}
- if (page) {
- unlock_page(page);
+ if (folio) {
+ folio_unlock(folio);
+ page = folio_file_page(folio, index);
if (PageHWPoison(page)) {
- put_page(page);
+ folio_put(folio);
error = -EIO;
break;
}
@@ -2615,14 +2651,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (index == end_index) {
nr = i_size & ~PAGE_MASK;
if (nr <= offset) {
- if (page)
- put_page(page);
+ if (folio)
+ folio_put(folio);
break;
}
}
nr -= offset;
- if (page) {
+ if (folio) {
/*
* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
@@ -2634,13 +2670,13 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
* Mark the page accessed if we read the beginning.
*/
if (!offset)
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
*/
ret = copy_page_to_iter(page, offset, nr, to);
- put_page(page);
+ folio_put(folio);
} else if (user_backed_iter(to)) {
/*
@@ -2783,7 +2819,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
info->fallocend = end;
for (index = start; index < end; ) {
- struct page *page;
+ struct folio *folio;
/*
* Good, the fallocate(2) manpage permits EINTR: we may have
@@ -2794,10 +2830,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
error = -ENOMEM;
else
- error = shmem_getpage(inode, index, &page, SGP_FALLOC);
+ error = shmem_get_folio(inode, index, &folio,
+ SGP_FALLOC);
if (error) {
info->fallocend = undo_fallocend;
- /* Remove the !PageUptodate pages we added */
+ /* Remove the !uptodate folios we added */
if (index > start) {
shmem_undo_range(inode,
(loff_t)start << PAGE_SHIFT,
@@ -2806,37 +2843,34 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
goto undone;
}
- index++;
/*
* Here is a more important optimization than it appears:
- * a second SGP_FALLOC on the same huge page will clear it,
- * making it PageUptodate and un-undoable if we fail later.
+ * a second SGP_FALLOC on the same large folio will clear it,
+ * making it uptodate and un-undoable if we fail later.
*/
- if (PageTransCompound(page)) {
- index = round_up(index, HPAGE_PMD_NR);
- /* Beware 32-bit wraparound */
- if (!index)
- index--;
- }
+ index = folio_next_index(folio);
+ /* Beware 32-bit wraparound */
+ if (!index)
+ index--;
/*
* Inform shmem_writepage() how far we have reached.
* No need for lock or barrier: we have the page lock.
*/
- if (!PageUptodate(page))
+ if (!folio_test_uptodate(folio))
shmem_falloc.nr_falloced += index - shmem_falloc.next;
shmem_falloc.next = index;
/*
- * If !PageUptodate, leave it that way so that freeable pages
+ * If !uptodate, leave it that way so that freeable folios
* can be recognized if we need to rollback on error later.
- * But set_page_dirty so that memory pressure will swap rather
- * than free the pages we are allocating (and SGP_CACHE pages
+ * But mark it dirty so that memory pressure will swap rather
+ * than free the folios we are allocating (and SGP_CACHE folios
* might still be clean: we now need to mark those dirty too).
*/
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
cond_resched();
}
@@ -2901,6 +2935,7 @@ shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir,
error = 0;
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = current_time(dir);
+ inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
}
@@ -2912,7 +2947,7 @@ out_iput:
static int
shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct file *file, umode_t mode)
{
struct inode *inode;
int error = -ENOSPC;
@@ -2927,9 +2962,9 @@ shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
error = simple_acl_create(dir, inode);
if (error)
goto out_iput;
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
}
- return error;
+ return finish_open_simple(file, error);
out_iput:
iput(inode);
return error;
@@ -2976,6 +3011,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ inode_inc_iversion(dir);
inc_nlink(inode);
ihold(inode); /* New dentry reference */
dget(dentry); /* Extra pinning count for the created dentry */
@@ -2993,6 +3029,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
dir->i_size -= BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ inode_inc_iversion(dir);
drop_nlink(inode);
dput(dentry); /* Undo the count from "create" - this does all the work */
return 0;
@@ -3082,6 +3119,8 @@ static int shmem_rename2(struct user_namespace *mnt_userns,
old_dir->i_ctime = old_dir->i_mtime =
new_dir->i_ctime = new_dir->i_mtime =
inode->i_ctime = current_time(old_dir);
+ inode_inc_iversion(old_dir);
+ inode_inc_iversion(new_dir);
return 0;
}
@@ -3091,7 +3130,7 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
int error;
int len;
struct inode *inode;
- struct page *page;
+ struct folio *folio;
len = strlen(symname) + 1;
if (len > PAGE_SIZE)
@@ -3119,21 +3158,22 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
inode->i_op = &shmem_short_symlink_operations;
} else {
inode_nohighmem(inode);
- error = shmem_getpage(inode, 0, &page, SGP_WRITE);
+ error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
if (error) {
iput(inode);
return error;
}
inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_symlink_inode_operations;
- memcpy(page_address(page), symname, len);
- SetPageUptodate(page);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ memcpy(folio_address(folio), symname, len);
+ folio_mark_uptodate(folio);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
}
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = current_time(dir);
+ inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry);
return 0;
@@ -3141,40 +3181,41 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
static void shmem_put_link(void *arg)
{
- mark_page_accessed(arg);
- put_page(arg);
+ folio_mark_accessed(arg);
+ folio_put(arg);
}
static const char *shmem_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
int error;
+
if (!dentry) {
- page = find_get_page(inode->i_mapping, 0);
- if (!page)
+ folio = filemap_get_folio(inode->i_mapping, 0);
+ if (!folio)
return ERR_PTR(-ECHILD);
- if (PageHWPoison(page) ||
- !PageUptodate(page)) {
- put_page(page);
+ if (PageHWPoison(folio_page(folio, 0)) ||
+ !folio_test_uptodate(folio)) {
+ folio_put(folio);
return ERR_PTR(-ECHILD);
}
} else {
- error = shmem_getpage(inode, 0, &page, SGP_READ);
+ error = shmem_get_folio(inode, 0, &folio, SGP_READ);
if (error)
return ERR_PTR(error);
- if (!page)
+ if (!folio)
return ERR_PTR(-ECHILD);
- if (PageHWPoison(page)) {
- unlock_page(page);
- put_page(page);
+ if (PageHWPoison(folio_page(folio, 0))) {
+ folio_unlock(folio);
+ folio_put(folio);
return ERR_PTR(-ECHILD);
}
- unlock_page(page);
+ folio_unlock(folio);
}
- set_delayed_call(done, shmem_put_link, page);
- return page_address(page);
+ set_delayed_call(done, shmem_put_link, folio);
+ return folio_address(folio);
}
#ifdef CONFIG_TMPFS_XATTR
@@ -3204,6 +3245,7 @@ static int shmem_fileattr_set(struct user_namespace *mnt_userns,
shmem_set_inode_flags(inode, info->fsflags);
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
return 0;
}
@@ -3244,7 +3286,7 @@ static int shmem_initxattrs(struct inode *inode,
memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
xattr->name, len);
- simple_xattr_list_add(&info->xattrs, new_xattr);
+ simple_xattr_add(&info->xattrs, new_xattr);
}
return 0;
@@ -3267,9 +3309,15 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler,
size_t size, int flags)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ int err;
name = xattr_full_name(handler, name);
- return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
+ err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
+ if (!err) {
+ inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
+ }
+ return err;
}
static const struct xattr_handler shmem_security_xattr_handler = {
@@ -3732,7 +3780,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_flags |= SB_NOUSER;
}
sb->s_export_op = &shmem_export_ops;
- sb->s_flags |= SB_NOSEC;
+ sb->s_flags |= SB_NOSEC | SB_I_VERSION;
#else
sb->s_flags |= SB_NOUSER;
#endif
@@ -3876,6 +3924,7 @@ EXPORT_SYMBOL(shmem_aops);
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
+ .open = generic_file_open,
.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
.llseek = shmem_file_llseek,
@@ -3961,6 +4010,15 @@ static const struct vm_operations_struct shmem_vm_ops = {
#endif
};
+static const struct vm_operations_struct shmem_anon_vm_ops = {
+ .fault = shmem_fault,
+ .map_pages = filemap_map_pages,
+#ifdef CONFIG_NUMA
+ .set_policy = shmem_set_policy,
+ .get_policy = shmem_get_policy,
+#endif
+};
+
int shmem_init_fs_context(struct fs_context *fc)
{
struct shmem_options *ctx;
@@ -4136,6 +4194,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
EXPORT_SYMBOL_GPL(shmem_truncate_range);
#define shmem_vm_ops generic_file_vm_ops
+#define shmem_anon_vm_ops generic_file_vm_ops
#define shmem_file_operations ramfs_file_operations
#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size) 0
@@ -4241,7 +4300,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
if (vma->vm_file)
fput(vma->vm_file);
vma->vm_file = file;
- vma->vm_ops = &shmem_vm_ops;
+ vma->vm_ops = &shmem_anon_vm_ops;
return 0;
}
@@ -4266,18 +4325,20 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
{
#ifdef CONFIG_SHMEM
struct inode *inode = mapping->host;
+ struct folio *folio;
struct page *page;
int error;
BUG_ON(!shmem_mapping(mapping));
- error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
+ error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
gfp, NULL, NULL, NULL);
if (error)
return ERR_PTR(error);
- unlock_page(page);
+ folio_unlock(folio);
+ page = folio_file_page(folio, index);
if (PageHWPoison(page)) {
- put_page(page);
+ folio_put(folio);
return ERR_PTR(-EIO);
}
diff --git a/mm/shuffle.c b/mm/shuffle.c
index c13c33b247e8..fb1393b8b3a9 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -12,23 +12,22 @@
DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
static bool shuffle_param;
-static int shuffle_show(char *buffer, const struct kernel_param *kp)
-{
- return sprintf(buffer, "%c\n", shuffle_param ? 'Y' : 'N');
-}
-static __meminit int shuffle_store(const char *val,
+static __meminit int shuffle_param_set(const char *val,
const struct kernel_param *kp)
{
- int rc = param_set_bool(val, kp);
-
- if (rc < 0)
- return rc;
- if (shuffle_param)
+ if (param_set_bool(val, kp))
+ return -EINVAL;
+ if (*(bool *)kp->arg)
static_branch_enable(&page_alloc_shuffle_key);
return 0;
}
-module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
+
+static const struct kernel_param_ops shuffle_param_ops = {
+ .set = shuffle_param_set,
+ .get = param_get_bool,
+};
+module_param_cb(shuffle, &shuffle_param_ops, &shuffle_param, 0400);
/*
* For two pages to be swapped in the shuffle, they must be free (on a
diff --git a/mm/slab.c b/mm/slab.c
index 10e96137b44f..7a269db050ee 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -234,7 +234,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
parent->shared = NULL;
parent->alien = NULL;
parent->colour_next = 0;
- spin_lock_init(&parent->list_lock);
+ raw_spin_lock_init(&parent->list_lock);
parent->free_objects = 0;
parent->free_touched = 0;
}
@@ -559,9 +559,9 @@ static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
slab_node = slab_nid(slab);
n = get_node(cachep, slab_node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
free_block(cachep, &objp, 1, slab_node, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
}
@@ -684,7 +684,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
struct kmem_cache_node *n = get_node(cachep, node);
if (ac->avail) {
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
/*
 * Stuff objects into the remote node's shared array first.
* That way we could avoid the overhead of putting the objects
@@ -695,7 +695,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
free_block(cachep, ac->entry, ac->avail, node, list);
ac->avail = 0;
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
}
}
@@ -768,9 +768,9 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
slabs_destroy(cachep, &list);
} else {
n = get_node(cachep, slab_node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
free_block(cachep, &objp, 1, slab_node, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
}
return 1;
@@ -811,10 +811,10 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
*/
n = get_node(cachep, node);
if (n) {
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
cachep->num;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
return 0;
}
@@ -893,7 +893,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep,
goto fail;
n = get_node(cachep, node);
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
if (n->shared && force_change) {
free_block(cachep, n->shared->entry,
n->shared->avail, node, &list);
@@ -911,7 +911,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep,
new_alien = NULL;
}
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
/*
@@ -950,7 +950,7 @@ static void cpuup_canceled(long cpu)
if (!n)
continue;
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
/* Free limit for this kmem_cache_node */
n->free_limit -= cachep->batchcount;
@@ -961,7 +961,7 @@ static void cpuup_canceled(long cpu)
nc->avail = 0;
if (!cpumask_empty(mask)) {
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
goto free_slab;
}
@@ -975,7 +975,7 @@ static void cpuup_canceled(long cpu)
alien = n->alien;
n->alien = NULL;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
kfree(shared);
if (alien) {
@@ -1159,7 +1159,7 @@ static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
- spin_lock_init(&ptr->list_lock);
+ raw_spin_lock_init(&ptr->list_lock);
MAKE_ALL_LISTS(cachep, ptr, nodeid);
cachep->node[nodeid] = ptr;
@@ -1330,11 +1330,11 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
for_each_kmem_cache_node(cachep, node, n) {
unsigned long total_slabs, free_slabs, free_objs;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
total_slabs = n->total_slabs;
free_slabs = n->free_slabs;
free_objs = n->free_objects;
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
node, total_slabs - free_slabs, total_slabs,
@@ -1370,6 +1370,8 @@ static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
account_slab(slab, cachep->gfporder, cachep, flags);
__folio_set_slab(folio);
+ /* Make the flag visible before any changes to folio->mapping */
+ smp_wmb();
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
if (sk_memalloc_socks() && page_is_pfmemalloc(folio_page(folio, 0)))
slab_set_pfmemalloc(slab);
@@ -1387,9 +1389,11 @@ static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab)
BUG_ON(!folio_test_slab(folio));
__slab_clear_pfmemalloc(slab);
- __folio_clear_slab(folio);
page_mapcount_reset(folio_page(folio, 0));
folio->mapping = NULL;
+ /* Make the mapping reset visible before clearing the flag */
+ smp_wmb();
+ __folio_clear_slab(folio);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += 1 << order;
@@ -1619,7 +1623,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slab)
* although actual page can be freed in rcu context
*/
if (OFF_SLAB(cachep))
- kmem_cache_free(cachep->freelist_cache, freelist);
+ kfree(freelist);
}
/*
@@ -1671,21 +1675,27 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
if (flags & CFLGS_OFF_SLAB) {
struct kmem_cache *freelist_cache;
size_t freelist_size;
+ size_t freelist_cache_size;
freelist_size = num * sizeof(freelist_idx_t);
- freelist_cache = kmalloc_slab(freelist_size, 0u);
- if (!freelist_cache)
- continue;
-
- /*
- * Needed to avoid possible looping condition
- * in cache_grow_begin()
- */
- if (OFF_SLAB(freelist_cache))
- continue;
+ if (freelist_size > KMALLOC_MAX_CACHE_SIZE) {
+ freelist_cache_size = PAGE_SIZE << get_order(freelist_size);
+ } else {
+ freelist_cache = kmalloc_slab(freelist_size, 0u);
+ if (!freelist_cache)
+ continue;
+ freelist_cache_size = freelist_cache->size;
+
+ /*
+ * Needed to avoid possible looping condition
+ * in cache_grow_begin()
+ */
+ if (OFF_SLAB(freelist_cache))
+ continue;
+ }
/* check if off slab has enough benefit */
- if (freelist_cache->size > cachep->size / 2)
+ if (freelist_cache_size > cachep->size / 2)
continue;
}
@@ -2061,11 +2071,6 @@ done:
cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
- if (OFF_SLAB(cachep)) {
- cachep->freelist_cache =
- kmalloc_slab(cachep->freelist_size, 0u);
- }
-
err = setup_cpu_cache(cachep, gfp);
if (err) {
__kmem_cache_release(cachep);
@@ -2095,7 +2100,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
+ assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
#endif
}
@@ -2103,7 +2108,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&get_node(cachep, node)->list_lock);
+ assert_raw_spin_locked(&get_node(cachep, node)->list_lock);
#endif
}
@@ -2143,9 +2148,9 @@ static void do_drain(void *arg)
check_irq_off();
ac = cpu_cache_get(cachep);
n = get_node(cachep, node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
free_block(cachep, ac->entry, ac->avail, node, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
ac->avail = 0;
slabs_destroy(cachep, &list);
}
@@ -2163,9 +2168,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
drain_alien_cache(cachep, n->alien);
for_each_kmem_cache_node(cachep, node, n) {
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
drain_array_locked(cachep, n->shared, node, true, &list);
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
}
@@ -2187,10 +2192,10 @@ static int drain_freelist(struct kmem_cache *cache,
nr_freed = 0;
while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
p = n->slabs_free.prev;
if (p == &n->slabs_free) {
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
goto out;
}
@@ -2203,7 +2208,7 @@ static int drain_freelist(struct kmem_cache *cache,
* to the cache.
*/
n->free_objects -= cache->num;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slab_destroy(cache, slab);
nr_freed++;
}
@@ -2292,7 +2297,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
freelist = NULL;
else if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
- freelist = kmem_cache_alloc_node(cachep->freelist_cache,
+ freelist = kmalloc_node(cachep->freelist_size,
local_flags, nodeid);
} else {
/* We will use last bytes at the slab for freelist */
@@ -2380,7 +2385,7 @@ static bool freelist_state_initialize(union freelist_init_state *state,
unsigned int rand;
/* Use best entropy available to define a random shift */
- rand = get_random_int();
+ rand = get_random_u32();
/* Use a random state if the pre-computed list is not available */
if (!cachep->random_seq) {
@@ -2628,7 +2633,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab)
INIT_LIST_HEAD(&slab->slab_list);
n = get_node(cachep, slab_nid(slab));
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
n->total_slabs++;
if (!slab->active) {
list_add_tail(&slab->slab_list, &n->slabs_free);
@@ -2638,7 +2643,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab)
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - slab->active;
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
}
@@ -2804,7 +2809,7 @@ static struct slab *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
struct slab *slab;
- assert_spin_locked(&n->list_lock);
+ assert_raw_spin_locked(&n->list_lock);
slab = list_first_entry_or_null(&n->slabs_partial, struct slab,
slab_list);
if (!slab) {
@@ -2831,10 +2836,10 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
if (!gfp_pfmemalloc_allowed(flags))
return NULL;
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
slab = get_first_slab(n, true);
if (!slab) {
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
return NULL;
}
@@ -2843,7 +2848,7 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
fixup_slab_list(cachep, n, slab, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
return obj;
@@ -2902,7 +2907,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
if (!n->free_objects && (!shared || !shared->avail))
goto direct_grow;
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
shared = READ_ONCE(n->shared);
/* See if we can refill from the shared array */
@@ -2926,7 +2931,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
must_grow:
n->free_objects -= ac->avail;
alloc_done:
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
direct_grow:
@@ -3146,7 +3151,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
BUG_ON(!n);
check_irq_off();
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
slab = get_first_slab(n, false);
if (!slab)
goto must_grow;
@@ -3164,12 +3169,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
fixup_slab_list(cachep, n, slab, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
return obj;
must_grow:
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
slab = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
if (slab) {
/* This slab isn't counted yet so don't update free_objects */
@@ -3181,84 +3186,46 @@ must_grow:
}
static __always_inline void *
-slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size,
- unsigned long caller)
+__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
- unsigned long save_flags;
- void *ptr;
+ void *objp = NULL;
int slab_node = numa_mem_id();
- struct obj_cgroup *objcg = NULL;
- bool init = false;
-
- flags &= gfp_allowed_mask;
- cachep = slab_pre_alloc_hook(cachep, NULL, &objcg, 1, flags);
- if (unlikely(!cachep))
- return NULL;
-
- ptr = kfence_alloc(cachep, orig_size, flags);
- if (unlikely(ptr))
- goto out_hooks;
- local_irq_save(save_flags);
-
- if (nodeid == NUMA_NO_NODE)
- nodeid = slab_node;
-
- if (unlikely(!get_node(cachep, nodeid))) {
- /* Node not bootstrapped yet */
- ptr = fallback_alloc(cachep, flags);
- goto out;
- }
-
- if (nodeid == slab_node) {
+ if (nodeid == NUMA_NO_NODE) {
+ if (current->mempolicy || cpuset_do_slab_mem_spread()) {
+ objp = alternate_node_alloc(cachep, flags);
+ if (objp)
+ goto out;
+ }
/*
* Use the locally cached objects if possible.
* However ____cache_alloc does not allow fallback
* to other nodes. It may fail while we still have
* objects on other nodes available.
*/
- ptr = ____cache_alloc(cachep, flags);
- if (ptr)
- goto out;
- }
- /* ___cache_alloc_node can fall back to other nodes */
- ptr = ____cache_alloc_node(cachep, flags, nodeid);
-out:
- local_irq_restore(save_flags);
- ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
- init = slab_want_init_on_alloc(flags, cachep);
-
-out_hooks:
- slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init);
- return ptr;
-}
-
-static __always_inline void *
-__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
-{
- void *objp;
-
- if (current->mempolicy || cpuset_do_slab_mem_spread()) {
- objp = alternate_node_alloc(cache, flags);
- if (objp)
- goto out;
+ objp = ____cache_alloc(cachep, flags);
+ nodeid = slab_node;
+ } else if (nodeid == slab_node) {
+ objp = ____cache_alloc(cachep, flags);
+ } else if (!get_node(cachep, nodeid)) {
+ /* Node not bootstrapped yet */
+ objp = fallback_alloc(cachep, flags);
+ goto out;
}
- objp = ____cache_alloc(cache, flags);
/*
* We may just have run out of memory on the local node.
* ____cache_alloc_node() knows how to locate memory on other nodes
*/
if (!objp)
- objp = ____cache_alloc_node(cache, flags, numa_mem_id());
-
+ objp = ____cache_alloc_node(cachep, flags, nodeid);
out:
return objp;
}
#else
static __always_inline void *
-__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid __maybe_unused)
{
return ____cache_alloc(cachep, flags);
}
@@ -3266,8 +3233,8 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
#endif /* CONFIG_NUMA */
static __always_inline void *
-slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
- size_t orig_size, unsigned long caller)
+slab_alloc_node(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
+ int nodeid, size_t orig_size, unsigned long caller)
{
unsigned long save_flags;
void *objp;
@@ -3284,17 +3251,26 @@ slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
goto out;
local_irq_save(save_flags);
- objp = __do_cache_alloc(cachep, flags);
+ objp = __do_cache_alloc(cachep, flags, nodeid);
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
prefetchw(objp);
init = slab_want_init_on_alloc(flags, cachep);
out:
- slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init);
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init,
+ cachep->object_size);
return objp;
}
+static __always_inline void *
+slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
+ size_t orig_size, unsigned long caller)
+{
+ return slab_alloc_node(cachep, lru, flags, NUMA_NO_NODE, orig_size,
+ caller);
+}
+
/*
* Caller needs to acquire correct kmem_cache_node's list_lock
* @list: List of detached free slabs should be freed by caller
@@ -3354,7 +3330,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
check_irq_off();
n = get_node(cachep, node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
if (n->shared) {
struct array_cache *shared_array = n->shared;
int max = shared_array->limit - shared_array->avail;
@@ -3383,7 +3359,7 @@ free_done:
STATS_SET_FREEABLE(cachep, i);
}
#endif
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
ac->avail -= batchcount;
memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
slabs_destroy(cachep, &list);
@@ -3470,22 +3446,11 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
{
void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_);
- trace_kmem_cache_alloc(_RET_IP_, ret, cachep,
- cachep->object_size, cachep->size, flags);
+ trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, NUMA_NO_NODE);
return ret;
}
-/**
- * kmem_cache_alloc - Allocate an object
- * @cachep: The cache to allocate from.
- * @flags: See kmalloc().
- *
- * Allocate an object from this cache. The flags are only relevant
- * if the cache has no available objects.
- *
- * Return: pointer to the new object or %NULL in case of error
- */
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
return __kmem_cache_alloc_lru(cachep, NULL, flags);
@@ -3521,7 +3486,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
local_irq_disable();
for (i = 0; i < size; i++) {
- void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags);
+ void *objp = kfence_alloc(s, s->object_size, flags) ?:
+ __do_cache_alloc(s, flags, NUMA_NO_NODE);
if (unlikely(!objp))
goto error;
@@ -3536,35 +3502,18 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
* Done outside of the IRQ disabled section.
*/
slab_post_alloc_hook(s, objcg, flags, size, p,
- slab_want_init_on_alloc(flags, s));
+ slab_want_init_on_alloc(flags, s), s->object_size);
/* FIXME: Trace call missing. Christoph would like a bulk variant */
return size;
error:
local_irq_enable();
cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
- slab_post_alloc_hook(s, objcg, flags, i, p, false);
+ slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size);
kmem_cache_free_bulk(s, i, p);
return 0;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
-#ifdef CONFIG_TRACING
-void *
-kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
-{
- void *ret;
-
- ret = slab_alloc(cachep, NULL, flags, size, _RET_IP_);
-
- ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc(_RET_IP_, ret, cachep,
- size, cachep->size, flags);
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_alloc_trace);
-#endif
-
-#ifdef CONFIG_NUMA
/**
* kmem_cache_alloc_node - Allocate an object on the specified node
* @cachep: The cache to allocate from.
@@ -3580,65 +3529,21 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
*/
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
- void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_);
+ void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_);
- trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep,
- cachep->object_size, cachep->size,
- flags, nodeid);
+ trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, nodeid);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
- gfp_t flags,
- int nodeid,
- size_t size)
+void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+ int nodeid, size_t orig_size,
+ unsigned long caller)
{
- void *ret;
-
- ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_);
-
- ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc_node(_RET_IP_, ret, cachep,
- size, cachep->size,
- flags, nodeid);
- return ret;
+ return slab_alloc_node(cachep, NULL, flags, nodeid,
+ orig_size, caller);
}
-EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
-#endif
-
-static __always_inline void *
-__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
-{
- struct kmem_cache *cachep;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return NULL;
- cachep = kmalloc_slab(size, flags);
- if (unlikely(ZERO_OR_NULL_PTR(cachep)))
- return cachep;
- ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
- ret = kasan_kmalloc(cachep, ret, size, flags);
-
- return ret;
-}
-
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
-{
- return __do_kmalloc_node(size, flags, node, _RET_IP_);
-}
-EXPORT_SYMBOL(__kmalloc_node);
-
-void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
- int node, unsigned long caller)
-{
- return __do_kmalloc_node(size, flags, node, caller);
-}
-EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#endif /* CONFIG_NUMA */
#ifdef CONFIG_PRINTK
void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
@@ -3662,45 +3567,25 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
}
#endif
-/**
- * __do_kmalloc - allocate memory
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate (see kmalloc).
- * @caller: function caller for debug tracking of the caller
- *
- * Return: pointer to the allocated memory or %NULL in case of error
- */
-static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
- unsigned long caller)
+static __always_inline
+void __do_kmem_cache_free(struct kmem_cache *cachep, void *objp,
+ unsigned long caller)
{
- struct kmem_cache *cachep;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return NULL;
- cachep = kmalloc_slab(size, flags);
- if (unlikely(ZERO_OR_NULL_PTR(cachep)))
- return cachep;
- ret = slab_alloc(cachep, NULL, flags, size, caller);
-
- ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc(caller, ret, cachep,
- size, cachep->size, flags);
-
- return ret;
-}
+ unsigned long flags;
-void *__kmalloc(size_t size, gfp_t flags)
-{
- return __do_kmalloc(size, flags, _RET_IP_);
+ local_irq_save(flags);
+ debug_check_no_locks_freed(objp, cachep->object_size);
+ if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(objp, cachep->object_size);
+ __cache_free(cachep, objp, caller);
+ local_irq_restore(flags);
}
-EXPORT_SYMBOL(__kmalloc);
-void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
+void __kmem_cache_free(struct kmem_cache *cachep, void *objp,
+ unsigned long caller)
{
- return __do_kmalloc(size, flags, caller);
+ __do_kmem_cache_free(cachep, objp, caller);
}
-EXPORT_SYMBOL(__kmalloc_track_caller);
/**
* kmem_cache_free - Deallocate an object
@@ -3712,34 +3597,38 @@ EXPORT_SYMBOL(__kmalloc_track_caller);
*/
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
- unsigned long flags;
cachep = cache_from_obj(cachep, objp);
if (!cachep)
return;
- trace_kmem_cache_free(_RET_IP_, objp, cachep->name);
- local_irq_save(flags);
- debug_check_no_locks_freed(objp, cachep->object_size);
- if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
- debug_check_no_obj_freed(objp, cachep->object_size);
- __cache_free(cachep, objp, _RET_IP_);
- local_irq_restore(flags);
+ trace_kmem_cache_free(_RET_IP_, objp, cachep);
+ __do_kmem_cache_free(cachep, objp, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_free);
void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
{
- struct kmem_cache *s;
- size_t i;
local_irq_disable();
- for (i = 0; i < size; i++) {
+ for (int i = 0; i < size; i++) {
void *objp = p[i];
+ struct kmem_cache *s;
- if (!orig_s) /* called via kfree_bulk */
- s = virt_to_cache(objp);
- else
+ if (!orig_s) {
+ struct folio *folio = virt_to_folio(objp);
+
+ /* called via kfree_bulk */
+ if (!folio_test_slab(folio)) {
+ local_irq_enable();
+ free_large_kmalloc(folio, objp);
+ local_irq_disable();
+ continue;
+ }
+ s = folio_slab(folio)->slab_cache;
+ } else {
s = cache_from_obj(orig_s, objp);
+ }
+
if (!s)
continue;
@@ -3755,39 +3644,6 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
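An illustrative caller of the reworked bulk free (a sketch, not part of the patch): kfree_bulk() passes NULL as orig_s, so the loop above now copes with a mix of slab-backed objects and page-backed large kmalloc objects in one call. The sizes below are assumptions; which request ends up beyond KMALLOC_MAX_CACHE_SIZE depends on the configuration.

	#include <linux/slab.h>

	static void bulk_free_example(void)
	{
		void *objs[2];

		objs[0] = kmalloc(64, GFP_KERNEL);	/* slab-backed */
		objs[1] = kmalloc(1 << 20, GFP_KERNEL);	/* assumed to exceed KMALLOC_MAX_CACHE_SIZE */

		if (objs[0] && objs[1]) {
			kfree_bulk(2, objs);		/* one call frees both kinds */
		} else {
			kfree(objs[0]);			/* kfree(NULL) is a no-op */
			kfree(objs[1]);
		}
	}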
-/**
- * kfree - free previously allocated memory
- * @objp: pointer returned by kmalloc.
- *
- * If @objp is NULL, no operation is performed.
- *
- * Don't free memory not originally allocated by kmalloc()
- * or you will run into trouble.
- */
-void kfree(const void *objp)
-{
- struct kmem_cache *c;
- unsigned long flags;
-
- trace_kfree(_RET_IP_, objp);
-
- if (unlikely(ZERO_OR_NULL_PTR(objp)))
- return;
- local_irq_save(flags);
- kfree_debugcheck(objp);
- c = virt_to_cache(objp);
- if (!c) {
- local_irq_restore(flags);
- return;
- }
- debug_check_no_locks_freed(objp, c->object_size);
-
- debug_check_no_obj_freed(objp, c->object_size);
- __cache_free(c, (void *)objp, _RET_IP_);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(kfree);
-
/*
* This initializes kmem_cache_node or resizes various caches for all nodes.
*/
@@ -3860,9 +3716,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
node = cpu_to_mem(cpu);
n = get_node(cachep, node);
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
free_block(cachep, ac->entry, ac->avail, node, &list);
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
}
free_percpu(prev);
@@ -3954,9 +3810,9 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
return;
}
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
drain_array_locked(cachep, ac, node, false, &list);
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
}
@@ -4040,7 +3896,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
for_each_kmem_cache_node(cachep, node, n) {
check_irq_on();
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
total_slabs += n->total_slabs;
free_slabs += n->free_slabs;
@@ -4049,7 +3905,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
if (n->shared)
shared_avail += n->shared->avail;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
}
num_objs = total_slabs * cachep->num;
active_slabs = total_slabs - free_slabs;
@@ -4190,28 +4046,3 @@ void __check_heap_object(const void *ptr, unsigned long n,
usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
-
-/**
- * __ksize -- Uninstrumented ksize.
- * @objp: pointer to the object
- *
- * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same
- * safety checks as ksize() with KASAN instrumentation enabled.
- *
- * Return: size of the actual memory used by @objp in bytes
- */
-size_t __ksize(const void *objp)
-{
- struct kmem_cache *c;
- size_t size;
-
- BUG_ON(!objp);
- if (unlikely(objp == ZERO_SIZE_PTR))
- return 0;
-
- c = virt_to_cache(objp);
- size = c ? c->object_size : 0;
-
- return size;
-}
-EXPORT_SYMBOL(__ksize);
diff --git a/mm/slab.h b/mm/slab.h
index 4ec82bec15ec..7cc432969945 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -11,37 +11,43 @@ struct slab {
#if defined(CONFIG_SLAB)
+ struct kmem_cache *slab_cache;
union {
- struct list_head slab_list;
+ struct {
+ struct list_head slab_list;
+ void *freelist; /* array of free object indexes */
+ void *s_mem; /* first object */
+ };
struct rcu_head rcu_head;
};
- struct kmem_cache *slab_cache;
- void *freelist; /* array of free object indexes */
- void *s_mem; /* first object */
unsigned int active;
#elif defined(CONFIG_SLUB)
- union {
- struct list_head slab_list;
- struct rcu_head rcu_head;
-#ifdef CONFIG_SLUB_CPU_PARTIAL
- struct {
- struct slab *next;
- int slabs; /* Nr of slabs left */
- };
-#endif
- };
struct kmem_cache *slab_cache;
- /* Double-word boundary */
- void *freelist; /* first free object */
union {
- unsigned long counters;
struct {
- unsigned inuse:16;
- unsigned objects:15;
- unsigned frozen:1;
+ union {
+ struct list_head slab_list;
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ struct {
+ struct slab *next;
+ int slabs; /* Nr of slabs left */
+ };
+#endif
+ };
+ /* Double-word boundary */
+ void *freelist; /* first free object */
+ union {
+ unsigned long counters;
+ struct {
+ unsigned inuse:16;
+ unsigned objects:15;
+ unsigned frozen:1;
+ };
+ };
};
+ struct rcu_head rcu_head;
};
unsigned int __unused;
@@ -66,9 +72,10 @@ struct slab {
#define SLAB_MATCH(pg, sl) \
static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
SLAB_MATCH(flags, __page_flags);
-SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */
#ifndef CONFIG_SLOB
-SLAB_MATCH(rcu_head, rcu_head);
+SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */
+#else
+SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */
#endif
SLAB_MATCH(_refcount, __page_refcount);
#ifdef CONFIG_MEMCG
@@ -76,6 +83,9 @@ SLAB_MATCH(memcg_data, memcg_data);
#endif
#undef SLAB_MATCH
static_assert(sizeof(struct slab) <= sizeof(struct page));
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && defined(CONFIG_SLUB)
+static_assert(IS_ALIGNED(offsetof(struct slab, freelist), 2*sizeof(void *)));
+#endif
/**
* folio_slab - Converts from folio to slab.
@@ -207,8 +217,6 @@ struct kmem_cache {
unsigned int size; /* The aligned/padded/added on size */
unsigned int align; /* Alignment as calculated */
slab_flags_t flags; /* Active flags on the slab */
- unsigned int useroffset;/* Usercopy region offset */
- unsigned int usersize; /* Usercopy region size */
const char *name; /* Slab name for sysfs */
int refcount; /* Use counter */
void (*ctor)(void *); /* Called on object slot creation */
@@ -273,6 +281,11 @@ void create_kmalloc_caches(slab_flags_t);
/* Find the kmalloc slab corresponding for a certain size */
struct kmem_cache *kmalloc_slab(size_t, gfp_t);
+
+void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
+ int node, size_t orig_size,
+ unsigned long caller);
+void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller);
#endif
gfp_t kmalloc_fix_flags(gfp_t flags);
@@ -331,7 +344,8 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
SLAB_ACCOUNT)
#elif defined(CONFIG_SLUB)
#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | SLAB_ACCOUNT | SLAB_NO_USER_FLAGS)
+ SLAB_TEMPORARY | SLAB_ACCOUNT | \
+ SLAB_NO_USER_FLAGS | SLAB_KMALLOC)
#else
#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE)
#endif
@@ -351,6 +365,7 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
SLAB_RECLAIM_ACCOUNT | \
SLAB_TEMPORARY | \
SLAB_ACCOUNT | \
+ SLAB_KMALLOC | \
SLAB_NO_USER_FLAGS)
bool __kmem_cache_empty(struct kmem_cache *);
@@ -658,8 +673,13 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
print_tracking(cachep, x);
return cachep;
}
+
+void free_large_kmalloc(struct folio *folio, void *object);
+
#endif /* CONFIG_SLOB */
+size_t __ksize(const void *objp);
+
static inline size_t slab_ksize(const struct kmem_cache *s)
{
#ifndef CONFIG_SLUB
@@ -710,13 +730,27 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
static inline void slab_post_alloc_hook(struct kmem_cache *s,
struct obj_cgroup *objcg, gfp_t flags,
- size_t size, void **p, bool init)
+ size_t size, void **p, bool init,
+ unsigned int orig_size)
{
+ unsigned int zero_size = s->object_size;
size_t i;
flags &= gfp_allowed_mask;
/*
+	 * For a kmalloc object, the allocated memory size (object_size) is
+	 * likely larger than the requested size (orig_size). If redzone
+	 * checking is enabled for the extra space, don't zero it, as it will
+	 * be redzoned soon. The redzone operation for this extra space can be
+	 * seen as a replacement for the current poisoning under certain debug
+	 * options, and won't break other sanity checks.
+ */
+ if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) &&
+ (s->flags & SLAB_KMALLOC))
+ zero_size = orig_size;
+
+ /*
* As memory initialization might be integrated into KASAN,
* kasan_slab_alloc and initialization memset must be
* kept together to avoid discrepancies in behavior.
@@ -726,9 +760,10 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
for (i = 0; i < size; i++) {
p[i] = kasan_slab_alloc(s, p[i], flags, init);
if (p[i] && init && !kasan_has_integrated_init())
- memset(p[i], 0, s->object_size);
+ memset(p[i], 0, zero_size);
kmemleak_alloc_recursive(p[i], s->object_size, 1,
s->flags, flags);
+ kmsan_slab_alloc(s, p[i], flags);
}
memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
@@ -739,9 +774,8 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
* The slab lists for all objects.
*/
struct kmem_cache_node {
- spinlock_t list_lock;
-
#ifdef CONFIG_SLAB
+ raw_spinlock_t list_lock;
struct list_head slabs_partial; /* partial list first, better asm code */
struct list_head slabs_full;
struct list_head slabs_free;
@@ -757,6 +791,7 @@ struct kmem_cache_node {
#endif
#ifdef CONFIG_SLUB
+ spinlock_t list_lock;
unsigned long nr_partial;
struct list_head partial;
#ifdef CONFIG_SLUB_DEBUG
@@ -860,4 +895,8 @@ void __check_heap_object(const void *ptr, unsigned long n,
}
#endif
+#ifdef CONFIG_SLUB_DEBUG
+void skip_orig_size_check(struct kmem_cache *s, const void *object);
+#endif
+
#endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 07b948288f84..1cba98acc486 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -143,8 +143,10 @@ int slab_unmergeable(struct kmem_cache *s)
if (s->ctor)
return 1;
+#ifdef CONFIG_HARDENED_USERCOPY
if (s->usersize)
return 1;
+#endif
/*
* We may have set a slab to be unmergeable during bootstrap.
@@ -223,8 +225,10 @@ static struct kmem_cache *create_cache(const char *name,
s->size = s->object_size = object_size;
s->align = align;
s->ctor = ctor;
+#ifdef CONFIG_HARDENED_USERCOPY
s->useroffset = useroffset;
s->usersize = usersize;
+#endif
err = __kmem_cache_create(s, flags);
if (err)
@@ -317,7 +321,8 @@ kmem_cache_create_usercopy(const char *name,
flags &= CACHE_CREATE_MASK;
	/* Fail closed on bad usersize or useroffset values. */
- if (WARN_ON(!usersize && useroffset) ||
+ if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
+ WARN_ON(!usersize && useroffset) ||
WARN_ON(size < usersize || size - usersize < useroffset))
usersize = useroffset = 0;
@@ -475,6 +480,7 @@ void slab_kmem_cache_release(struct kmem_cache *s)
void kmem_cache_destroy(struct kmem_cache *s)
{
int refcnt;
+ bool rcu_set;
if (unlikely(!s) || !kasan_check_byte(s))
return;
@@ -482,6 +488,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
cpus_read_lock();
mutex_lock(&slab_mutex);
+ rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
+
refcnt = --s->refcount;
if (refcnt)
goto out_unlock;
@@ -492,7 +500,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
out_unlock:
mutex_unlock(&slab_mutex);
cpus_read_unlock();
- if (!refcnt && !(s->flags & SLAB_TYPESAFE_BY_RCU))
+ if (!refcnt && !rcu_set)
kmem_cache_release(s);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -508,13 +516,9 @@ EXPORT_SYMBOL(kmem_cache_destroy);
*/
int kmem_cache_shrink(struct kmem_cache *cachep)
{
- int ret;
-
-
kasan_cache_shrink(cachep);
- ret = __kmem_cache_shrink(cachep);
- return ret;
+ return __kmem_cache_shrink(cachep);
}
EXPORT_SYMBOL(kmem_cache_shrink);
@@ -596,8 +600,8 @@ void kmem_dump_obj(void *object)
ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset;
pr_cont(" pointer offset %lu", ptroffset);
}
- if (kp.kp_slab_cache && kp.kp_slab_cache->usersize)
- pr_cont(" size %u", kp.kp_slab_cache->usersize);
+ if (kp.kp_slab_cache && kp.kp_slab_cache->object_size)
+ pr_cont(" size %u", kp.kp_slab_cache->object_size);
if (kp.kp_ret)
pr_cont(" allocated at %pS\n", kp.kp_ret);
else
@@ -641,8 +645,10 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
align = max(align, size);
s->align = calculate_alignment(flags, align, size);
+#ifdef CONFIG_HARDENED_USERCOPY
s->useroffset = useroffset;
s->usersize = usersize;
+#endif
err = __kmem_cache_create(s, flags);
@@ -662,7 +668,8 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
if (!s)
panic("Out of memory when creating slab %s\n", name);
- create_boot_cache(s, name, size, flags, useroffset, usersize);
+ create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset,
+ usersize);
kasan_cache_create_kmalloc(s);
list_add(&s->list, &slab_caches);
s->refcount = 1;
@@ -734,6 +741,26 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
return kmalloc_caches[kmalloc_type(flags)][index];
}
+size_t kmalloc_size_roundup(size_t size)
+{
+ struct kmem_cache *c;
+
+ /* Short-circuit the 0 size case. */
+ if (unlikely(size == 0))
+ return 0;
+ /* Short-circuit saturated "too-large" case. */
+ if (unlikely(size == SIZE_MAX))
+ return SIZE_MAX;
+ /* Above the smaller buckets, size is a multiple of page size. */
+ if (size > KMALLOC_MAX_CACHE_SIZE)
+ return PAGE_SIZE << get_order(size);
+
+ /* The flags don't matter since size_index is common to all. */
+ c = kmalloc_slab(size, GFP_KERNEL);
+ return c ? c->object_size : 0;
+}
+EXPORT_SYMBOL(kmalloc_size_roundup);
+
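A minimal usage sketch of the new helper (illustrative only, not part of the patch; the function and variable names are invented): a caller rounds the request up front so it may legitimately use the whole kmalloc bucket, instead of probing ksize() after the fact.

	#include <linux/slab.h>

	static void *alloc_full_bucket(size_t want, size_t *usable)
	{
		size_t full = kmalloc_size_roundup(want);	/* e.g. 600 -> 1024 */
		void *buf = kmalloc(full, GFP_KERNEL);

		if (buf)
			*usable = full;	/* the whole bucket may safely be used */
		return buf;
	}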
#ifdef CONFIG_ZONE_DMA
#define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
#else
@@ -746,10 +773,16 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
#define KMALLOC_CGROUP_NAME(sz)
#endif
+#ifndef CONFIG_SLUB_TINY
+#define KMALLOC_RCL_NAME(sz) .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #sz,
+#else
+#define KMALLOC_RCL_NAME(sz)
+#endif
+
#define INIT_KMALLOC_INFO(__size, __short_size) \
{ \
.name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
- .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \
+ KMALLOC_RCL_NAME(__short_size) \
KMALLOC_CGROUP_NAME(__short_size) \
KMALLOC_DMA_NAME(__short_size) \
.size = __size, \
@@ -757,8 +790,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
/*
* kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
- * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is
- * kmalloc-32M.
+ * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
+ * kmalloc-2M.
*/
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
INIT_KMALLOC_INFO(0, 0),
@@ -782,11 +815,7 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = {
INIT_KMALLOC_INFO(262144, 256k),
INIT_KMALLOC_INFO(524288, 512k),
INIT_KMALLOC_INFO(1048576, 1M),
- INIT_KMALLOC_INFO(2097152, 2M),
- INIT_KMALLOC_INFO(4194304, 4M),
- INIT_KMALLOC_INFO(8388608, 8M),
- INIT_KMALLOC_INFO(16777216, 16M),
- INIT_KMALLOC_INFO(33554432, 32M)
+ INIT_KMALLOC_INFO(2097152, 2M)
};
/*
@@ -839,7 +868,7 @@ void __init setup_kmalloc_cache_index_table(void)
static void __init
new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
{
- if (type == KMALLOC_RECLAIM) {
+ if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) {
flags |= SLAB_RECLAIM_ACCOUNT;
} else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) {
if (mem_cgroup_kmem_disabled()) {
@@ -899,6 +928,158 @@ void __init create_kmalloc_caches(slab_flags_t flags)
/* Kmalloc array is now usable */
slab_state = UP;
}
+
+void free_large_kmalloc(struct folio *folio, void *object)
+{
+ unsigned int order = folio_order(folio);
+
+ if (WARN_ON_ONCE(order == 0))
+ pr_warn_once("object pointer: 0x%p\n", object);
+
+ kmemleak_free(object);
+ kasan_kfree_large(object);
+ kmsan_kfree_large(object);
+
+ mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
+ __free_pages(folio_page(folio, 0), order);
+}
+
+static void *__kmalloc_large_node(size_t size, gfp_t flags, int node);
+static __always_inline
+void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
+{
+ struct kmem_cache *s;
+ void *ret;
+
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
+ ret = __kmalloc_large_node(size, flags, node);
+ trace_kmalloc(caller, ret, size,
+ PAGE_SIZE << get_order(size), flags, node);
+ return ret;
+ }
+
+ s = kmalloc_slab(size, flags);
+
+ if (unlikely(ZERO_OR_NULL_PTR(s)))
+ return s;
+
+ ret = __kmem_cache_alloc_node(s, flags, node, size, caller);
+ ret = kasan_kmalloc(s, ret, size, flags);
+ trace_kmalloc(caller, ret, size, s->size, flags, node);
+ return ret;
+}
+
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+ return __do_kmalloc_node(size, flags, node, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+
+void *__kmalloc(size_t size, gfp_t flags)
+{
+ return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc);
+
+void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
+ int node, unsigned long caller)
+{
+ return __do_kmalloc_node(size, flags, node, caller);
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+
+/**
+ * kfree - free previously allocated memory
+ * @object: pointer returned by kmalloc.
+ *
+ * If @object is NULL, no operation is performed.
+ *
+ * Don't free memory not originally allocated by kmalloc()
+ * or you will run into trouble.
+ */
+void kfree(const void *object)
+{
+ struct folio *folio;
+ struct slab *slab;
+ struct kmem_cache *s;
+
+ trace_kfree(_RET_IP_, object);
+
+ if (unlikely(ZERO_OR_NULL_PTR(object)))
+ return;
+
+ folio = virt_to_folio(object);
+ if (unlikely(!folio_test_slab(folio))) {
+ free_large_kmalloc(folio, (void *)object);
+ return;
+ }
+
+ slab = folio_slab(folio);
+ s = slab->slab_cache;
+ __kmem_cache_free(s, (void *)object, _RET_IP_);
+}
+EXPORT_SYMBOL(kfree);
+
+/**
+ * __ksize -- Report full size of underlying allocation
+ * @object: pointer to the object
+ *
+ * This should only be used internally to query the true size of allocations.
+ * It is not meant to be a way to discover the usable size of an allocation
+ * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond
+ * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
+ * and/or FORTIFY_SOURCE.
+ *
+ * Return: size of the actual memory used by @object in bytes
+ */
+size_t __ksize(const void *object)
+{
+ struct folio *folio;
+
+ if (unlikely(object == ZERO_SIZE_PTR))
+ return 0;
+
+ folio = virt_to_folio(object);
+
+ if (unlikely(!folio_test_slab(folio))) {
+ if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE))
+ return 0;
+ if (WARN_ON(object != folio_address(folio)))
+ return 0;
+ return folio_size(folio);
+ }
+
+#ifdef CONFIG_SLUB_DEBUG
+ skip_orig_size_check(folio_slab(folio)->slab_cache, object);
+#endif
+
+ return slab_ksize(folio_slab(folio)->slab_cache);
+}
+
+void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
+{
+ void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE,
+ size, _RET_IP_);
+
+ trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE);
+
+ ret = kasan_kmalloc(s, ret, size, gfpflags);
+ return ret;
+}
+EXPORT_SYMBOL(kmalloc_trace);
+
+void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
+ int node, size_t size)
+{
+ void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_);
+
+ trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node);
+
+ ret = kasan_kmalloc(s, ret, size, gfpflags);
+ return ret;
+}
+EXPORT_SYMBOL(kmalloc_node_trace);
#endif /* !CONFIG_SLOB */
gfp_t kmalloc_fix_flags(gfp_t flags)
@@ -918,37 +1099,51 @@ gfp_t kmalloc_fix_flags(gfp_t flags)
* directly to the page allocator. We use __GFP_COMP, because we will need to
* know the allocation order to free the pages properly in kfree.
*/
-void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
+
+static void *__kmalloc_large_node(size_t size, gfp_t flags, int node)
{
- void *ret = NULL;
struct page *page;
+ void *ptr = NULL;
+ unsigned int order = get_order(size);
if (unlikely(flags & GFP_SLAB_BUG_MASK))
flags = kmalloc_fix_flags(flags);
flags |= __GFP_COMP;
- page = alloc_pages(flags, order);
- if (likely(page)) {
- ret = page_address(page);
+ page = alloc_pages_node(node, flags, order);
+ if (page) {
+ ptr = page_address(page);
mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
PAGE_SIZE << order);
}
- ret = kasan_kmalloc_large(ret, size, flags);
- /* As ret might get tagged, call kmemleak hook after KASAN. */
- kmemleak_alloc(ret, size, 1, flags);
+
+ ptr = kasan_kmalloc_large(ptr, size, flags);
+ /* As ptr might get tagged, call kmemleak hook after KASAN. */
+ kmemleak_alloc(ptr, size, 1, flags);
+ kmsan_kmalloc_large(ptr, size, flags);
+
+ return ptr;
+}
+
+void *kmalloc_large(size_t size, gfp_t flags)
+{
+ void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE);
+
+ trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
+ flags, NUMA_NO_NODE);
return ret;
}
-EXPORT_SYMBOL(kmalloc_order);
+EXPORT_SYMBOL(kmalloc_large);
-#ifdef CONFIG_TRACING
-void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
- void *ret = kmalloc_order(size, flags, order);
- trace_kmalloc(_RET_IP_, ret, NULL, size, PAGE_SIZE << order, flags);
+ void *ret = __kmalloc_large_node(size, flags, node);
+
+ trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
+ flags, node);
return ret;
}
-EXPORT_SYMBOL(kmalloc_order_trace);
-#endif
+EXPORT_SYMBOL(kmalloc_large_node);
#ifdef CONFIG_SLAB_FREELIST_RANDOM
/* Randomize a generic freelist */
@@ -1147,17 +1342,17 @@ module_init(slab_proc_init);
#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
-static __always_inline void *__do_krealloc(const void *p, size_t new_size,
- gfp_t flags)
+static __always_inline __realloc_size(2) void *
+__do_krealloc(const void *p, size_t new_size, gfp_t flags)
{
void *ret;
size_t ks;
- /* Don't use instrumented ksize to allow precise KASAN poisoning. */
+ /* Check for double-free before calling ksize. */
if (likely(!ZERO_OR_NULL_PTR(p))) {
if (!kasan_check_byte(p))
return NULL;
- ks = kfence_ksize(p) ?: __ksize(p);
+ ks = ksize(p);
} else
ks = 0;
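A usage sketch (illustrative, not from the patch): krealloc() returns the existing pointer unchanged when the new size still fits within ksize(p), which is why the double-free check above has to run before ksize() is consulted.

	#include <linux/slab.h>

	static void *grow_buf(void *buf, size_t new_len)
	{
		/* may return buf itself if new_len <= ksize(buf) */
		return krealloc(buf, new_len, GFP_KERNEL);
	}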
@@ -1225,35 +1420,21 @@ void kfree_sensitive(const void *p)
void *mem = (void *)p;
ks = ksize(mem);
- if (ks)
+ if (ks) {
+ kasan_unpoison_range(mem, ks);
memzero_explicit(mem, ks);
+ }
kfree(mem);
}
EXPORT_SYMBOL(kfree_sensitive);
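Since ksize() no longer unpoisons the full allocation, kfree_sensitive() now unpoisons the range itself before memzero_explicit(), so wiping past the originally requested size cannot trip KASAN. A hypothetical caller (names invented for illustration):

	#include <linux/slab.h>

	static void drop_secret(void **keyp)
	{
		kfree_sensitive(*keyp);	/* zeroes the whole allocation, then frees it */
		*keyp = NULL;		/* avoid a dangling pointer to the wiped key */
	}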
-/**
- * ksize - get the actual amount of memory allocated for a given object
- * @objp: Pointer to the object
- *
- * kmalloc may internally round up allocations and return more memory
- * than requested. ksize() can be used to determine the actual amount of
- * memory allocated. The caller may use this additional memory, even though
- * a smaller amount of memory was initially specified with the kmalloc call.
- * The caller must guarantee that objp points to a valid object previously
- * allocated with either kmalloc() or kmem_cache_alloc(). The object
- * must not be freed during the duration of the call.
- *
- * Return: size of the actual memory used by @objp in bytes
- */
size_t ksize(const void *objp)
{
- size_t size;
-
/*
- * We need to first check that the pointer to the object is valid, and
- * only then unpoison the memory. The report printed from ksize() is
- * more useful, then when it's printed later when the behaviour could
- * be undefined due to a potential use-after-free or double-free.
+ * We need to first check that the pointer to the object is valid.
+	 * The KASAN report printed from ksize() is more useful than one
+	 * printed later, when the behaviour could be undefined due to
+ * a potential use-after-free or double-free.
*
* We use kasan_check_byte(), which is supported for the hardware
* tag-based KASAN mode, unlike kasan_check_read/write().
@@ -1267,21 +1448,13 @@ size_t ksize(const void *objp)
if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
return 0;
- size = kfence_ksize(objp) ?: __ksize(objp);
- /*
- * We assume that ksize callers could use whole allocated area,
- * so we need to unpoison this area.
- */
- kasan_unpoison_range(objp, size);
- return size;
+ return kfence_ksize(objp) ?: __ksize(objp);
}
EXPORT_SYMBOL(ksize);
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
-EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
-EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
EXPORT_TRACEPOINT_SYMBOL(kfree);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
diff --git a/mm/slob.c b/mm/slob.c
index 2bd4f476c340..fe567fcfa3a3 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -507,8 +507,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
*m = size;
ret = (void *)m + minalign;
- trace_kmalloc_node(caller, ret, NULL,
- size, size + minalign, gfp, node);
+ trace_kmalloc(caller, ret, size, size + minalign, gfp, node);
} else {
unsigned int order = get_order(size);
@@ -516,8 +515,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
gfp |= __GFP_COMP;
ret = slob_new_pages(gfp, order, node);
- trace_kmalloc_node(caller, ret, NULL,
- size, PAGE_SIZE << order, gfp, node);
+ trace_kmalloc(caller, ret, size, PAGE_SIZE << order, gfp, node);
}
kmemleak_alloc(ret, size, 1, gfp);
@@ -530,20 +528,12 @@ void *__kmalloc(size_t size, gfp_t gfp)
}
EXPORT_SYMBOL(__kmalloc);
-void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
-{
- return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
-}
-EXPORT_SYMBOL(__kmalloc_track_caller);
-
-#ifdef CONFIG_NUMA
void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
int node, unsigned long caller)
{
return __do_kmalloc_node(size, gfp, node, caller);
}
EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#endif
void kfree(const void *block)
{
@@ -574,6 +564,20 @@ void kfree(const void *block)
}
EXPORT_SYMBOL(kfree);
+size_t kmalloc_size_roundup(size_t size)
+{
+ /* Short-circuit the 0 size case. */
+ if (unlikely(size == 0))
+ return 0;
+ /* Short-circuit saturated "too-large" case. */
+ if (unlikely(size == SIZE_MAX))
+ return SIZE_MAX;
+
+ return ALIGN(size, ARCH_KMALLOC_MINALIGN);
+}
+EXPORT_SYMBOL(kmalloc_size_roundup);
+
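A worked example of the SLOB variant, assuming ARCH_KMALLOC_MINALIGN is 8 on this configuration: kmalloc_size_roundup(13) returns ALIGN(13, 8) = 16, since SLOB pads only to the minimum alignment rather than to a power-of-two bucket size.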
/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
size_t __ksize(const void *block)
{
@@ -594,7 +598,6 @@ size_t __ksize(const void *block)
m = (unsigned int *)(block - align);
return SLOB_UNITS(*m) * SLOB_UNIT;
}
-EXPORT_SYMBOL(__ksize);
int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
{
@@ -602,6 +605,9 @@ int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
/* leave room for rcu footer at the end of object */
c->size += sizeof(struct slob_rcu);
}
+
+ /* Actual size allocated */
+ c->size = SLOB_UNITS(c->size) * SLOB_UNIT;
c->flags = flags;
return 0;
}
@@ -616,14 +622,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
if (c->size < PAGE_SIZE) {
b = slob_alloc(c->size, flags, c->align, node, 0);
- trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size,
- SLOB_UNITS(c->size) * SLOB_UNIT,
- flags, node);
+ trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node);
} else {
b = slob_new_pages(flags, get_order(c->size), node);
- trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size,
- PAGE_SIZE << get_order(c->size),
- flags, node);
+ trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node);
}
if (b && c->ctor) {
@@ -647,7 +649,7 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, gfp_
return slob_alloc_node(cachep, flags, NUMA_NO_NODE);
}
EXPORT_SYMBOL(kmem_cache_alloc_lru);
-#ifdef CONFIG_NUMA
+
void *__kmalloc_node(size_t size, gfp_t gfp, int node)
{
return __do_kmalloc_node(size, gfp, node, _RET_IP_);
@@ -659,7 +661,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node)
return slob_alloc_node(cachep, gfp, node);
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#endif
static void __kmem_cache_free(void *b, int size)
{
@@ -680,7 +681,7 @@ static void kmem_rcu_free(struct rcu_head *head)
void kmem_cache_free(struct kmem_cache *c, void *b)
{
kmemleak_free_recursive(b, c->flags);
- trace_kmem_cache_free(_RET_IP_, b, c->name);
+ trace_kmem_cache_free(_RET_IP_, b, c);
if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) {
struct slob_rcu *slob_rcu;
slob_rcu = b + (c->size - sizeof(struct slob_rcu));
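
Before moving on to slub.c: the slob.c hunks above also add kmalloc_size_roundup(),
which reports how much usable space a request will actually get (for SLOB,
ALIGN(size, ARCH_KMALLOC_MINALIGN)), so callers can size growable buffers to the
bucket instead of wasting the slack. A minimal usage sketch with a hypothetical
caller that is not part of this patch:

    /* Hypothetical caller: size a buffer to the full kmalloc bucket. */
    static char *grow_buf(size_t want, size_t *usable, gfp_t gfp)
    {
            size_t bucket = kmalloc_size_roundup(want);
            char *buf = kmalloc(bucket, gfp);

            if (buf)
                    *usable = bucket;   /* every byte up to 'bucket' may be used */
            return buf;
    }
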
diff --git a/mm/slub.c b/mm/slub.c
index 862dbd9af4f5..13459c69095a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -22,6 +22,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
+#include <linux/kmsan.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
@@ -38,6 +39,7 @@
#include <linux/memcontrol.h>
#include <linux/random.h>
#include <kunit/test.h>
+#include <kunit/test-bug.h>
#include <linux/sort.h>
#include <linux/debugfs.h>
@@ -50,7 +52,7 @@
* 1. slab_mutex (Global Mutex)
* 2. node->list_lock (Spinlock)
* 3. kmem_cache->cpu_slab->lock (Local lock)
- * 4. slab_lock(slab) (Only on some arches or for debugging)
+ * 4. slab_lock(slab) (Only on some arches)
* 5. object_map_lock (Only for debugging)
*
* slab_mutex
@@ -64,8 +66,9 @@
* The slab_lock is a wrapper around the page lock, thus it is a bit
* spinlock.
*
- * The slab_lock is only used for debugging and on arches that do not
- * have the ability to do a cmpxchg_double. It only protects:
+ * The slab_lock is only used on arches that do not have the ability
+ * to do a cmpxchg_double. It only protects:
+ *
* A. slab->freelist -> List of free objects in a slab
* B. slab->inuse -> Number of objects in use
* C. slab->objects -> Number of objects in slab
@@ -94,15 +97,20 @@
* allocating a long series of objects that fill up slabs does not require
* the list lock.
*
+ * For debug caches, all allocations are forced to go through a list_lock
+ * protected region to serialize against concurrent validation.
+ *
* cpu_slab->lock local lock
*
* This locks protect slowpath manipulation of all kmem_cache_cpu fields
* except the stat counters. This is a percpu structure manipulated only by
* the local cpu, so the lock protects against being preempted or interrupted
* by an irq. Fast path operations rely on lockless operations instead.
- * On PREEMPT_RT, the local lock does not actually disable irqs (and thus
- * prevent the lockless operations), so fastpath operations also need to take
- * the lock and are no longer lockless.
+ *
+ * On PREEMPT_RT, the local lock neither disables interrupts nor preemption,
+ * which means the lockless fastpath cannot be used as it might interfere with
+ * an in-progress slow path operation. In this case the local lock is always
+ * taken, but it still uses the freelist for the common operations.
*
* lockless fastpaths
*
@@ -163,8 +171,9 @@
* function call even on !PREEMPT_RT, use inline preempt_disable() there.
*/
#ifndef CONFIG_PREEMPT_RT
-#define slub_get_cpu_ptr(var) get_cpu_ptr(var)
-#define slub_put_cpu_ptr(var) put_cpu_ptr(var)
+#define slub_get_cpu_ptr(var) get_cpu_ptr(var)
+#define slub_put_cpu_ptr(var) put_cpu_ptr(var)
+#define USE_LOCKLESS_FAST_PATH() (true)
#else
#define slub_get_cpu_ptr(var) \
({ \
@@ -176,6 +185,13 @@ do { \
(void)(var); \
migrate_enable(); \
} while (0)
+#define USE_LOCKLESS_FAST_PATH() (false)
+#endif
+
+#ifndef CONFIG_SLUB_TINY
+#define __fastpath_inline __always_inline
+#else
+#define __fastpath_inline
#endif
#ifdef CONFIG_SLUB_DEBUG
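
USE_LOCKLESS_FAST_PATH() defined above replaces the scattered
IS_ENABLED(CONFIG_PREEMPT_RT) checks in the fast paths: it expands to a
compile-time (true) or (false), so the branch that does not apply is simply
discarded. A minimal illustration of the shape the later alloc/free hunks take
(simplified sketch, not the literal kernel code):

    static inline void fastpath_shape(struct kmem_cache *s)
    {
            if (USE_LOCKLESS_FAST_PATH()) {
                    /* !PREEMPT_RT: lockless this_cpu cmpxchg on freelist/tid */
            } else {
                    /* PREEMPT_RT: take s->cpu_slab->lock, still walk the cpu freelist */
            }
    }
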
@@ -186,11 +202,24 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
#endif /* CONFIG_SLUB_DEBUG */
+/* Structure holding parameters for get_partial() call chain */
+struct partial_context {
+ struct slab **slab;
+ gfp_t flags;
+ unsigned int orig_size;
+};
+
static inline bool kmem_cache_debug(struct kmem_cache *s)
{
return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
}
+static inline bool slub_debug_orig_size(struct kmem_cache *s)
+{
+ return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
+ (s->flags & SLAB_KMALLOC));
+}
+
void *fixup_red_left(struct kmem_cache *s, void *p)
{
if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
@@ -219,6 +248,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG
+#ifndef CONFIG_SLUB_TINY
/*
* Minimum number of partial slabs. These will be left on the partial
* lists even if they are empty. kmem_cache_shrink may reclaim them.
@@ -231,6 +261,10 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
* sort the partial list by the number of objects in use.
*/
#define MAX_PARTIAL 10
+#else
+#define MIN_PARTIAL 0
+#define MAX_PARTIAL 0
+#endif
#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
SLAB_POISON | SLAB_STORE_USER)
@@ -276,7 +310,7 @@ struct track {
enum track_item { TRACK_ALLOC, TRACK_FREE };
-#ifdef CONFIG_SYSFS
+#ifdef SLAB_SUPPORTS_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
#else
@@ -310,6 +344,13 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
*/
static nodemask_t slab_nodes;
+#ifndef CONFIG_SLUB_TINY
+/*
+ * Workqueue used for flush_cpu_slab().
+ */
+static struct workqueue_struct *flushwq;
+#endif
+
/********************************************************************
* Core slab cache functions
*******************************************************************/
@@ -354,11 +395,24 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
return freelist_dereference(s, object + s->offset);
}
+#ifndef CONFIG_SLUB_TINY
static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
prefetchw(object + s->offset);
}
+#endif
+/*
+ * When running under KMSAN, get_freepointer_safe() may return an uninitialized
+ * pointer value in the case the current thread loses the race for the next
+ * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in
+ * slab_alloc_node() will fail, so the uninitialized value won't be used, but
+ * KMSAN will still check all arguments of cmpxchg because of imperfect
+ * handling of inline assembly.
+ * To work around this problem, we apply __no_kmsan_checks to ensure that
+ * get_freepointer_safe() returns initialized memory.
+ */
+__no_kmsan_checks
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
unsigned long freepointer_addr;
@@ -442,7 +496,7 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
/*
* Per slab locking using the pagelock
*/
-static __always_inline void __slab_lock(struct slab *slab)
+static __always_inline void slab_lock(struct slab *slab)
{
struct page *page = slab_page(slab);
@@ -450,7 +504,7 @@ static __always_inline void __slab_lock(struct slab *slab)
bit_spin_lock(PG_locked, &page->flags);
}
-static __always_inline void __slab_unlock(struct slab *slab)
+static __always_inline void slab_unlock(struct slab *slab)
{
struct page *page = slab_page(slab);
@@ -458,31 +512,19 @@ static __always_inline void __slab_unlock(struct slab *slab)
__bit_spin_unlock(PG_locked, &page->flags);
}
-static __always_inline void slab_lock(struct slab *slab, unsigned long *flags)
-{
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- local_irq_save(*flags);
- __slab_lock(slab);
-}
-
-static __always_inline void slab_unlock(struct slab *slab, unsigned long *flags)
-{
- __slab_unlock(slab);
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- local_irq_restore(*flags);
-}
-
/*
* Interrupts must be disabled (for the fallback code to work right), typically
- * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different
- * so we disable interrupts as part of slab_[un]lock().
+ * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
+ * part of bit_spin_lock(), is sufficient because the policy is not to allow any
+ * allocation/free operation in hardirq context. Therefore nothing can
+ * interrupt the operation.
*/
static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
- if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ if (USE_LOCKLESS_FAST_PATH())
lockdep_assert_irqs_disabled();
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
@@ -494,18 +536,15 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab
} else
#endif
{
- /* init to 0 to prevent spurious warnings */
- unsigned long flags = 0;
-
- slab_lock(slab, &flags);
+ slab_lock(slab);
if (slab->freelist == freelist_old &&
slab->counters == counters_old) {
slab->freelist = freelist_new;
slab->counters = counters_new;
- slab_unlock(slab, &flags);
+ slab_unlock(slab);
return true;
}
- slab_unlock(slab, &flags);
+ slab_unlock(slab);
}
cpu_relax();
@@ -536,16 +575,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
unsigned long flags;
local_irq_save(flags);
- __slab_lock(slab);
+ slab_lock(slab);
if (slab->freelist == freelist_old &&
slab->counters == counters_old) {
slab->freelist = freelist_new;
slab->counters = counters_new;
- __slab_unlock(slab);
+ slab_unlock(slab);
local_irq_restore(flags);
return true;
}
- __slab_unlock(slab);
+ slab_unlock(slab);
local_irq_restore(flags);
}
@@ -561,7 +600,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
#ifdef CONFIG_SLUB_DEBUG
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
-static DEFINE_RAW_SPINLOCK(object_map_lock);
+static DEFINE_SPINLOCK(object_map_lock);
static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
struct slab *slab)
@@ -580,7 +619,7 @@ static bool slab_add_kunit_errors(void)
{
struct kunit_resource *resource;
- if (likely(!current->kunit_test))
+ if (!kunit_get_current_test())
return false;
resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
@@ -595,30 +634,6 @@ static bool slab_add_kunit_errors(void)
static inline bool slab_add_kunit_errors(void) { return false; }
#endif
-/*
- * Determine a map of objects in use in a slab.
- *
- * Node listlock must be held to guarantee that the slab does
- * not vanish from under us.
- */
-static unsigned long *get_map(struct kmem_cache *s, struct slab *slab)
- __acquires(&object_map_lock)
-{
- VM_BUG_ON(!irqs_disabled());
-
- raw_spin_lock(&object_map_lock);
-
- __fill_map(object_map, s, slab);
-
- return object_map;
-}
-
-static void put_map(unsigned long *map) __releases(&object_map_lock)
-{
- VM_BUG_ON(map != object_map);
- raw_spin_unlock(&object_map_lock);
-}
-
static inline unsigned int size_from_object(struct kmem_cache *s)
{
if (s->flags & SLAB_RED_ZONE)
@@ -816,6 +831,55 @@ static void print_slab_info(const struct slab *slab)
folio_flags(folio, 0));
}
+/*
+ * kmalloc caches have fixed sizes (mostly powers of 2), and the kmalloc() API
+ * family rounds the real request size up to these fixed ones, so there can be
+ * extra space beyond what was requested. Save the original request size in
+ * the metadata area for better debugging and sanity checks.
+ */
+static inline void set_orig_size(struct kmem_cache *s,
+ void *object, unsigned int orig_size)
+{
+ void *p = kasan_reset_tag(object);
+
+ if (!slub_debug_orig_size(s))
+ return;
+
+#ifdef CONFIG_KASAN_GENERIC
+ /*
+	 * KASAN may save its free metadata in the object's data area at
+	 * offset 0. If that size is larger than 'orig_size', it will
+	 * overlap the data redzone in [orig_size+1, object_size], and
+	 * the check should be skipped.
+ */
+ if (kasan_metadata_size(s, true) > orig_size)
+ orig_size = s->object_size;
+#endif
+
+ p += get_info_end(s);
+ p += sizeof(struct track) * 2;
+
+ *(unsigned int *)p = orig_size;
+}
+
+static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
+{
+ void *p = kasan_reset_tag(object);
+
+ if (!slub_debug_orig_size(s))
+ return s->object_size;
+
+ p += get_info_end(s);
+ p += sizeof(struct track) * 2;
+
+ return *(unsigned int *)p;
+}
+
+void skip_orig_size_check(struct kmem_cache *s, const void *object)
+{
+ set_orig_size(s, (void *)object, s->object_size);
+}
+
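
With slub_debug_orig_size() enabled, the original request size is stored right
after the two track structs; set_orig_size() and get_orig_size() above compute
the same offset on the store and load sides. A layout sketch, assuming redzoning
and an out-of-object free pointer as laid out by calculate_sizes() (offsets are
illustrative only):

    /*
     * Debug kmalloc object layout (sketch, SLAB_RED_ZONE | SLAB_STORE_USER):
     *
     *   left redzone (red_left_pad)
     *   object data / poison            [0, orig_size)
     *   kmalloc redzone                 [orig_size, object_size)
     *   right redzone                   [object_size, inuse)
     *   free pointer, if stored outside the object (up to get_info_end())
     *   struct track TRACK_ALLOC, struct track TRACK_FREE
     *   unsigned int orig_size          <- set_orig_size() / get_orig_size()
     *   kasan metadata, alignment padding
     */
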
static void slab_bug(struct kmem_cache *s, char *fmt, ...)
{
struct va_format vaf;
@@ -875,7 +939,10 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
- off += kasan_metadata_size(s);
+ if (slub_debug_orig_size(s))
+ off += sizeof(unsigned int);
+
+ off += kasan_metadata_size(s, false);
if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
@@ -931,17 +998,28 @@ static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
static void init_object(struct kmem_cache *s, void *object, u8 val)
{
u8 *p = kasan_reset_tag(object);
+ unsigned int poison_size = s->object_size;
- if (s->flags & SLAB_RED_ZONE)
+ if (s->flags & SLAB_RED_ZONE) {
memset(p - s->red_left_pad, val, s->red_left_pad);
+ if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
+ /*
+			 * Redzone the extra space kmalloc allocated beyond the
+			 * request, and limit the poison size to the original
+			 * request size accordingly.
+ */
+ poison_size = get_orig_size(s, object);
+ }
+ }
+
if (s->flags & __OBJECT_POISON) {
- memset(p, POISON_FREE, s->object_size - 1);
- p[s->object_size - 1] = POISON_END;
+ memset(p, POISON_FREE, poison_size - 1);
+ p[poison_size - 1] = POISON_END;
}
if (s->flags & SLAB_RED_ZONE)
- memset(p + s->object_size, val, s->inuse - s->object_size);
+ memset(p + poison_size, val, s->inuse - poison_size);
}
static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
@@ -1008,7 +1086,8 @@ skip_bug_print:
*
* A. Free pointer (if we cannot overwrite object on free)
* B. Tracking data for SLAB_STORE_USER
- * C. Padding to reach required alignment boundary or at minimum
+ * C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
+ * D. Padding to reach required alignment boundary or at minimum
* one word if debugging is on to be able to detect writes
* before the word boundary.
*
@@ -1026,11 +1105,15 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
{
unsigned long off = get_info_end(s); /* The end of info */
- if (s->flags & SLAB_STORE_USER)
+ if (s->flags & SLAB_STORE_USER) {
/* We also have user information there */
off += 2 * sizeof(struct track);
- off += kasan_metadata_size(s);
+ if (s->flags & SLAB_KMALLOC)
+ off += sizeof(unsigned int);
+ }
+
+ off += kasan_metadata_size(s, false);
if (size_from_object(s) == off)
return 1;
@@ -1080,6 +1163,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
{
u8 *p = object;
u8 *endobject = object + s->object_size;
+ unsigned int orig_size;
if (s->flags & SLAB_RED_ZONE) {
if (!check_bytes_and_report(s, slab, object, "Left Redzone",
@@ -1089,6 +1173,17 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (!check_bytes_and_report(s, slab, object, "Right Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
+
+ if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
+ orig_size = get_orig_size(s, object);
+
+ if (s->object_size > orig_size &&
+ !check_bytes_and_report(s, slab, object,
+ "kmalloc Redzone", p + orig_size,
+ val, s->object_size - orig_size)) {
+ return 0;
+ }
+ }
} else {
if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
check_bytes_and_report(s, slab, p, "Alignment padding",
@@ -1323,21 +1418,19 @@ static inline int alloc_consistency_checks(struct kmem_cache *s,
return 1;
}
-static noinline int alloc_debug_processing(struct kmem_cache *s,
- struct slab *slab,
- void *object, unsigned long addr)
+static noinline bool alloc_debug_processing(struct kmem_cache *s,
+ struct slab *slab, void *object, int orig_size)
{
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
if (!alloc_consistency_checks(s, slab, object))
goto bad;
}
- /* Success perform special debug activities for allocs */
- if (s->flags & SLAB_STORE_USER)
- set_track(s, object, TRACK_ALLOC, addr);
+ /* Success. Perform special debug activities for allocs */
trace(s, slab, object, 1);
+ set_orig_size(s, object, orig_size);
init_object(s, object, SLUB_RED_ACTIVE);
- return 1;
+ return true;
bad:
if (folio_test_slab(slab_folio(slab))) {
@@ -1350,7 +1443,7 @@ bad:
slab->inuse = slab->objects;
slab->freelist = NULL;
}
- return 0;
+ return false;
}
static inline int free_consistency_checks(struct kmem_cache *s,
@@ -1385,63 +1478,6 @@ static inline int free_consistency_checks(struct kmem_cache *s,
return 1;
}
-/* Supports checking bulk free of a constructed freelist */
-static noinline int free_debug_processing(
- struct kmem_cache *s, struct slab *slab,
- void *head, void *tail, int bulk_cnt,
- unsigned long addr)
-{
- struct kmem_cache_node *n = get_node(s, slab_nid(slab));
- void *object = head;
- int cnt = 0;
- unsigned long flags, flags2;
- int ret = 0;
- depot_stack_handle_t handle = 0;
-
- if (s->flags & SLAB_STORE_USER)
- handle = set_track_prepare();
-
- spin_lock_irqsave(&n->list_lock, flags);
- slab_lock(slab, &flags2);
-
- if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!check_slab(s, slab))
- goto out;
- }
-
-next_object:
- cnt++;
-
- if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!free_consistency_checks(s, slab, object, addr))
- goto out;
- }
-
- if (s->flags & SLAB_STORE_USER)
- set_track_update(s, object, TRACK_FREE, addr, handle);
- trace(s, slab, object, 0);
- /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
- init_object(s, object, SLUB_RED_INACTIVE);
-
- /* Reached end of constructed freelist yet? */
- if (object != tail) {
- object = get_freepointer(s, object);
- goto next_object;
- }
- ret = 1;
-
-out:
- if (cnt != bulk_cnt)
- slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
- bulk_cnt, cnt);
-
- slab_unlock(slab, &flags2);
- spin_unlock_irqrestore(&n->list_lock, flags);
- if (!ret)
- slab_fix(s, "Object at 0x%p not freed", object);
- return ret;
-}
-
/*
* Parse a block of slub_debug options. Blocks are delimited by ';'
*
@@ -1660,17 +1696,19 @@ static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
static inline
void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
-static inline int alloc_debug_processing(struct kmem_cache *s,
- struct slab *slab, void *object, unsigned long addr) { return 0; }
+static inline bool alloc_debug_processing(struct kmem_cache *s,
+ struct slab *slab, void *object, int orig_size) { return true; }
-static inline int free_debug_processing(
- struct kmem_cache *s, struct slab *slab,
- void *head, void *tail, int bulk_cnt,
- unsigned long addr) { return 0; }
+static inline bool free_debug_processing(struct kmem_cache *s,
+ struct slab *slab, void *head, void *tail, int *bulk_cnt,
+ unsigned long addr, depot_stack_handle_t handle) { return true; }
static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
static inline int check_object(struct kmem_cache *s, struct slab *slab,
void *object, u8 val) { return 1; }
+static inline depot_stack_handle_t set_track_prepare(void) { return 0; }
+static inline void set_track(struct kmem_cache *s, void *object,
+ enum track_item alloc, unsigned long addr) {}
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct slab *slab) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
@@ -1693,35 +1731,24 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
+#ifndef CONFIG_SLUB_TINY
static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
void **freelist, void *nextfree)
{
return false;
}
+#endif
#endif /* CONFIG_SLUB_DEBUG */
/*
* Hooks for other subsystems that check memory allocations. In a typical
* production configuration these hooks all should produce no code at all.
*/
-static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
-{
- ptr = kasan_kmalloc_large(ptr, size, flags);
- /* As ptr might get tagged, call kmemleak hook after KASAN. */
- kmemleak_alloc(ptr, size, 1, flags);
- return ptr;
-}
-
-static __always_inline void kfree_hook(void *x)
-{
- kmemleak_free(x);
- kasan_kfree_large(x);
-}
-
static __always_inline bool slab_free_hook(struct kmem_cache *s,
void *x, bool init)
{
kmemleak_free_recursive(x, s->flags);
+ kmsan_slab_free(s, x);
debug_check_no_locks_freed(x, s->object_size);
@@ -1830,6 +1857,8 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node,
slab = folio_slab(folio);
__folio_set_slab(folio);
+ /* Make the flag visible before any changes to folio->mapping */
+ smp_wmb();
if (page_is_pfmemalloc(folio_page(folio, 0)))
slab_set_pfmemalloc(slab);
@@ -1911,7 +1940,7 @@ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
return false;
freelist_count = oo_objects(s->oo);
- pos = get_random_int() % freelist_count;
+ pos = get_random_u32_below(freelist_count);
page_limit = slab->objects * s->size;
start = fixup_red_left(s, slab_address(slab));
@@ -1976,11 +2005,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
*/
slab = alloc_slab_page(alloc_gfp, node, oo);
if (unlikely(!slab))
- goto out;
+ return NULL;
stat(s, ORDER_FALLBACK);
}
slab->objects = oo_objects(oo);
+ slab->inuse = 0;
+ slab->frozen = 0;
account_slab(slab, oo_order(oo), s, flags);
@@ -2007,15 +2038,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
set_freepointer(s, p, NULL);
}
- slab->inuse = slab->objects;
- slab->frozen = 1;
-
-out:
- if (!slab)
- return NULL;
-
- inc_slabs_node(s, slab_nid(slab), slab->objects);
-
return slab;
}
@@ -2036,17 +2058,11 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab)
int order = folio_order(folio);
int pages = 1 << order;
- if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
- void *p;
-
- slab_pad_check(s, slab);
- for_each_object(p, s, slab_address(slab), slab->objects)
- check_object(s, slab, p, SLUB_RED_INACTIVE);
- }
-
__slab_clear_pfmemalloc(slab);
- __folio_clear_slab(folio);
folio->mapping = NULL;
+ /* Make the mapping reset visible before clearing the flag */
+ smp_wmb();
+ __folio_clear_slab(folio);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
unaccount_slab(slab, order, s);
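
The smp_wmb() here pairs with the one added in alloc_slab_page(): allocation
publishes the slab flag before folio->mapping is touched, and the free path
resets mapping before the flag is cleared. A reader that tests the flag and then
follows the slab metadata needs a matching read barrier; a hedged sketch of such
a consumer (assumed, not part of this patch):

    /* Hypothetical reader pairing with the smp_wmb()s above. */
    static struct kmem_cache *folio_slab_cache_or_null(struct folio *folio)
    {
            if (!folio_test_slab(folio))
                    return NULL;
            smp_rmb();      /* order the flag test before the metadata loads */
            return folio_slab(folio)->slab_cache;
    }
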
@@ -2062,9 +2078,17 @@ static void rcu_free_slab(struct rcu_head *h)
static void free_slab(struct kmem_cache *s, struct slab *slab)
{
- if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
+ if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
+ void *p;
+
+ slab_pad_check(s, slab);
+ for_each_object(p, s, slab_address(slab), slab->objects)
+ check_object(s, slab, p, SLUB_RED_INACTIVE);
+ }
+
+ if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU))
call_rcu(&slab->rcu_head, rcu_free_slab);
- } else
+ else
__free_slab(s, slab);
}
@@ -2103,6 +2127,75 @@ static inline void remove_partial(struct kmem_cache_node *n,
}
/*
+ * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a
+ * slab from the n->partial list. Remove only a single object from the slab, do
+ * the alloc_debug_processing() checks and leave the slab on the list, or move
+ * it to the full list if it was the last free object.
+ */
+static void *alloc_single_from_partial(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct slab *slab, int orig_size)
+{
+ void *object;
+
+ lockdep_assert_held(&n->list_lock);
+
+ object = slab->freelist;
+ slab->freelist = get_freepointer(s, object);
+ slab->inuse++;
+
+ if (!alloc_debug_processing(s, slab, object, orig_size)) {
+ remove_partial(n, slab);
+ return NULL;
+ }
+
+ if (slab->inuse == slab->objects) {
+ remove_partial(n, slab);
+ add_full(s, n, slab);
+ }
+
+ return object;
+}
+
+/*
+ * Called only for kmem_cache_debug() caches to allocate from a freshly
+ * allocated slab. Allocate a single object instead of the whole freelist
+ * and put the slab on the partial (or full) list.
+ */
+static void *alloc_single_from_new_slab(struct kmem_cache *s,
+ struct slab *slab, int orig_size)
+{
+ int nid = slab_nid(slab);
+ struct kmem_cache_node *n = get_node(s, nid);
+ unsigned long flags;
+ void *object;
+
+
+ object = slab->freelist;
+ slab->freelist = get_freepointer(s, object);
+ slab->inuse = 1;
+
+ if (!alloc_debug_processing(s, slab, object, orig_size))
+ /*
+ * It's not really expected that this would fail on a
+		 * freshly allocated slab, but in theory a concurrent
+		 * memory corruption could cause it.
+ */
+ return NULL;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ if (slab->inuse == slab->objects)
+ add_full(s, n, slab);
+ else
+ add_partial(n, slab, DEACTIVATE_TO_HEAD);
+
+ inc_slabs_node(s, nid, slab->objects);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ return object;
+}
+
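
Together with free_to_partial_list() further down, these helpers mean debug
caches no longer use per-CPU slabs or frozen freelists at all: every allocation
and free handles one object at a time under n->list_lock, which is what allows
the validation code near the end of this diff to drop slab_lock()/get_map().
The resulting flow, condensed as a sketch:

    /*
     * Debug-cache object lifecycle after this series (sketch):
     *
     *   alloc: get_partial() -> alloc_single_from_partial()   slab stays on n->partial
     *          new_slab()    -> alloc_single_from_new_slab()  fresh slab added to a list
     *   free:  __slab_free() -> free_to_partial_list()        all under n->list_lock
     */
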
+/*
* Remove slab from the partial list, freeze it and
* return the pointer to the freelist.
*
@@ -2159,7 +2252,7 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
* Try to allocate a partial slab from a specific node.
*/
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
- struct slab **ret_slab, gfp_t gfpflags)
+ struct partial_context *pc)
{
struct slab *slab, *slab2;
void *object = NULL;
@@ -2179,15 +2272,23 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
void *t;
- if (!pfmemalloc_match(slab, gfpflags))
+ if (!pfmemalloc_match(slab, pc->flags))
continue;
+ if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
+ object = alloc_single_from_partial(s, n, slab,
+ pc->orig_size);
+ if (object)
+ break;
+ continue;
+ }
+
t = acquire_slab(s, n, slab, object == NULL);
if (!t)
break;
if (!object) {
- *ret_slab = slab;
+ *pc->slab = slab;
stat(s, ALLOC_FROM_PARTIAL);
object = t;
} else {
@@ -2211,14 +2312,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
/*
* Get a slab from somewhere. Search in increasing NUMA distances.
*/
-static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
- struct slab **ret_slab)
+static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc)
{
#ifdef CONFIG_NUMA
struct zonelist *zonelist;
struct zoneref *z;
struct zone *zone;
- enum zone_type highest_zoneidx = gfp_zone(flags);
+ enum zone_type highest_zoneidx = gfp_zone(pc->flags);
void *object;
unsigned int cpuset_mems_cookie;
@@ -2246,15 +2346,15 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
do {
cpuset_mems_cookie = read_mems_allowed_begin();
- zonelist = node_zonelist(mempolicy_slab_node(), flags);
+ zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
struct kmem_cache_node *n;
n = get_node(s, zone_to_nid(zone));
- if (n && cpuset_zone_allowed(zone, flags) &&
+ if (n && cpuset_zone_allowed(zone, pc->flags) &&
n->nr_partial > s->min_partial) {
- object = get_partial_node(s, n, ret_slab, flags);
+ object = get_partial_node(s, n, pc);
if (object) {
/*
* Don't check read_mems_allowed_retry()
@@ -2275,8 +2375,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
/*
* Get a partial slab, lock it and return it.
*/
-static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
- struct slab **ret_slab)
+static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc)
{
void *object;
int searchnode = node;
@@ -2284,13 +2383,15 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
if (node == NUMA_NO_NODE)
searchnode = numa_mem_id();
- object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags);
+ object = get_partial_node(s, get_node(s, searchnode), pc);
if (object || node != NUMA_NO_NODE)
return object;
- return get_any_partial(s, flags, ret_slab);
+ return get_any_partial(s, pc);
}
+#ifndef CONFIG_SLUB_TINY
+
#ifdef CONFIG_PREEMPTION
/*
* Calculate the next globally unique transaction for disambiguation
@@ -2304,7 +2405,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
* different cpus.
*/
#define TID_STEP 1
-#endif
+#endif /* CONFIG_PREEMPTION */
static inline unsigned long next_tid(unsigned long tid)
{
@@ -2373,7 +2474,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s)
static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
void *freelist)
{
- enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE, M_FULL_NOLIST };
+ enum slab_modes { M_NONE, M_PARTIAL, M_FREE, M_FULL_NOLIST };
struct kmem_cache_node *n = get_node(s, slab_nid(slab));
int free_delta = 0;
enum slab_modes mode = M_NONE;
@@ -2449,14 +2550,6 @@ redo:
* acquire_slab() will see a slab that is frozen
*/
spin_lock_irqsave(&n->list_lock, flags);
- } else if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) {
- mode = M_FULL;
- /*
- * This also ensures that the scanning of full
- * slabs from diagnostic functions will not see
- * any frozen slabs.
- */
- spin_lock_irqsave(&n->list_lock, flags);
} else {
mode = M_FULL_NOLIST;
}
@@ -2466,7 +2559,7 @@ redo:
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab")) {
- if (mode == M_PARTIAL || mode == M_FULL)
+ if (mode == M_PARTIAL)
spin_unlock_irqrestore(&n->list_lock, flags);
goto redo;
}
@@ -2480,10 +2573,6 @@ redo:
stat(s, DEACTIVATE_EMPTY);
discard_slab(s, slab);
stat(s, FREE_SLAB);
- } else if (mode == M_FULL) {
- add_full(s, n, slab);
- spin_unlock_irqrestore(&n->list_lock, flags);
- stat(s, DEACTIVATE_FULL);
} else if (mode == M_FULL_NOLIST) {
stat(s, DEACTIVATE_FULL);
}
@@ -2730,7 +2819,7 @@ static void flush_all_cpus_locked(struct kmem_cache *s)
INIT_WORK(&sfw->work, flush_cpu_slab);
sfw->skip = false;
sfw->s = s;
- schedule_work_on(cpu, &sfw->work);
+ queue_work_on(cpu, flushwq, &sfw->work);
}
for_each_online_cpu(cpu) {
@@ -2765,6 +2854,13 @@ static int slub_cpu_dead(unsigned int cpu)
return 0;
}
+#else /* CONFIG_SLUB_TINY */
+static inline void flush_all_cpus_locked(struct kmem_cache *s) { }
+static inline void flush_all(struct kmem_cache *s) { }
+static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { }
+static inline int slub_cpu_dead(unsigned int cpu) { return 0; }
+#endif /* CONFIG_SLUB_TINY */
+
/*
* Check if the objects in a per cpu structure fit numa
* locality expectations.
@@ -2788,9 +2884,67 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
{
return atomic_long_read(&n->total_objects);
}
+
+/* Supports checking bulk free of a constructed freelist */
+static inline bool free_debug_processing(struct kmem_cache *s,
+ struct slab *slab, void *head, void *tail, int *bulk_cnt,
+ unsigned long addr, depot_stack_handle_t handle)
+{
+ bool checks_ok = false;
+ void *object = head;
+ int cnt = 0;
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!check_slab(s, slab))
+ goto out;
+ }
+
+ if (slab->inuse < *bulk_cnt) {
+ slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
+ slab->inuse, *bulk_cnt);
+ goto out;
+ }
+
+next_object:
+
+ if (++cnt > *bulk_cnt)
+ goto out_cnt;
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!free_consistency_checks(s, slab, object, addr))
+ goto out;
+ }
+
+ if (s->flags & SLAB_STORE_USER)
+ set_track_update(s, object, TRACK_FREE, addr, handle);
+ trace(s, slab, object, 0);
+ /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
+ init_object(s, object, SLUB_RED_INACTIVE);
+
+ /* Reached end of constructed freelist yet? */
+ if (object != tail) {
+ object = get_freepointer(s, object);
+ goto next_object;
+ }
+ checks_ok = true;
+
+out_cnt:
+ if (cnt != *bulk_cnt) {
+ slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
+ *bulk_cnt, cnt);
+ *bulk_cnt = cnt;
+ }
+
+out:
+
+ if (!checks_ok)
+ slab_fix(s, "Object at 0x%p not freed", object);
+
+ return checks_ok;
+}
#endif /* CONFIG_SLUB_DEBUG */
-#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
+#if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS)
static unsigned long count_partial(struct kmem_cache_node *n,
int (*get_count)(struct slab *))
{
@@ -2804,12 +2958,12 @@ static unsigned long count_partial(struct kmem_cache_node *n,
spin_unlock_irqrestore(&n->list_lock, flags);
return x;
}
-#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
+#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */
+#ifdef CONFIG_SLUB_DEBUG
static noinline void
slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
{
-#ifdef CONFIG_SLUB_DEBUG
static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
int node;
@@ -2840,8 +2994,11 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n",
node, nr_slabs, nr_objs, nr_free);
}
-#endif
}
+#else /* CONFIG_SLUB_DEBUG */
+static inline void
+slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { }
+#endif
static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
{
@@ -2851,6 +3008,7 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
return true;
}
+#ifndef CONFIG_SLUB_TINY
/*
* Check the slab->freelist and either transfer the freelist to the
* per cpu freelist or deactivate the slab.
@@ -2905,11 +3063,12 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
* already disabled (which is the case for bulk allocation).
*/
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
- unsigned long addr, struct kmem_cache_cpu *c)
+ unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
void *freelist;
struct slab *slab;
unsigned long flags;
+ struct partial_context pc;
stat(s, ALLOC_SLOWPATH);
@@ -3023,7 +3182,10 @@ new_slab:
new_objects:
- freelist = get_partial(s, gfpflags, node, &slab);
+ pc.flags = gfpflags;
+ pc.slab = &slab;
+ pc.orig_size = orig_size;
+ freelist = get_partial(s, node, &pc);
if (freelist)
goto check_new_slab;
@@ -3036,36 +3198,53 @@ new_objects:
return NULL;
}
+ stat(s, ALLOC_SLAB);
+
+ if (kmem_cache_debug(s)) {
+ freelist = alloc_single_from_new_slab(s, slab, orig_size);
+
+ if (unlikely(!freelist))
+ goto new_objects;
+
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, freelist, TRACK_ALLOC, addr);
+
+ return freelist;
+ }
+
/*
* No other reference to the slab yet so we can
* muck around with it freely without cmpxchg
*/
freelist = slab->freelist;
slab->freelist = NULL;
+ slab->inuse = slab->objects;
+ slab->frozen = 1;
- stat(s, ALLOC_SLAB);
+ inc_slabs_node(s, slab_nid(slab), slab->objects);
check_new_slab:
if (kmem_cache_debug(s)) {
- if (!alloc_debug_processing(s, slab, freelist, addr)) {
- /* Slab failed checks. Next slab needed */
- goto new_slab;
- } else {
- /*
- * For debug case, we don't load freelist so that all
- * allocations go through alloc_debug_processing()
- */
- goto return_single;
- }
+ /*
+		 * For debug caches we went through
+		 * alloc_single_from_partial() here, so just store the
+		 * tracking info and return the object.
+ */
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, freelist, TRACK_ALLOC, addr);
+
+ return freelist;
}
- if (unlikely(!pfmemalloc_match(slab, gfpflags)))
+ if (unlikely(!pfmemalloc_match(slab, gfpflags))) {
/*
* For !pfmemalloc_match() case we don't load freelist so that
* we don't make further mismatched allocations easier.
*/
- goto return_single;
+ deactivate_slab(s, slab, get_freepointer(s, freelist));
+ return freelist;
+ }
retry_load_slab:
@@ -3089,11 +3268,6 @@ retry_load_slab:
c->slab = slab;
goto load_freelist;
-
-return_single:
-
- deactivate_slab(s, slab, get_freepointer(s, freelist));
- return freelist;
}
/*
@@ -3102,7 +3276,7 @@ return_single:
* pointer.
*/
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
- unsigned long addr, struct kmem_cache_cpu *c)
+ unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
void *p;
@@ -3115,52 +3289,20 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
c = slub_get_cpu_ptr(s->cpu_slab);
#endif
- p = ___slab_alloc(s, gfpflags, node, addr, c);
+ p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
#ifdef CONFIG_PREEMPT_COUNT
slub_put_cpu_ptr(s->cpu_slab);
#endif
return p;
}
-/*
- * If the object has been wiped upon free, make sure it's fully initialized by
- * zeroing out freelist pointer.
- */
-static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
- void *obj)
-{
- if (unlikely(slab_want_init_on_free(s)) && obj)
- memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
- 0, sizeof(void *));
-}
-
-/*
- * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
- * have the fastpath folded into their functions. So no function call
- * overhead for requests that can be satisfied on the fastpath.
- *
- * The fastpath works by first checking if the lockless freelist can be used.
- * If not then __slab_alloc is called for slow processing.
- *
- * Otherwise we can simply pick the next object from the lockless free list.
- */
-static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
+static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
- void *object;
struct kmem_cache_cpu *c;
struct slab *slab;
unsigned long tid;
- struct obj_cgroup *objcg = NULL;
- bool init = false;
-
- s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags);
- if (!s)
- return NULL;
-
- object = kfence_alloc(s, orig_size, gfpflags);
- if (unlikely(object))
- goto out;
+ void *object;
redo:
/*
@@ -3197,16 +3339,10 @@ redo:
object = c->freelist;
slab = c->slab;
- /*
- * We cannot use the lockless fastpath on PREEMPT_RT because if a
- * slowpath has taken the local_lock_irqsave(), it is not protected
- * against a fast path operation in an irq handler. So we need to take
- * the slow path which uses local_lock. It is still relatively fast if
- * there is a suitable cpu freelist.
- */
- if (IS_ENABLED(CONFIG_PREEMPT_RT) ||
+
+ if (!USE_LOCKLESS_FAST_PATH() ||
unlikely(!object || !slab || !node_match(slab, node))) {
- object = __slab_alloc(s, gfpflags, node, addr, c);
+ object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
} else {
void *next_object = get_freepointer_safe(s, object);
@@ -3236,29 +3372,101 @@ redo:
stat(s, ALLOC_FASTPATH);
}
+ return object;
+}
+#else /* CONFIG_SLUB_TINY */
+static void *__slab_alloc_node(struct kmem_cache *s,
+ gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
+{
+ struct partial_context pc;
+ struct slab *slab;
+ void *object;
+
+ pc.flags = gfpflags;
+ pc.slab = &slab;
+ pc.orig_size = orig_size;
+ object = get_partial(s, node, &pc);
+
+ if (object)
+ return object;
+
+ slab = new_slab(s, gfpflags, node);
+ if (unlikely(!slab)) {
+ slab_out_of_memory(s, gfpflags, node);
+ return NULL;
+ }
+
+ object = alloc_single_from_new_slab(s, slab, orig_size);
+
+ return object;
+}
+#endif /* CONFIG_SLUB_TINY */
+
+/*
+ * If the object has been wiped upon free, make sure it's fully initialized by
+ * zeroing out freelist pointer.
+ */
+static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
+ void *obj)
+{
+ if (unlikely(slab_want_init_on_free(s)) && obj)
+ memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
+ 0, sizeof(void *));
+}
+
+/*
+ * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
+ * have the fastpath folded into their functions. So no function call
+ * overhead for requests that can be satisfied on the fastpath.
+ *
+ * The fastpath works by first checking if the lockless freelist can be used.
+ * If not then __slab_alloc is called for slow processing.
+ *
+ * Otherwise we can simply pick the next object from the lockless free list.
+ */
+static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
+ gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
+{
+ void *object;
+ struct obj_cgroup *objcg = NULL;
+ bool init = false;
+
+ s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags);
+ if (!s)
+ return NULL;
+
+ object = kfence_alloc(s, orig_size, gfpflags);
+ if (unlikely(object))
+ goto out;
+
+ object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
+
maybe_wipe_obj_freeptr(s, object);
init = slab_want_init_on_alloc(gfpflags, s);
out:
- slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init);
+ /*
+	 * When init equals 'true', as for the kzalloc() family, only
+	 * @orig_size bytes might be zeroed instead of s->object_size.
+ */
+ slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size);
return object;
}
-static __always_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru,
+static __fastpath_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru,
gfp_t gfpflags, unsigned long addr, size_t orig_size)
{
return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size);
}
-static __always_inline
+static __fastpath_inline
void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
gfp_t gfpflags)
{
void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size);
- trace_kmem_cache_alloc(_RET_IP_, ret, s, s->object_size,
- s->size, gfpflags);
+ trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);
return ret;
}
@@ -3276,45 +3484,84 @@ void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
}
EXPORT_SYMBOL(kmem_cache_alloc_lru);
-#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
+void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
+ int node, size_t orig_size,
+ unsigned long caller)
{
- void *ret = slab_alloc(s, NULL, gfpflags, _RET_IP_, size);
- trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags);
- ret = kasan_kmalloc(s, ret, size, gfpflags);
- return ret;
+ return slab_alloc_node(s, NULL, gfpflags, node,
+ caller, orig_size);
}
-EXPORT_SYMBOL(kmem_cache_alloc_trace);
-#endif
-#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
- trace_kmem_cache_alloc_node(_RET_IP_, ret, s,
- s->object_size, s->size, gfpflags, node);
+ trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
- gfp_t gfpflags,
- int node, size_t size)
+static noinline void free_to_partial_list(
+ struct kmem_cache *s, struct slab *slab,
+ void *head, void *tail, int bulk_cnt,
+ unsigned long addr)
{
- void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
+ struct kmem_cache_node *n = get_node(s, slab_nid(slab));
+ struct slab *slab_free = NULL;
+ int cnt = bulk_cnt;
+ unsigned long flags;
+ depot_stack_handle_t handle = 0;
- trace_kmalloc_node(_RET_IP_, ret, s,
- size, s->size, gfpflags, node);
+ if (s->flags & SLAB_STORE_USER)
+ handle = set_track_prepare();
- ret = kasan_kmalloc(s, ret, size, gfpflags);
- return ret;
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) {
+ void *prior = slab->freelist;
+
+ /* Perform the actual freeing while we still hold the locks */
+ slab->inuse -= cnt;
+ set_freepointer(s, tail, prior);
+ slab->freelist = head;
+
+ /*
+		 * If the slab is empty and the node's partial list is full,
+		 * it should be discarded no matter whether it is on the
+		 * full or the partial list.
+ */
+ if (slab->inuse == 0 && n->nr_partial >= s->min_partial)
+ slab_free = slab;
+
+ if (!prior) {
+ /* was on full list */
+ remove_full(s, n, slab);
+ if (!slab_free) {
+ add_partial(n, slab, DEACTIVATE_TO_TAIL);
+ stat(s, FREE_ADD_PARTIAL);
+ }
+ } else if (slab_free) {
+ remove_partial(n, slab);
+ stat(s, FREE_REMOVE_PARTIAL);
+ }
+ }
+
+ if (slab_free) {
+ /*
+ * Update the counters while still holding n->list_lock to
+ * prevent spurious validation warnings
+ */
+ dec_slabs_node(s, slab_nid(slab_free), slab_free->objects);
+ }
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ if (slab_free) {
+ stat(s, FREE_SLAB);
+ free_slab(s, slab_free);
+ }
}
-EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
-#endif
-#endif /* CONFIG_NUMA */
/*
* Slow path handling. This may still be called frequently since objects
@@ -3341,9 +3588,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
if (kfence_free(head))
return;
- if (kmem_cache_debug(s) &&
- !free_debug_processing(s, slab, head, tail, cnt, addr))
+ if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
+ free_to_partial_list(s, slab, head, tail, cnt, addr);
return;
+ }
do {
if (unlikely(n)) {
@@ -3441,6 +3689,7 @@ slab_empty:
discard_slab(s, slab);
}
+#ifndef CONFIG_SLUB_TINY
/*
* Fastpath with forced inlining to produce a kfree and kmem_cache_free that
* can perform fastpath freeing without additional function calls.
@@ -3463,6 +3712,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
void *tail_obj = tail ? : head;
struct kmem_cache_cpu *c;
unsigned long tid;
+ void **freelist;
redo:
/*
@@ -3477,9 +3727,13 @@ redo:
/* Same with comment on barrier() in slab_alloc_node() */
barrier();
- if (likely(slab == c->slab)) {
-#ifndef CONFIG_PREEMPT_RT
- void **freelist = READ_ONCE(c->freelist);
+ if (unlikely(slab != c->slab)) {
+ __slab_free(s, slab, head, tail_obj, cnt, addr);
+ return;
+ }
+
+ if (USE_LOCKLESS_FAST_PATH()) {
+ freelist = READ_ONCE(c->freelist);
set_freepointer(s, tail_obj, freelist);
@@ -3491,16 +3745,8 @@ redo:
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
-#else /* CONFIG_PREEMPT_RT */
- /*
- * We cannot use the lockless fastpath on PREEMPT_RT because if
- * a slowpath has taken the local_lock_irqsave(), it is not
- * protected against a fast path operation in an irq handler. So
- * we need to take the local_lock. We shouldn't simply defer to
- * __slab_free() as that wouldn't use the cpu freelist at all.
- */
- void **freelist;
-
+ } else {
+ /* Update the free list under the local lock */
local_lock(&s->cpu_slab->lock);
c = this_cpu_ptr(s->cpu_slab);
if (unlikely(slab != c->slab)) {
@@ -3515,14 +3761,21 @@ redo:
c->tid = next_tid(tid);
local_unlock(&s->cpu_slab->lock);
-#endif
- stat(s, FREE_FASTPATH);
- } else
- __slab_free(s, slab, head, tail_obj, cnt, addr);
+ }
+ stat(s, FREE_FASTPATH);
+}
+#else /* CONFIG_SLUB_TINY */
+static void do_slab_free(struct kmem_cache *s,
+ struct slab *slab, void *head, void *tail,
+ int cnt, unsigned long addr)
+{
+ void *tail_obj = tail ? : head;
+ __slab_free(s, slab, head, tail_obj, cnt, addr);
}
+#endif /* CONFIG_SLUB_TINY */
-static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab,
+static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab,
void *head, void *tail, void **p, int cnt,
unsigned long addr)
{
@@ -3542,12 +3795,17 @@ void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
}
#endif
+void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller)
+{
+ slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller);
+}
+
void kmem_cache_free(struct kmem_cache *s, void *x)
{
s = cache_from_obj(s, x);
if (!s)
return;
- trace_kmem_cache_free(_RET_IP_, x, s->name);
+ trace_kmem_cache_free(_RET_IP_, x, s);
slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_free);
@@ -3560,19 +3818,6 @@ struct detached_freelist {
struct kmem_cache *s;
};
-static inline void free_large_kmalloc(struct folio *folio, void *object)
-{
- unsigned int order = folio_order(folio);
-
- if (WARN_ON_ONCE(order == 0))
- pr_warn_once("object pointer: 0x%p\n", object);
-
- kfree_hook(object);
- mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
- -(PAGE_SIZE << order));
- __free_pages(folio_page(folio, 0), order);
-}
-
/*
* This function progressively scans the array with free objects (with
* a limited look ahead) and extract objects belonging to the same
@@ -3663,18 +3908,13 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
-/* Note that interrupts must be enabled when calling this function. */
-int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- void **p)
+#ifndef CONFIG_SLUB_TINY
+static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p, struct obj_cgroup *objcg)
{
struct kmem_cache_cpu *c;
int i;
- struct obj_cgroup *objcg = NULL;
- /* memcg and kmem_cache debug support */
- s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
- if (unlikely(!s))
- return false;
/*
* Drain objects in the per cpu slab, while disabling local
* IRQs, which protects against PREEMPT and interrupts
@@ -3709,7 +3949,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
* of re-populating per CPU c->freelist
*/
p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
- _RET_IP_, c);
+ _RET_IP_, c, s->object_size);
if (unlikely(!p[i]))
goto error;
@@ -3728,19 +3968,72 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
local_unlock_irq(&s->cpu_slab->lock);
slub_put_cpu_ptr(s->cpu_slab);
- /*
- * memcg and kmem_cache debug support and memory initialization.
- * Done outside of the IRQ disabled fastpath loop.
- */
- slab_post_alloc_hook(s, objcg, flags, size, p,
- slab_want_init_on_alloc(flags, s));
return i;
+
error:
slub_put_cpu_ptr(s->cpu_slab);
- slab_post_alloc_hook(s, objcg, flags, i, p, false);
+ slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size);
+ kmem_cache_free_bulk(s, i, p);
+ return 0;
+
+}
+#else /* CONFIG_SLUB_TINY */
+static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p, struct obj_cgroup *objcg)
+{
+ int i;
+
+ for (i = 0; i < size; i++) {
+ void *object = kfence_alloc(s, s->object_size, flags);
+
+ if (unlikely(object)) {
+ p[i] = object;
+ continue;
+ }
+
+ p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE,
+ _RET_IP_, s->object_size);
+ if (unlikely(!p[i]))
+ goto error;
+
+ maybe_wipe_obj_freeptr(s, p[i]);
+ }
+
+ return i;
+
+error:
+ slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size);
kmem_cache_free_bulk(s, i, p);
return 0;
}
+#endif /* CONFIG_SLUB_TINY */
+
+/* Note that interrupts must be enabled when calling this function. */
+int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
+{
+ int i;
+ struct obj_cgroup *objcg = NULL;
+
+ if (!size)
+ return 0;
+
+ /* memcg and kmem_cache debug support */
+ s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
+ if (unlikely(!s))
+ return 0;
+
+ i = __kmem_cache_alloc_bulk(s, flags, size, p, objcg);
+
+ /*
+ * memcg and kmem_cache debug support and memory initialization.
+ * Done outside of the IRQ disabled fastpath loop.
+ */
+ if (i != 0)
+ slab_post_alloc_hook(s, objcg, flags, size, p,
+ slab_want_init_on_alloc(flags, s), s->object_size);
+ return i;
+}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
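
kmem_cache_alloc_bulk() is now a thin wrapper that bails out early for size == 0
and runs the post-alloc hooks only when something was allocated; it still returns
either the full count or 0, never a partial fill (the error paths free whatever
was allocated). A minimal caller sketch; the cache and array size are hypothetical:

    static int demo_bulk_alloc(struct kmem_cache *my_cache)
    {
            void *objs[16];
            int got = kmem_cache_alloc_bulk(my_cache, GFP_KERNEL, ARRAY_SIZE(objs), objs);

            if (!got)               /* 0 on failure, ARRAY_SIZE(objs) on success */
                    return -ENOMEM;

            /* ... use the objects ... */
            kmem_cache_free_bulk(my_cache, got, objs);
            return 0;
    }
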
@@ -3764,7 +4057,8 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk);
* take the list_lock.
*/
static unsigned int slub_min_order;
-static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
+static unsigned int slub_max_order =
+ IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER;
static unsigned int slub_min_objects;
/*
@@ -3895,10 +4189,12 @@ init_kmem_cache_node(struct kmem_cache_node *n)
#endif
}
+#ifndef CONFIG_SLUB_TINY
static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
- KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
+ NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH *
+ sizeof(struct kmem_cache_cpu));
/*
* Must align to double word boundary for the double cmpxchg
@@ -3914,6 +4210,12 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
return 1;
}
+#else
+static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
+{
+ return 1;
+}
+#endif /* CONFIG_SLUB_TINY */
static struct kmem_cache *kmem_cache_node;
@@ -3936,6 +4238,7 @@ static void early_kmem_cache_node_alloc(int node)
slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
BUG_ON(!slab);
+ inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects);
if (slab_nid(slab) != node) {
pr_err("SLUB: Unable to allocate memory from node %d\n", node);
pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
@@ -3950,7 +4253,6 @@ static void early_kmem_cache_node_alloc(int node)
n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
slab->freelist = get_freepointer(kmem_cache_node, n);
slab->inuse = 1;
- slab->frozen = 0;
kmem_cache_node->node[node] = n;
init_kmem_cache_node(n);
inc_slabs_node(kmem_cache_node, node, slab->objects);
@@ -3976,7 +4278,9 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
void __kmem_cache_release(struct kmem_cache *s)
{
cache_random_seq_destroy(s);
+#ifndef CONFIG_SLUB_TINY
free_percpu(s->cpu_slab);
+#endif
free_kmem_cache_nodes(s);
}
@@ -4083,7 +4387,8 @@ static int calculate_sizes(struct kmem_cache *s)
*/
s->inuse = size;
- if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
+ if (slub_debug_orig_size(s) ||
+ (flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
s->ctor) {
/*
@@ -4112,12 +4417,17 @@ static int calculate_sizes(struct kmem_cache *s)
}
#ifdef CONFIG_SLUB_DEBUG
- if (flags & SLAB_STORE_USER)
+ if (flags & SLAB_STORE_USER) {
/*
* Need to store information about allocs and frees after
* the object.
*/
size += 2 * sizeof(struct track);
+
+ /* Save the original kmalloc request size */
+ if (flags & SLAB_KMALLOC)
+ size += sizeof(unsigned int);
+ }
#endif
kasan_cache_create(s, &size, &s->flags);
@@ -4237,23 +4547,21 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
{
#ifdef CONFIG_SLUB_DEBUG
void *addr = slab_address(slab);
- unsigned long flags;
- unsigned long *map;
void *p;
slab_err(s, slab, text, s->name);
- slab_lock(slab, &flags);
- map = get_map(s, slab);
+ spin_lock(&object_map_lock);
+ __fill_map(object_map, s, slab);
+
for_each_object(p, s, addr, slab->objects) {
- if (!test_bit(__obj_to_index(s, addr, p), map)) {
+ if (!test_bit(__obj_to_index(s, addr, p), object_map)) {
pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
print_tracking(s, p);
}
}
- put_map(map);
- slab_unlock(slab, &flags);
+ spin_unlock(&object_map_lock);
#endif
}
@@ -4404,78 +4712,6 @@ static int __init setup_slub_min_objects(char *str)
__setup("slub_min_objects=", setup_slub_min_objects);
-void *__kmalloc(size_t size, gfp_t flags)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return kmalloc_large(size, flags);
-
- s = kmalloc_slab(size, flags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc(s, NULL, flags, _RET_IP_, size);
-
- trace_kmalloc(_RET_IP_, ret, s, size, s->size, flags);
-
- ret = kasan_kmalloc(s, ret, size, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc);
-
-#ifdef CONFIG_NUMA
-static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
-{
- struct page *page;
- void *ptr = NULL;
- unsigned int order = get_order(size);
-
- flags |= __GFP_COMP;
- page = alloc_pages_node(node, flags, order);
- if (page) {
- ptr = page_address(page);
- mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
- PAGE_SIZE << order);
- }
-
- return kmalloc_large_node_hook(ptr, size, flags);
-}
-
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
- ret = kmalloc_large_node(size, flags, node);
-
- trace_kmalloc_node(_RET_IP_, ret, NULL,
- size, PAGE_SIZE << get_order(size),
- flags, node);
-
- return ret;
- }
-
- s = kmalloc_slab(size, flags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc_node(s, NULL, flags, node, _RET_IP_, size);
-
- trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, flags, node);
-
- ret = kasan_kmalloc(s, ret, size, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc_node);
-#endif /* CONFIG_NUMA */
-
#ifdef CONFIG_HARDENED_USERCOPY
/*
* Rejects incorrectly sized objects and objects that are to be copied
@@ -4526,43 +4762,6 @@ void __check_heap_object(const void *ptr, unsigned long n,
}
#endif /* CONFIG_HARDENED_USERCOPY */
-size_t __ksize(const void *object)
-{
- struct folio *folio;
-
- if (unlikely(object == ZERO_SIZE_PTR))
- return 0;
-
- folio = virt_to_folio(object);
-
- if (unlikely(!folio_test_slab(folio)))
- return folio_size(folio);
-
- return slab_ksize(folio_slab(folio)->slab_cache);
-}
-EXPORT_SYMBOL(__ksize);
-
-void kfree(const void *x)
-{
- struct folio *folio;
- struct slab *slab;
- void *object = (void *)x;
-
- trace_kfree(_RET_IP_, x);
-
- if (unlikely(ZERO_OR_NULL_PTR(x)))
- return;
-
- folio = virt_to_folio(x);
- if (unlikely(!folio_test_slab(folio))) {
- free_large_kmalloc(folio, object);
- return;
- }
- slab = folio_slab(folio);
- slab_free(slab->slab_cache, slab, object, NULL, &object, 1, _RET_IP_);
-}
-EXPORT_SYMBOL(kfree);
-
#define SHRINK_PROMOTE_MAX 32
/*
@@ -4611,6 +4810,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)
if (free == slab->objects) {
list_move(&slab->slab_list, &discard);
n->nr_partial--;
+ dec_slabs_node(s, node, slab->objects);
} else if (free <= SHRINK_PROMOTE_MAX)
list_move(&slab->slab_list, promote + free - 1);
}
@@ -4626,7 +4826,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)
/* Release empty slabs */
list_for_each_entry_safe(slab, t, &discard, slab_list)
- discard_slab(s, slab);
+ free_slab(s, slab);
if (slabs_node(s, node))
ret = 1;
@@ -4757,11 +4957,6 @@ static int slab_memory_callback(struct notifier_block *self,
return ret;
}
-static struct notifier_block slab_memory_callback_nb = {
- .notifier_call = slab_memory_callback,
- .priority = SLAB_CALLBACK_PRI,
-};
-
/********************************************************************
* Basic setup of slabs
*******************************************************************/
@@ -4827,7 +5022,7 @@ void __init kmem_cache_init(void)
create_boot_cache(kmem_cache_node, "kmem_cache_node",
sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
- register_hotmemory_notifier(&slab_memory_callback_nb);
+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
/* Able to allocate the per node structures */
slab_state = PARTIAL;
@@ -4858,6 +5053,10 @@ void __init kmem_cache_init(void)
void __init kmem_cache_init_late(void)
{
+#ifndef CONFIG_SLUB_TINY
+ flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!flushwq);
+#endif
}
struct kmem_cache *
@@ -4908,61 +5107,7 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
return 0;
}
-void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return kmalloc_large(size, gfpflags);
-
- s = kmalloc_slab(size, gfpflags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc(s, NULL, gfpflags, caller, size);
-
- /* Honor the call site pointer we received. */
- trace_kmalloc(caller, ret, s, size, s->size, gfpflags);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc_track_caller);
-
-#ifdef CONFIG_NUMA
-void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
- int node, unsigned long caller)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
- ret = kmalloc_large_node(size, gfpflags, node);
-
- trace_kmalloc_node(caller, ret, NULL,
- size, PAGE_SIZE << get_order(size),
- gfpflags, node);
-
- return ret;
- }
-
- s = kmalloc_slab(size, gfpflags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc_node(s, NULL, gfpflags, node, caller, size);
-
- /* Honor the call site pointer we received. */
- trace_kmalloc_node(caller, ret, s, size, s->size, gfpflags, node);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#endif
-
-#ifdef CONFIG_SYSFS
+#ifdef SLAB_SUPPORTS_SYSFS
static int count_inuse(struct slab *slab)
{
return slab->inuse;
@@ -4980,12 +5125,9 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab,
{
void *p;
void *addr = slab_address(slab);
- unsigned long flags;
-
- slab_lock(slab, &flags);
if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
- goto unlock;
+ return;
/* Now we know that a valid freelist exists */
__fill_map(obj_map, s, slab);
@@ -4996,8 +5138,6 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab,
if (!check_object(s, slab, p, val))
break;
}
-unlock:
- slab_unlock(slab, &flags);
}
static int validate_slab_node(struct kmem_cache *s,
@@ -5068,6 +5208,7 @@ struct location {
depot_stack_handle_t handle;
unsigned long count;
unsigned long addr;
+ unsigned long waste;
long long sum_time;
long min_time;
long max_time;
@@ -5114,13 +5255,15 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
}
static int add_location(struct loc_track *t, struct kmem_cache *s,
- const struct track *track)
+ const struct track *track,
+ unsigned int orig_size)
{
long start, end, pos;
struct location *l;
- unsigned long caddr, chandle;
+ unsigned long caddr, chandle, cwaste;
unsigned long age = jiffies - track->when;
depot_stack_handle_t handle = 0;
+ unsigned int waste = s->object_size - orig_size;
#ifdef CONFIG_STACKDEPOT
handle = READ_ONCE(track->handle);
@@ -5138,11 +5281,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
if (pos == end)
break;
- caddr = t->loc[pos].addr;
- chandle = t->loc[pos].handle;
- if ((track->addr == caddr) && (handle == chandle)) {
+ l = &t->loc[pos];
+ caddr = l->addr;
+ chandle = l->handle;
+ cwaste = l->waste;
+ if ((track->addr == caddr) && (handle == chandle) &&
+ (waste == cwaste)) {
- l = &t->loc[pos];
l->count++;
if (track->when) {
l->sum_time += age;
@@ -5167,6 +5312,9 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
end = pos;
else if (track->addr == caddr && handle < chandle)
end = pos;
+ else if (track->addr == caddr && handle == chandle &&
+ waste < cwaste)
+ end = pos;
else
start = pos;
}
@@ -5190,6 +5338,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
l->min_pid = track->pid;
l->max_pid = track->pid;
l->handle = handle;
+ l->waste = waste;
cpumask_clear(to_cpumask(l->cpus));
cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
nodes_clear(l->nodes);
@@ -5202,18 +5351,21 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
unsigned long *obj_map)
{
void *addr = slab_address(slab);
+ bool is_alloc = (alloc == TRACK_ALLOC);
void *p;
__fill_map(obj_map, s, slab);
for_each_object(p, s, addr, slab->objects)
if (!test_bit(__obj_to_index(s, addr, p), obj_map))
- add_location(t, s, get_track(s, p, alloc));
+ add_location(t, s, get_track(s, p, alloc),
+ is_alloc ? get_orig_size(s, p) :
+ s->object_size);
}
#endif /* CONFIG_DEBUG_FS */
#endif /* CONFIG_SLUB_DEBUG */
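
The hunks above thread the original kmalloc request size (orig_size) into the allocation tracking so each call site can report how many bytes its objects waste (object_size - orig_size). A rough userspace sketch of that accounting, with made-up struct and function names:

#include <stdio.h>

struct site {
    const char *name;
    unsigned long count;
    unsigned int waste;     /* per-object waste, bytes */
};

static void account(struct site *s, unsigned int object_size,
                    unsigned int orig_size)
{
    s->count++;
    s->waste = object_size - orig_size;
}

int main(void)
{
    struct site s = { .name = "example_alloc" };

    /* three kmalloc(100)-style requests served from a 128-byte cache */
    for (int i = 0; i < 3; i++)
        account(&s, 128, 100);

    /* mirrors the " waste=<count*waste>/<waste>" debugfs output */
    printf("%s waste=%lu/%u\n", s.name, s.count * s.waste, s.waste);
    return 0;
}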
-#ifdef CONFIG_SYSFS
+#ifdef SLAB_SUPPORTS_SYSFS
enum slab_stat_type {
SL_ALL, /* All slabs */
SL_PARTIAL, /* Only partially allocated slabs */
@@ -5533,11 +5685,13 @@ static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
SLAB_ATTR_RO(cache_dma);
#endif
+#ifdef CONFIG_HARDENED_USERCOPY
static ssize_t usersize_show(struct kmem_cache *s, char *buf)
{
return sysfs_emit(buf, "%u\n", s->usersize);
}
SLAB_ATTR_RO(usersize);
+#endif
static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
{
@@ -5601,7 +5755,7 @@ static ssize_t validate_store(struct kmem_cache *s,
{
int ret = -EINVAL;
- if (buf[0] == '1') {
+ if (buf[0] == '1' && kmem_cache_debug(s)) {
ret = validate_slab_cache(s);
if (ret >= 0)
ret = length;
@@ -5617,7 +5771,21 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf)
{
return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
}
-SLAB_ATTR_RO(failslab);
+
+static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
+ size_t length)
+{
+ if (s->refcount > 1)
+ return -EINVAL;
+
+ if (buf[0] == '1')
+ WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB);
+ else
+ WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB);
+
+ return length;
+}
+SLAB_ATTR(failslab);
#endif
static ssize_t shrink_show(struct kmem_cache *s, char *buf)
@@ -5745,6 +5913,29 @@ STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
#endif /* CONFIG_SLUB_STATS */
+#ifdef CONFIG_KFENCE
+static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE));
+}
+
+static ssize_t skip_kfence_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ int ret = length;
+
+ if (buf[0] == '0')
+ s->flags &= ~SLAB_SKIP_KFENCE;
+ else if (buf[0] == '1')
+ s->flags |= SLAB_SKIP_KFENCE;
+ else
+ ret = -EINVAL;
+
+ return ret;
+}
+SLAB_ATTR(skip_kfence);
+#endif
+
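
failslab_store() and skip_kfence_store() above follow the same pattern: the first character of the sysfs write selects whether a cache flag bit is set or cleared. A small userspace sketch of that parse-and-toggle logic (the flag value and helper name are invented for illustration):

#include <errno.h>
#include <stdio.h>

#define SLAB_SKIP_KFENCE (1U << 3)      /* illustrative bit, not the real value */

static int store_bool_flag(unsigned int *flags, unsigned int bit,
                           const char *buf, size_t length)
{
    if (buf[0] == '0')
        *flags &= ~bit;
    else if (buf[0] == '1')
        *flags |= bit;
    else
        return -EINVAL;
    return (int)length;                 /* sysfs stores return the consumed length */
}

int main(void)
{
    unsigned int flags = 0;

    store_bool_flag(&flags, SLAB_SKIP_KFENCE, "1\n", 2);
    printf("flags=%#x\n", flags);       /* bit set */
    store_bool_flag(&flags, SLAB_SKIP_KFENCE, "0\n", 2);
    printf("flags=%#x\n", flags);       /* bit cleared */
    return 0;
}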
static struct attribute *slab_attrs[] = {
&slab_size_attr.attr,
&object_size_attr.attr,
@@ -5811,7 +6002,12 @@ static struct attribute *slab_attrs[] = {
#ifdef CONFIG_FAILSLAB
&failslab_attr.attr,
#endif
+#ifdef CONFIG_HARDENED_USERCOPY
&usersize_attr.attr,
+#endif
+#ifdef CONFIG_KFENCE
+ &skip_kfence_attr.attr,
+#endif
NULL
};
@@ -5826,7 +6022,6 @@ static ssize_t slab_attr_show(struct kobject *kobj,
{
struct slab_attribute *attribute;
struct kmem_cache *s;
- int err;
attribute = to_slab_attr(attr);
s = to_slab(kobj);
@@ -5834,9 +6029,7 @@ static ssize_t slab_attr_show(struct kobject *kobj,
if (!attribute->show)
return -EIO;
- err = attribute->show(s, buf);
-
- return err;
+ return attribute->show(s, buf);
}
static ssize_t slab_attr_store(struct kobject *kobj,
@@ -5845,7 +6038,6 @@ static ssize_t slab_attr_store(struct kobject *kobj,
{
struct slab_attribute *attribute;
struct kmem_cache *s;
- int err;
attribute = to_slab_attr(attr);
s = to_slab(kobj);
@@ -5853,8 +6045,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
if (!attribute->store)
return -EIO;
- err = attribute->store(s, buf, len);
- return err;
+ return attribute->store(s, buf, len);
}
static void kmem_cache_release(struct kobject *k)
@@ -5879,7 +6070,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s)
return slab_kset;
}
-#define ID_STR_LENGTH 64
+#define ID_STR_LENGTH 32
/* Create a unique string id for a slab cache:
*
@@ -5890,7 +6081,8 @@ static char *create_unique_id(struct kmem_cache *s)
char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
char *p = name;
- BUG_ON(!name);
+ if (!name)
+ return ERR_PTR(-ENOMEM);
*p++ = ':';
/*
@@ -5912,9 +6104,13 @@ static char *create_unique_id(struct kmem_cache *s)
*p++ = 'A';
if (p != name + 1)
*p++ = '-';
- p += sprintf(p, "%07u", s->size);
+ p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size);
- BUG_ON(p > name + ID_STR_LENGTH - 1);
+ if (WARN_ON(p > name + ID_STR_LENGTH - 1)) {
+ kfree(name);
+ return ERR_PTR(-EINVAL);
+ }
+ kmsan_unpoison_memory(name, p - name);
return name;
}
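
create_unique_id() now builds the id with snprintf() bounded by the space actually left and returns an error instead of hitting BUG_ON() on overflow or allocation failure. A simplified userspace version of that pattern (the flag handling is reduced to a single character here):

#include <stdio.h>
#include <stdlib.h>

#define ID_STR_LENGTH 32

static char *make_id(unsigned int size, char flag)
{
    char *name = malloc(ID_STR_LENGTH);
    char *p = name;

    if (!name)
        return NULL;

    *p++ = ':';
    if (flag)
        *p++ = flag;                    /* e.g. 'd' for DMA, 'a' for RECLAIM */
    if (p != name + 1)
        *p++ = '-';
    /* never write past the buffer, whatever 'size' turns out to be */
    p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", size);

    if (p > name + ID_STR_LENGTH - 1) { /* output was truncated */
        free(name);
        return NULL;
    }
    return name;
}

int main(void)
{
    char *id = make_id(4096, 'd');

    if (id) {
        puts(id);                       /* prints ":d-0004096" */
        free(id);
    }
    return 0;
}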
@@ -5925,11 +6121,6 @@ static int sysfs_slab_add(struct kmem_cache *s)
struct kset *kset = cache_kset(s);
int unmergeable = slab_unmergeable(s);
- if (!kset) {
- kobject_init(&s->kobj, &slab_ktype);
- return 0;
- }
-
if (!unmergeable && disable_higher_order_debug &&
(slub_debug & DEBUG_METADATA_FLAGS))
unmergeable = 1;
@@ -5948,6 +6139,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
* for the symlinks.
*/
name = create_unique_id(s);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
}
s->kobj.kset = kset;
@@ -6016,6 +6209,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
al->name = name;
al->next = alias_list;
alias_list = al;
+ kmsan_unpoison_memory(al, sizeof(*al));
return 0;
}
@@ -6056,9 +6250,8 @@ static int __init slab_sysfs_init(void)
mutex_unlock(&slab_mutex);
return 0;
}
-
-__initcall(slab_sysfs_init);
-#endif /* CONFIG_SYSFS */
+late_initcall(slab_sysfs_init);
+#endif /* SLAB_SUPPORTS_SYSFS */
#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
static int slab_debugfs_show(struct seq_file *seq, void *v)
@@ -6078,6 +6271,10 @@ static int slab_debugfs_show(struct seq_file *seq, void *v)
else
seq_puts(seq, "<not-available>");
+ if (l->waste)
+ seq_printf(seq, " waste=%lu/%lu",
+ l->count * l->waste, l->waste);
+
if (l->sum_time != l->min_time) {
seq_printf(seq, " age=%ld/%llu/%ld",
l->min_time, div_u64(l->sum_time, l->count),
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 46ae542118c0..c5398a5960d0 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -196,6 +196,10 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
return pmd;
}
+void __weak __meminit pmd_init(void *addr)
+{
+}
+
pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
pud_t *pud = pud_offset(p4d, addr);
@@ -203,11 +207,16 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
+ pmd_init(p);
pud_populate(&init_mm, pud, p);
}
return pud;
}
+void __weak __meminit pud_init(void *addr)
+{
+}
+
p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
{
p4d_t *p4d = p4d_offset(pgd, addr);
@@ -215,6 +224,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
+ pud_init(p);
p4d_populate(&init_mm, p4d, p);
}
return p4d;
@@ -285,6 +295,69 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
return vmemmap_populate_range(start, end, node, altmap, NULL);
}
+void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
+ unsigned long addr, unsigned long next)
+{
+}
+
+int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
+ unsigned long addr, unsigned long next)
+{
+ return 0;
+}
+
+int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
+ int node, struct vmem_altmap *altmap)
+{
+ unsigned long addr;
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ for (addr = start; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
+
+ pgd = vmemmap_pgd_populate(addr, node);
+ if (!pgd)
+ return -ENOMEM;
+
+ p4d = vmemmap_p4d_populate(pgd, addr, node);
+ if (!p4d)
+ return -ENOMEM;
+
+ pud = vmemmap_pud_populate(p4d, addr, node);
+ if (!pud)
+ return -ENOMEM;
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(READ_ONCE(*pmd))) {
+ void *p;
+
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
+ if (p) {
+ vmemmap_set_pmd(pmd, p, node, addr, next);
+ continue;
+ } else if (altmap) {
+ /*
+ * No fallback: In any case we care about, the
+ * altmap should be reasonably sized and aligned
+ * such that vmemmap_alloc_block_buf() will always
+ * succeed. For consistency with the PTE case,
+ * return an error here as failure could indicate
+ * a configuration issue with the size of the altmap.
+ */
+ return -ENOMEM;
+ }
+ } else if (vmemmap_check_pmd(pmd, node, addr, next))
+ continue;
+ if (vmemmap_populate_basepages(addr, next, node, altmap))
+ return -ENOMEM;
+ }
+ return 0;
+}
+
/*
* For compound pages bigger than section size (e.g. x86 1G compound
* pages with 2M subsection size) fill the rest of sections as tail
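
The new vmemmap_populate_hugepages() walks the range in PMD-sized steps, maps each step with one huge block when the allocation succeeds, and otherwise falls back to base pages (except for the altmap case called out in the comment). A compact userspace model of that walk, with a fake allocator standing in for vmemmap_alloc_block_buf():

#include <stdbool.h>
#include <stdio.h>

#define PMD_SIZE (2UL << 20)

/* pretend huge-block allocator that runs dry after a few blocks */
static bool alloc_huge_block(void)
{
    static int budget = 3;
    return budget-- > 0;
}

static void populate_basepages(unsigned long start, unsigned long end)
{
    printf("  base pages for %#lx-%#lx\n", start, end);
}

static void populate_hugepages(unsigned long start, unsigned long end)
{
    for (unsigned long addr = start; addr < end; ) {
        /* pmd_addr_end(): next PMD boundary, clamped to end */
        unsigned long next = (addr + PMD_SIZE) & ~(PMD_SIZE - 1);

        if (next > end)
            next = end;
        if (alloc_huge_block())
            printf("PMD mapping for %#lx-%#lx\n", addr, next);
        else
            populate_basepages(addr, next);
        addr = next;
    }
}

int main(void)
{
    populate_hugepages(0, 5 * PMD_SIZE);
    return 0;
}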
diff --git a/mm/sparse.c b/mm/sparse.c
index e5a8a3a0edd7..2779b419ef2a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -926,8 +926,6 @@ void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
unsigned long nr_pages, unsigned long map_offset,
struct vmem_altmap *altmap)
{
- clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
- nr_pages - map_offset);
section_deactivate(pfn, nr_pages, altmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/swap.c b/mm/swap.c
index 9cee7f6a3809..70e2063ef43a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -43,8 +43,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>
-/* How many pages do we try to swap or page in/out together? */
+/* How many pages do we try to swap or page in/out together? As a power of 2 */
int page_cluster;
+const int page_cluster_max = 31;
/* Protecting only lru_rotate.fbatch which requires disabling interrupts */
struct lru_rotate {
@@ -295,8 +296,20 @@ void folio_rotate_reclaimable(struct folio *folio)
}
}
-void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
+void lru_note_cost(struct lruvec *lruvec, bool file,
+ unsigned int nr_io, unsigned int nr_rotated)
{
+ unsigned long cost;
+
+ /*
+ * Reflect the relative cost of incurring IO and spending CPU
+ * time on rotations. This doesn't attempt to make a precise
+ * comparison, it just says: if reloads are about comparable
+ * between the LRU lists, or rotations are overwhelmingly
+ * different between them, adjust scan balance for CPU work.
+ */
+ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;
+
do {
unsigned long lrusize;
@@ -310,9 +323,9 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
spin_lock_irq(&lruvec->lru_lock);
/* Record cost event */
if (file)
- lruvec->file_cost += nr_pages;
+ lruvec->file_cost += cost;
else
- lruvec->anon_cost += nr_pages;
+ lruvec->anon_cost += cost;
/*
* Decay previous events
@@ -335,10 +348,10 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
} while ((lruvec = parent_lruvec(lruvec)));
}
-void lru_note_cost_folio(struct folio *folio)
+void lru_note_cost_refault(struct folio *folio)
{
lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
- folio_nr_pages(folio));
+ folio_nr_pages(folio), 0);
}
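
The cost computation above weighs one refault or pageout I/O as heavily as SWAP_CLUSTER_MAX in-memory rotations before feeding it into the existing decay logic. A quick numeric sketch, assuming the common SWAP_CLUSTER_MAX of 32:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32

static unsigned long lru_cost(unsigned int nr_io, unsigned int nr_rotated)
{
    return (unsigned long)nr_io * SWAP_CLUSTER_MAX + nr_rotated;
}

int main(void)
{
    /* 4 refaulted folios dominate 50 cheap rotations: 128 + 50 = 178 */
    printf("cost = %lu\n", lru_cost(4, 50));
    return 0;
}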
static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio)
@@ -366,7 +379,7 @@ static void folio_activate_drain(int cpu)
folio_batch_move_lru(fbatch, folio_activate_fn);
}
-static void folio_activate(struct folio *folio)
+void folio_activate(struct folio *folio)
{
if (folio_test_lru(folio) && !folio_test_active(folio) &&
!folio_test_unevictable(folio)) {
@@ -385,7 +398,7 @@ static inline void folio_activate_drain(int cpu)
{
}
-static void folio_activate(struct folio *folio)
+void folio_activate(struct folio *folio)
{
struct lruvec *lruvec;
@@ -428,6 +441,40 @@ static void __lru_cache_activate_folio(struct folio *folio)
local_unlock(&cpu_fbatches.lock);
}
+#ifdef CONFIG_LRU_GEN
+static void folio_inc_refs(struct folio *folio)
+{
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+
+ if (folio_test_unevictable(folio))
+ return;
+
+ if (!folio_test_referenced(folio)) {
+ folio_set_referenced(folio);
+ return;
+ }
+
+ if (!folio_test_workingset(folio)) {
+ folio_set_workingset(folio);
+ return;
+ }
+
+ /* see the comment on MAX_NR_TIERS */
+ do {
+ new_flags = old_flags & LRU_REFS_MASK;
+ if (new_flags == LRU_REFS_MASK)
+ break;
+
+ new_flags += BIT(LRU_REFS_PGOFF);
+ new_flags |= old_flags & ~LRU_REFS_MASK;
+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+}
+#else
+static void folio_inc_refs(struct folio *folio)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
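
folio_inc_refs() packs a small access counter into a few folio flag bits and bumps it with a compare-exchange loop, saturating once all counter bits are set. A userspace model of that saturating counter (the bit layout below is invented, not the kernel's LRU_REFS layout):

#include <stdatomic.h>
#include <stdio.h>

#define LRU_REFS_PGOFF  8
#define LRU_REFS_MASK   (0x3UL << LRU_REFS_PGOFF)   /* 2-bit counter */

static void inc_refs(_Atomic unsigned long *flags)
{
    unsigned long old = atomic_load(flags), new;

    do {
        new = old & LRU_REFS_MASK;
        if (new == LRU_REFS_MASK)       /* already saturated */
            return;
        new += 1UL << LRU_REFS_PGOFF;
        new |= old & ~LRU_REFS_MASK;    /* keep the other flag bits */
    } while (!atomic_compare_exchange_weak(flags, &old, new));
}

int main(void)
{
    _Atomic unsigned long flags = 0x1;  /* some unrelated flag already set */

    for (int i = 0; i < 5; i++)
        inc_refs(&flags);
    /* counter saturates at 3, unrelated bit survives: prints 0x301 */
    printf("flags=%#lx\n", (unsigned long)atomic_load(&flags));
    return 0;
}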
/*
* Mark a page as having seen activity.
*
@@ -440,6 +487,11 @@ static void __lru_cache_activate_folio(struct folio *folio)
*/
void folio_mark_accessed(struct folio *folio)
{
+ if (lru_gen_enabled()) {
+ folio_inc_refs(folio);
+ return;
+ }
+
if (!folio_test_referenced(folio)) {
folio_set_referenced(folio);
} else if (folio_test_unevictable(folio)) {
@@ -484,6 +536,11 @@ void folio_add_lru(struct folio *folio)
folio_test_unevictable(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ /* see the comment in lru_gen_add_folio() */
+ if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
+ lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
+ folio_set_active(folio);
+
folio_get(folio);
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
@@ -493,22 +550,21 @@ void folio_add_lru(struct folio *folio)
EXPORT_SYMBOL(folio_add_lru);
/**
- * lru_cache_add_inactive_or_unevictable
- * @page: the page to be added to LRU
- * @vma: vma in which page is mapped for determining reclaimability
+ * folio_add_lru_vma() - Add a folio to the appropriate LRU list for this VMA.
+ * @folio: The folio to be added to the LRU.
+ * @vma: VMA in which the folio is mapped.
*
- * Place @page on the inactive or unevictable LRU list, depending on its
- * evictability.
+ * If the VMA is mlocked, @folio is added to the unevictable list.
+ * Otherwise, it is treated the same way as folio_add_lru().
*/
-void lru_cache_add_inactive_or_unevictable(struct page *page,
- struct vm_area_struct *vma)
+void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma)
{
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED))
- mlock_new_page(page);
+ mlock_new_page(&folio->page);
else
- lru_cache_add(page);
+ folio_add_lru(folio);
}
/*
@@ -575,7 +631,7 @@ static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio)
static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio)
{
- if (folio_test_active(folio) && !folio_test_unevictable(folio)) {
+ if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) {
long nr_pages = folio_nr_pages(folio);
lruvec_del_folio(lruvec, folio);
@@ -688,8 +744,8 @@ void deactivate_page(struct page *page)
{
struct folio *folio = page_folio(page);
- if (folio_test_lru(folio) && folio_test_active(folio) &&
- !folio_test_unevictable(folio)) {
+ if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
+ (folio_test_active(folio) || lru_gen_enabled())) {
struct folio_batch *fbatch;
folio_get(folio);
@@ -925,22 +981,30 @@ void lru_cache_disable(void)
/**
* release_pages - batched put_page()
- * @pages: array of pages to release
+ * @arg: array of pages to release
* @nr: number of pages
*
- * Decrement the reference count on all the pages in @pages. If it
+ * Decrement the reference count on all the pages in @arg. If it
* fell to zero, remove the page from the LRU and free it.
+ *
+ * Note that the argument can be an array of pages, encoded pages,
+ * or folio pointers. We ignore any encoded bits, and turn any of
+ * them into just a folio that gets free'd.
*/
-void release_pages(struct page **pages, int nr)
+void release_pages(release_pages_arg arg, int nr)
{
int i;
+ struct encoded_page **encoded = arg.encoded_pages;
LIST_HEAD(pages_to_free);
struct lruvec *lruvec = NULL;
unsigned long flags = 0;
unsigned int lock_batch;
for (i = 0; i < nr; i++) {
- struct folio *folio = page_folio(pages[i]);
+ struct folio *folio;
+
+ /* Turn any of the argument types into a folio */
+ folio = page_folio(encoded_page_ptr(encoded[i]));
/*
* Make sure the IRQ-safe lock-holding time does not get
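
release_pages() now accepts encoded page pointers, where spare low bits of an aligned pointer carry extra state and encoded_page_ptr() masks them off before use. A generic userspace sketch of that pointer-tagging idea (struct layout and bit count are illustrative):

#include <stdint.h>
#include <stdio.h>

#define ENCODED_BITS 3UL    /* two low bits are free: the object is at least 4-byte aligned */

struct page { int id; };

static void *encode_page(struct page *p, unsigned long bits)
{
    return (void *)((uintptr_t)p | ((uintptr_t)bits & ENCODED_BITS));
}

static struct page *encoded_page_ptr(void *enc)
{
    return (struct page *)((uintptr_t)enc & ~(uintptr_t)ENCODED_BITS);
}

int main(void)
{
    struct page pg = { .id = 42 };
    void *enc = encode_page(&pg, 1);    /* stash a tag in bit 0 */

    /* consumers strip the tag and just see the page */
    printf("tag=%lu page id=%d\n",
           (unsigned long)((uintptr_t)enc & ENCODED_BITS),
           encoded_page_ptr(enc)->id);
    return 0;
}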
diff --git a/mm/swap.h b/mm/swap.h
index 17936e068c1c..f78065c8ef52 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -18,9 +18,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
}
void swap_write_unplug(struct swap_iocb *sio);
int swap_writepage(struct page *page, struct writeback_control *wbc);
-void end_swap_bio_write(struct bio *bio);
-int __swap_writepage(struct page *page, struct writeback_control *wbc,
- bio_end_io_t end_write_func);
+int __swap_writepage(struct page *page, struct writeback_control *wbc);
/* linux/mm/swap_state.c */
/* One swap address space for each 64M swap space */
@@ -34,17 +32,17 @@ extern struct address_space *swapper_spaces[];
void show_swap_cache_info(void);
bool add_to_swap(struct folio *folio);
void *get_shadow_from_swap_cache(swp_entry_t entry);
-int add_to_swap_cache(struct page *page, swp_entry_t entry,
+int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
gfp_t gfp, void **shadowp);
void __delete_from_swap_cache(struct folio *folio,
swp_entry_t entry, void *shadow);
void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
-struct page *lookup_swap_cache(swp_entry_t entry,
- struct vm_area_struct *vma,
- unsigned long addr);
-struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index);
+struct folio *swap_cache_get_folio(swp_entry_t entry,
+ struct vm_area_struct *vma, unsigned long addr);
+struct folio *filemap_get_incore_folio(struct address_space *mapping,
+ pgoff_t index);
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
@@ -101,17 +99,17 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
return 0;
}
-static inline struct page *lookup_swap_cache(swp_entry_t swp,
- struct vm_area_struct *vma,
- unsigned long addr)
+static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
+ struct vm_area_struct *vma, unsigned long addr)
{
return NULL;
}
static inline
-struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
+struct folio *filemap_get_incore_folio(struct address_space *mapping,
+ pgoff_t index)
{
- return find_get_page(mapping, index);
+ return filemap_get_folio(mapping, index);
}
static inline bool add_to_swap(struct folio *folio)
@@ -124,7 +122,7 @@ static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
return NULL;
}
-static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
+static inline int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
gfp_t gfp_mask, void **shadowp)
{
return -1;
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index 5a9442979a18..db6c4a26cf59 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -170,6 +170,9 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
unsigned long length;
struct swap_cgroup_ctrl *ctrl;
+ if (mem_cgroup_disabled())
+ return 0;
+
length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
array = vcalloc(length, sizeof(void *));
@@ -204,6 +207,9 @@ void swap_cgroup_swapoff(int type)
unsigned long i, length;
struct swap_cgroup_ctrl *ctrl;
+ if (mem_cgroup_disabled())
+ return;
+
mutex_lock(&swap_cgroup_mutex);
ctrl = &swap_cgroup_ctrl[type];
map = ctrl->map;
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 10b94d64cc25..0bec1f705f8e 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -343,7 +343,7 @@ repeat:
get_swap_pages(1, &entry, 1);
out:
if (mem_cgroup_try_charge_swap(folio, entry)) {
- put_swap_page(&folio->page, entry);
+ put_swap_folio(folio, entry);
entry.val = 0;
}
return entry;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e166051566f4..2927507b43d8 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -85,21 +85,21 @@ void *get_shadow_from_swap_cache(swp_entry_t entry)
* add_to_swap_cache resembles filemap_add_folio on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
-int add_to_swap_cache(struct page *page, swp_entry_t entry,
+int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
gfp_t gfp, void **shadowp)
{
struct address_space *address_space = swap_address_space(entry);
pgoff_t idx = swp_offset(entry);
- XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
- unsigned long i, nr = thp_nr_pages(page);
+ XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
+ unsigned long i, nr = folio_nr_pages(folio);
void *old;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageSwapCache(page), page);
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
- page_ref_add(page, nr);
- SetPageSwapCache(page);
+ folio_ref_add(folio, nr);
+ folio_set_swapcache(folio);
do {
xas_lock_irq(&xas);
@@ -107,19 +107,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
if (xas_error(&xas))
goto unlock;
for (i = 0; i < nr; i++) {
- VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
+ VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
old = xas_load(&xas);
if (xa_is_value(old)) {
if (shadowp)
*shadowp = old;
}
- set_page_private(page + i, entry.val + i);
- xas_store(&xas, page);
+ set_page_private(folio_page(folio, i), entry.val + i);
+ xas_store(&xas, folio);
xas_next(&xas);
}
address_space->nrpages += nr;
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
- __mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
+ __node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
+ __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
@@ -127,8 +127,8 @@ unlock:
if (!xas_error(&xas))
return 0;
- ClearPageSwapCache(page);
- page_ref_sub(page, nr);
+ folio_clear_swapcache(folio);
+ folio_ref_sub(folio, nr);
return xas_error(&xas);
}
@@ -151,7 +151,7 @@ void __delete_from_swap_cache(struct folio *folio,
for (i = 0; i < nr; i++) {
void *entry = xas_store(&xas, shadow);
- VM_BUG_ON_FOLIO(entry != folio, folio);
+ VM_BUG_ON_PAGE(entry != folio, entry);
set_page_private(folio_page(folio, i), 0);
xas_next(&xas);
}
@@ -194,7 +194,7 @@ bool add_to_swap(struct folio *folio)
/*
* Add it to the swap cache.
*/
- err = add_to_swap_cache(&folio->page, entry,
+ err = add_to_swap_cache(folio, entry,
__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
if (err)
/*
@@ -218,7 +218,7 @@ bool add_to_swap(struct folio *folio)
return true;
fail:
- put_swap_page(&folio->page, entry);
+ put_swap_folio(folio, entry);
return false;
}
@@ -237,7 +237,7 @@ void delete_from_swap_cache(struct folio *folio)
__delete_from_swap_cache(folio, entry, NULL);
xa_unlock_irq(&address_space->i_pages);
- put_swap_page(&folio->page, entry);
+ put_swap_folio(folio, entry);
folio_ref_sub(folio, folio_nr_pages(folio));
}
@@ -272,16 +272,19 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
/*
* If we are the only user, then try to free up the swap cache.
*
- * Its ok to check for PageSwapCache without the page lock
+ * Its ok to check the swapcache flag without the folio lock
* here because we are going to recheck again inside
- * try_to_free_swap() _with_ the lock.
+ * folio_free_swap() _with_ the lock.
* - Marcelo
*/
void free_swap_cache(struct page *page)
{
- if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
- try_to_free_swap(page);
- unlock_page(page);
+ struct folio *folio = page_folio(page);
+
+ if (folio_test_swapcache(folio) && !folio_mapped(folio) &&
+ folio_trylock(folio)) {
+ folio_free_swap(folio);
+ folio_unlock(folio);
}
}
@@ -300,15 +303,12 @@ void free_page_and_swap_cache(struct page *page)
* Passed an array of pages, drop them all from swapcache and then release
* them. They are removed from the LRU and freed if this is their last use.
*/
-void free_pages_and_swap_cache(struct page **pages, int nr)
+void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
{
- struct page **pagep = pages;
- int i;
-
lru_add_drain();
- for (i = 0; i < nr; i++)
- free_swap_cache(pagep[i]);
- release_pages(pagep, nr);
+ for (int i = 0; i < nr; i++)
+ free_swap_cache(encoded_page_ptr(pages[i]));
+ release_pages(pages, nr);
}
static inline bool swap_use_vma_readahead(void)
@@ -317,24 +317,24 @@ static inline bool swap_use_vma_readahead(void)
}
/*
- * Lookup a swap entry in the swap cache. A found page will be returned
+ * Lookup a swap entry in the swap cache. A found folio will be returned
* unlocked and with its refcount incremented - we rely on the kernel
- * lock getting page table operations atomic even if we drop the page
+ * lock getting page table operations atomic even if we drop the folio
* lock before returning.
*/
-struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
- unsigned long addr)
+struct folio *swap_cache_get_folio(swp_entry_t entry,
+ struct vm_area_struct *vma, unsigned long addr)
{
- struct page *page;
+ struct folio *folio;
struct swap_info_struct *si;
si = get_swap_device(entry);
if (!si)
return NULL;
- page = find_get_page(swap_address_space(entry), swp_offset(entry));
+ folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
put_swap_device(si);
- if (page) {
+ if (folio) {
bool vma_ra = swap_use_vma_readahead();
bool readahead;
@@ -342,10 +342,10 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
* At the moment, we don't support PG_readahead for anon THP
* so let's bail out rather than confusing the readahead stat.
*/
- if (unlikely(PageTransCompound(page)))
- return page;
+ if (unlikely(folio_test_large(folio)))
+ return folio;
- readahead = TestClearPageReadahead(page);
+ readahead = folio_test_clear_readahead(folio);
if (vma && vma_ra) {
unsigned long ra_val;
int win, hits;
@@ -366,34 +366,32 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
}
}
- return page;
+ return folio;
}
/**
- * find_get_incore_page - Find and get a page from the page or swap caches.
+ * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
* @mapping: The address_space to search.
* @index: The page cache index.
*
- * This differs from find_get_page() in that it will also look for the
- * page in the swap cache.
+ * This differs from filemap_get_folio() in that it will also look for the
+ * folio in the swap cache.
*
- * Return: The found page or %NULL.
+ * Return: The found folio or %NULL.
*/
-struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
+struct folio *filemap_get_incore_folio(struct address_space *mapping,
+ pgoff_t index)
{
swp_entry_t swp;
struct swap_info_struct *si;
- struct page *page = pagecache_get_page(mapping, index,
- FGP_ENTRY | FGP_HEAD, 0);
+ struct folio *folio = __filemap_get_folio(mapping, index, FGP_ENTRY, 0);
- if (!page)
- return page;
- if (!xa_is_value(page))
- return find_subpage(page, index);
+ if (!xa_is_value(folio))
+ goto out;
if (!shmem_mapping(mapping))
return NULL;
- swp = radix_to_swp_entry(page);
+ swp = radix_to_swp_entry(folio);
/* There might be swapin error entries in shmem mapping. */
if (non_swap_entry(swp))
return NULL;
@@ -401,9 +399,11 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
si = get_swap_device(swp);
if (!si)
return NULL;
- page = find_get_page(swap_address_space(swp), swp_offset(swp));
+ index = swp_offset(swp);
+ folio = filemap_get_folio(swap_address_space(swp), index);
put_swap_device(si);
- return page;
+out:
+ return folio;
}
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
@@ -411,7 +411,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
bool *new_page_allocated)
{
struct swap_info_struct *si;
- struct page *page;
+ struct folio *folio;
void *shadow = NULL;
*new_page_allocated = false;
@@ -420,17 +420,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
int err;
/*
* First check the swap cache. Since this is normally
- * called after lookup_swap_cache() failed, re-calling
+ * called after swap_cache_get_folio() failed, re-calling
* that would confuse statistics.
*/
si = get_swap_device(entry);
if (!si)
return NULL;
- page = find_get_page(swap_address_space(entry),
- swp_offset(entry));
+ folio = filemap_get_folio(swap_address_space(entry),
+ swp_offset(entry));
put_swap_device(si);
- if (page)
- return page;
+ if (folio)
+ return folio_file_page(folio, swp_offset(entry));
/*
* Just skip read ahead for unused swap slot.
@@ -448,8 +448,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
* cause any racers to loop around until we add it to cache.
*/
- page = alloc_page_vma(gfp_mask, vma, addr);
- if (!page)
+ folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false);
+ if (!folio)
return NULL;
/*
@@ -459,7 +459,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
if (!err)
break;
- put_page(page);
+ folio_put(folio);
if (err != -EEXIST)
return NULL;
@@ -477,30 +477,30 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* The swap entry is ours to swap in. Prepare the new page.
*/
- __SetPageLocked(page);
- __SetPageSwapBacked(page);
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
- if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry))
+ if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry))
goto fail_unlock;
/* May fail (-ENOMEM) if XArray node allocation failed. */
- if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
+ if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
goto fail_unlock;
mem_cgroup_swapin_uncharge_swap(entry);
if (shadow)
- workingset_refault(page_folio(page), shadow);
+ workingset_refault(folio, shadow);
- /* Caller will initiate read into locked page */
- lru_cache_add(page);
+ /* Caller will initiate read into locked folio */
+ folio_add_lru(folio);
*new_page_allocated = true;
- return page;
+ return &folio->page;
fail_unlock:
- put_swap_page(page, entry);
- unlock_page(page);
- put_page(page);
+ put_swap_folio(folio, entry);
+ folio_unlock(folio);
+ folio_put(folio);
return NULL;
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1fdccd2f1422..908a529bca12 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -63,6 +63,10 @@ EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority = -1;
+unsigned long swapfile_maximum_size;
+#ifdef CONFIG_MIGRATION
+bool swap_migration_ad_supported;
+#endif /* CONFIG_MIGRATION */
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
@@ -128,27 +132,27 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
unsigned long offset, unsigned long flags)
{
swp_entry_t entry = swp_entry(si->type, offset);
- struct page *page;
+ struct folio *folio;
int ret = 0;
- page = find_get_page(swap_address_space(entry), offset);
- if (!page)
+ folio = filemap_get_folio(swap_address_space(entry), offset);
+ if (!folio)
return 0;
/*
* When this function is called from scan_swap_map_slots() and it's
- * called by vmscan.c at reclaiming pages. So, we hold a lock on a page,
+ * called by vmscan.c at reclaiming folios. So we hold a folio lock
* here. We have to use trylock for avoiding deadlock. This is a special
- * case and you should use try_to_free_swap() with explicit lock_page()
+ * case and you should use folio_free_swap() with explicit folio_lock()
* in usual operations.
*/
- if (trylock_page(page)) {
+ if (folio_trylock(folio)) {
if ((flags & TTRS_ANYWAY) ||
- ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
- ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
- ret = try_to_free_swap(page);
- unlock_page(page);
+ ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
+ ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)))
+ ret = folio_free_swap(folio);
+ folio_unlock(folio);
}
- put_page(page);
+ folio_put(folio);
return ret;
}
@@ -768,8 +772,7 @@ static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
/* No free swap slots available */
if (si->highest_bit <= si->lowest_bit)
return;
- next = si->lowest_bit +
- prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
+ next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit);
next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
next = max_t(unsigned int, next, si->lowest_bit);
}
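
Both swapfile hunks replace the open-coded "lowest + prandom_u32_max(range)" with a single inclusive-range helper. A userspace stand-in showing the equivalent computation (rand() here is only a placeholder for the kernel RNG and, unlike the kernel helper, has modulo bias):

#include <stdio.h>
#include <stdlib.h>

static unsigned int random_u32_inclusive(unsigned int lo, unsigned int hi)
{
    /* picks from [lo, hi], both ends included */
    return lo + (unsigned int)(rand() % (hi - lo + 1u));
}

int main(void)
{
    srand(1);
    for (int i = 0; i < 5; i++)
        printf("%u\n", random_u32_inclusive(10, 20));
    return 0;
}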
@@ -969,23 +972,23 @@ done:
scan:
spin_unlock(&si->lock);
while (++offset <= READ_ONCE(si->highest_bit)) {
- if (swap_offset_available_and_locked(si, offset))
- goto checks;
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
scanned_many = true;
}
+ if (swap_offset_available_and_locked(si, offset))
+ goto checks;
}
offset = si->lowest_bit;
while (offset < scan_base) {
- if (swap_offset_available_and_locked(si, offset))
- goto checks;
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
scanned_many = true;
}
+ if (swap_offset_available_and_locked(si, offset))
+ goto checks;
offset++;
}
spin_lock(&si->lock);
@@ -1328,7 +1331,7 @@ void swap_free(swp_entry_t entry)
/*
* Called after dropping swapcache to decrease refcnt to swap entries.
*/
-void put_swap_page(struct page *page, swp_entry_t entry)
+void put_swap_folio(struct folio *folio, swp_entry_t entry)
{
unsigned long offset = swp_offset(entry);
unsigned long idx = offset / SWAPFILE_CLUSTER;
@@ -1337,7 +1340,7 @@ void put_swap_page(struct page *page, swp_entry_t entry)
unsigned char *map;
unsigned int i, free_entries = 0;
unsigned char val;
- int size = swap_entry_size(thp_nr_pages(page));
+ int size = swap_entry_size(folio_nr_pages(folio));
si = _swap_info_get(entry);
if (!si)
@@ -1427,30 +1430,6 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
spin_unlock(&p->lock);
}
-/*
- * How many references to page are currently swapped out?
- * This does not give an exact answer when swap count is continued,
- * but does include the high COUNT_CONTINUED flag to allow for that.
- */
-static int page_swapcount(struct page *page)
-{
- int count = 0;
- struct swap_info_struct *p;
- struct swap_cluster_info *ci;
- swp_entry_t entry;
- unsigned long offset;
-
- entry.val = page_private(page);
- p = _swap_info_get(entry);
- if (p) {
- offset = swp_offset(entry);
- ci = lock_cluster_or_swap_info(p, offset);
- count = swap_count(p->swap_map[offset]);
- unlock_cluster_or_swap_info(p, ci);
- }
- return count;
-}
-
int __swap_count(swp_entry_t entry)
{
struct swap_info_struct *si;
@@ -1465,11 +1444,16 @@ int __swap_count(swp_entry_t entry)
return count;
}
+/*
+ * How many references to @entry are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
+ */
static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
{
- int count = 0;
pgoff_t offset = swp_offset(entry);
struct swap_cluster_info *ci;
+ int count;
ci = lock_cluster_or_swap_info(si, offset);
count = swap_count(si->swap_map[offset]);
@@ -1570,56 +1554,59 @@ unlock_out:
static bool folio_swapped(struct folio *folio)
{
- swp_entry_t entry;
- struct swap_info_struct *si;
+ swp_entry_t entry = folio_swap_entry(folio);
+ struct swap_info_struct *si = _swap_info_get(entry);
+
+ if (!si)
+ return false;
if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
- return page_swapcount(&folio->page) != 0;
+ return swap_swapcount(si, entry) != 0;
- entry = folio_swap_entry(folio);
- si = _swap_info_get(entry);
- if (si)
- return swap_page_trans_huge_swapped(si, entry);
- return false;
+ return swap_page_trans_huge_swapped(si, entry);
}
-/*
- * If swap is getting full, or if there are no more mappings of this page,
- * then try_to_free_swap is called to free its swap space.
+/**
+ * folio_free_swap() - Free the swap space used for this folio.
+ * @folio: The folio to remove.
+ *
+ * If swap is getting full, or if there are no more mappings of this folio,
+ * then call folio_free_swap to free its swap space.
+ *
+ * Return: true if we were able to release the swap space.
*/
-int try_to_free_swap(struct page *page)
+bool folio_free_swap(struct folio *folio)
{
- struct folio *folio = page_folio(page);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (!folio_test_swapcache(folio))
- return 0;
+ return false;
if (folio_test_writeback(folio))
- return 0;
+ return false;
if (folio_swapped(folio))
- return 0;
+ return false;
/*
* Once hibernation has begun to create its image of memory,
- * there's a danger that one of the calls to try_to_free_swap()
+ * there's a danger that one of the calls to folio_free_swap()
* - most probably a call from __try_to_reclaim_swap() while
* hibernation is allocating its own swap pages for the image,
* but conceivably even a call from memory reclaim - will free
- * the swap from a page which has already been recorded in the
- * image as a clean swapcache page, and then reuse its swap for
+ * the swap from a folio which has already been recorded in the
+ * image as a clean swapcache folio, and then reuse its swap for
* another page of the image. On waking from hibernation, the
- * original page might be freed under memory pressure, then
+ * original folio might be freed under memory pressure, then
* later read back in from swap, now with the wrong data.
*
* Hibernation suspends storage while it is writing the image
* to disk so check that here.
*/
if (pm_suspended_storage())
- return 0;
+ return false;
delete_from_swap_cache(folio);
folio_set_dirty(folio);
- return 1;
+ return true;
}
/*
@@ -1770,8 +1757,9 @@ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
* force COW, vm_page_prot omits write permission from any private vma.
*/
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, swp_entry_t entry, struct page *page)
+ unsigned long addr, swp_entry_t entry, struct folio *folio)
{
+ struct page *page = folio_file_page(folio, swp_offset(entry));
struct page *swapcache;
spinlock_t *ptl;
pte_t *pte, new_pte;
@@ -1792,7 +1780,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
pte_t pteval;
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
- pteval = swp_entry_to_pte(make_swapin_error_entry(page));
+ pteval = swp_entry_to_pte(make_swapin_error_entry());
set_pte_at(vma->vm_mm, addr, pte, pteval);
swap_free(entry);
ret = 0;
@@ -1843,17 +1831,18 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned int type)
{
- struct page *page;
swp_entry_t entry;
pte_t *pte;
struct swap_info_struct *si;
- unsigned long offset;
int ret = 0;
volatile unsigned char *swap_map;
si = swap_info[type];
pte = pte_offset_map(pmd, addr);
do {
+ struct folio *folio;
+ unsigned long offset;
+
if (!is_swap_pte(*pte))
continue;
@@ -1864,8 +1853,9 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
offset = swp_offset(entry);
pte_unmap(pte);
swap_map = &si->swap_map[offset];
- page = lookup_swap_cache(entry, vma, addr);
- if (!page) {
+ folio = swap_cache_get_folio(entry, vma, addr);
+ if (!folio) {
+ struct page *page;
struct vm_fault vmf = {
.vma = vma,
.address = addr,
@@ -1875,25 +1865,27 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
&vmf);
+ if (page)
+ folio = page_folio(page);
}
- if (!page) {
+ if (!folio) {
if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
goto try_next;
return -ENOMEM;
}
- lock_page(page);
- wait_on_page_writeback(page);
- ret = unuse_pte(vma, pmd, addr, entry, page);
+ folio_lock(folio);
+ folio_wait_writeback(folio);
+ ret = unuse_pte(vma, pmd, addr, entry, folio);
if (ret < 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
goto out;
}
- try_to_free_swap(page);
- unlock_page(page);
- put_page(page);
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ folio_put(folio);
try_next:
pte = pte_offset_map(pmd, addr);
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -1990,14 +1982,16 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type)
{
struct vm_area_struct *vma;
int ret = 0;
+ VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (vma->anon_vma) {
ret = unuse_vma(vma, type);
if (ret)
break;
}
+
cond_resched();
}
mmap_read_unlock(mm);
@@ -2042,7 +2036,7 @@ static int try_to_unuse(unsigned int type)
struct list_head *p;
int retval = 0;
struct swap_info_struct *si = swap_info[type];
- struct page *page;
+ struct folio *folio;
swp_entry_t entry;
unsigned int i;
@@ -2092,21 +2086,21 @@ retry:
(i = find_next_to_unuse(si, i)) != 0) {
entry = swp_entry(type, i);
- page = find_get_page(swap_address_space(entry), i);
- if (!page)
+ folio = filemap_get_folio(swap_address_space(entry), i);
+ if (!folio)
continue;
/*
- * It is conceivable that a racing task removed this page from
- * swap cache just before we acquired the page lock. The page
+ * It is conceivable that a racing task removed this folio from
+ * swap cache just before we acquired the page lock. The folio
* might even be back in swap cache on another swap area. But
- * that is okay, try_to_free_swap() only removes stale pages.
+ * that is okay, folio_free_swap() only removes stale folios.
*/
- lock_page(page);
- wait_on_page_writeback(page);
- try_to_free_swap(page);
- unlock_page(page);
- put_page(page);
+ folio_lock(folio);
+ folio_wait_writeback(folio);
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ folio_put(folio);
}
/*
@@ -2816,7 +2810,7 @@ unsigned long generic_max_swapfile_size(void)
}
/* Can be overridden by an architecture for additional checks. */
-__weak unsigned long max_swapfile_size(void)
+__weak unsigned long arch_max_swapfile_size(void)
{
return generic_max_swapfile_size();
}
@@ -2856,7 +2850,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
p->cluster_next = 1;
p->cluster_nr = 0;
- maxpages = max_swapfile_size();
+ maxpages = swapfile_maximum_size;
last_page = swap_header->info.last_page;
if (!last_page) {
pr_warn("Empty swap-file\n");
@@ -3094,7 +3088,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
*/
for_each_possible_cpu(cpu) {
per_cpu(*p->cluster_next_cpu, cpu) =
- 1 + prandom_u32_max(p->highest_bit);
+ get_random_u32_inclusive(1, p->highest_bit);
}
nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
@@ -3655,7 +3649,7 @@ void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
avail_lists[nid]) {
if (si->bdev) {
- blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
+ blkcg_schedule_throttle(si->bdev->bd_disk, true);
break;
}
}
@@ -3677,6 +3671,13 @@ static int __init swapfile_init(void)
for_each_node(nid)
plist_head_init(&swap_avail_heads[nid]);
+ swapfile_maximum_size = arch_max_swapfile_size();
+
+#ifdef CONFIG_MIGRATION
+ if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
+ swap_migration_ad_supported = true;
+#endif /* CONFIG_MIGRATION */
+
return 0;
}
subsys_initcall(swapfile_init);
diff --git a/mm/truncate.c b/mm/truncate.c
index 0b0708bf935f..7b4ea4c4a46b 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -240,7 +240,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
folio_invalidate(folio, offset, length);
if (!folio_test_large(folio))
return true;
- if (split_huge_page(&folio->page) == 0)
+ if (split_folio(folio) == 0)
return true;
if (folio_test_dirty(folio))
return false;
@@ -361,9 +361,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
folio_batch_init(&fbatch);
index = start;
- while (index < end && find_lock_entries(mapping, index, end - 1,
+ while (index < end && find_lock_entries(mapping, &index, end - 1,
&fbatch, indices)) {
- index = indices[folio_batch_count(&fbatch) - 1] + 1;
truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
for (i = 0; i < folio_batch_count(&fbatch); i++)
truncate_cleanup_folio(fbatch.folios[i]);
@@ -401,7 +400,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
index = start;
while (index < end) {
cond_resched();
- if (!find_get_entries(mapping, index, end - 1, &fbatch,
+ if (!find_get_entries(mapping, &index, end - 1, &fbatch,
indices)) {
/* If all gone from start onwards, we're done */
if (index == start)
@@ -415,21 +414,18 @@ void truncate_inode_pages_range(struct address_space *mapping,
struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing page->index */
- index = indices[i];
if (xa_is_value(folio))
continue;
folio_lock(folio);
- VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
folio_wait_writeback(folio);
truncate_inode_folio(mapping, folio);
folio_unlock(folio);
- index = folio_index(folio) + folio_nr_pages(folio) - 1;
}
truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
folio_batch_release(&fbatch);
- index++;
}
}
EXPORT_SYMBOL(truncate_inode_pages_range);
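
The truncate and invalidate loops above now pass the index by reference so find_lock_entries()/find_get_entries() leave it pointing just past the batch they returned, instead of each caller recomputing it from the batch contents. A small userspace model of that cursor-advancing lookup (the array-backed "mapping" and batch size are illustrative):

#include <stddef.h>
#include <stdio.h>

#define BATCH 3

/* pretend page cache: entry i exists iff items[i] is nonzero */
static const int items[10] = { 1, 1, 0, 1, 1, 1, 0, 0, 1, 1 };

static size_t find_entries(size_t *index, size_t end, int *batch)
{
    size_t n = 0;

    while (*index < end && n < BATCH) {
        if (items[*index])
            batch[n++] = (int)*index;
        (*index)++;                     /* caller's cursor advances as a side effect */
    }
    return n;
}

int main(void)
{
    size_t index = 0, n;
    int batch[BATCH];

    while ((n = find_entries(&index, 10, batch))) {
        printf("batch:");
        for (size_t i = 0; i < n; i++)
            printf(" %d", batch[i]);
        printf("  (next index %zu)\n", index);
    }
    return 0;
}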
@@ -510,20 +506,17 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
int i;
folio_batch_init(&fbatch);
- while (find_lock_entries(mapping, index, end, &fbatch, indices)) {
+ while (find_lock_entries(mapping, &index, end, &fbatch, indices)) {
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing folio->index */
- index = indices[i];
if (xa_is_value(folio)) {
count += invalidate_exceptional_entry(mapping,
- index,
- folio);
+ indices[i], folio);
continue;
}
- index += folio_nr_pages(folio) - 1;
ret = mapping_evict_folio(mapping, folio);
folio_unlock(folio);
@@ -542,7 +535,6 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
cond_resched();
- index++;
}
return count;
}
@@ -573,7 +565,7 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
* refcount. We do this because invalidate_inode_pages2() needs stronger
* invalidation guarantees, and cannot afford to leave pages behind because
* shrink_page_list() has a temp ref on them, or because they're transiently
- * sitting in the lru_cache_add() pagevecs.
+ * sitting in the folio_add_lru() pagevecs.
*/
static int invalidate_complete_folio2(struct address_space *mapping,
struct folio *folio)
@@ -641,16 +633,15 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
folio_batch_init(&fbatch);
index = start;
- while (find_get_entries(mapping, index, end, &fbatch, indices)) {
+ while (find_get_entries(mapping, &index, end, &fbatch, indices)) {
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing folio->index */
- index = indices[i];
if (xa_is_value(folio)) {
if (!invalidate_exceptional_entry2(mapping,
- index, folio))
+ indices[i], folio))
ret = -EBUSY;
continue;
}
@@ -660,13 +651,13 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
* If folio is mapped, before taking its lock,
* zap the rest of the file in one hit.
*/
- unmap_mapping_pages(mapping, index,
- (1 + end - index), false);
+ unmap_mapping_pages(mapping, indices[i],
+ (1 + end - indices[i]), false);
did_range_unmap = 1;
}
folio_lock(folio);
- VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
if (folio->mapping != mapping) {
folio_unlock(folio);
continue;
@@ -689,7 +680,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
cond_resched();
- index++;
}
/*
* For DAX we invalidate page tables after invalidating page cache. We
diff --git a/mm/usercopy.c b/mm/usercopy.c
index c1ee15a98633..4c3164beacec 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -12,6 +12,7 @@
#include <linux/mm.h>
#include <linux/highmem.h>
+#include <linux/kstrtox.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
@@ -258,7 +259,7 @@ static bool enable_checks __initdata = true;
static int __init parse_hardened_usercopy(char *str)
{
- if (strtobool(str, &enable_checks))
+ if (kstrtobool(str, &enable_checks))
pr_warn("Invalid option string for hardened_usercopy: '%s'\n",
str);
return 1;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 7327b2573f7c..0499907b6f1a 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -64,8 +64,9 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
pte_t _dst_pte, *dst_pte;
bool writable = dst_vma->vm_flags & VM_WRITE;
bool vm_shared = dst_vma->vm_flags & VM_SHARED;
- bool page_in_cache = page->mapping;
+ bool page_in_cache = page_mapping(page);
spinlock_t *ptl;
+ struct folio *folio;
struct inode *inode;
pgoff_t offset, max_off;
@@ -113,14 +114,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
if (!pte_none_mostly(*dst_pte))
goto out_unlock;
+ folio = page_folio(page);
if (page_in_cache) {
/* Usually, cache pages are already added to LRU */
if (newly_allocated)
- lru_cache_add(page);
+ folio_add_lru(folio);
page_add_file_rmap(page, dst_vma, false);
} else {
page_add_new_anon_rmap(page, dst_vma, dst_addr);
- lru_cache_add_inactive_or_unevictable(page, dst_vma);
+ folio_add_lru_vma(folio, dst_vma);
}
/*
@@ -157,11 +159,28 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
if (!page)
goto out;
- page_kaddr = kmap_atomic(page);
+ page_kaddr = kmap_local_page(page);
+ /*
+ * The read mmap_lock is held here. Despite the
+ * mmap_lock being read recursive a deadlock is still
+ * possible if a writer has taken a lock. For example:
+ *
+ * process A thread 1 takes read lock on own mmap_lock
+ * process A thread 2 calls mmap, blocks taking write lock
+ * process B thread 1 takes page fault, read lock on own mmap lock
+ * process B thread 2 calls mmap, blocks taking write lock
+ * process A thread 1 blocks taking read lock on process B
+ * process B thread 1 blocks taking read lock on process A
+ *
+ * Disable page faults to prevent potential deadlock
+ * and retry the copy outside the mmap_lock.
+ */
+ pagefault_disable();
ret = copy_from_user(page_kaddr,
(const void __user *) src_addr,
PAGE_SIZE);
- kunmap_atomic(page_kaddr);
+ pagefault_enable();
+ kunmap_local(page_kaddr);
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
@@ -243,20 +262,22 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
{
struct inode *inode = file_inode(dst_vma->vm_file);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+ struct folio *folio;
struct page *page;
int ret;
- ret = shmem_getpage(inode, pgoff, &page, SGP_NOALLOC);
- /* Our caller expects us to return -EFAULT if we failed to find page. */
+ ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
+ /* Our caller expects us to return -EFAULT if we failed to find folio */
if (ret == -ENOENT)
ret = -EFAULT;
if (ret)
goto out;
- if (!page) {
+ if (!folio) {
ret = -EFAULT;
goto out;
}
+ page = folio_file_page(folio, pgoff);
if (PageHWPoison(page)) {
ret = -EIO;
goto out_release;
@@ -267,13 +288,13 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
if (ret)
goto out_release;
- unlock_page(page);
+ folio_unlock(folio);
ret = 0;
out:
return ret;
out_release:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
goto out;
}
@@ -377,30 +398,30 @@ retry:
BUG_ON(dst_addr >= dst_start + len);
/*
- * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
- * i_mmap_rwsem ensures the dst_pte remains valid even
+ * Serialize via vma_lock and hugetlb_fault_mutex.
+ * vma_lock ensures the dst_pte remains valid even
* in the case of shared pmds. fault mutex prevents
* races with other faulting threads.
*/
- mapping = dst_vma->vm_file->f_mapping;
- i_mmap_lock_read(mapping);
idx = linear_page_index(dst_vma, dst_addr);
+ mapping = dst_vma->vm_file->f_mapping;
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_vma_lock_read(dst_vma);
err = -ENOMEM;
dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
if (!dst_pte) {
+ hugetlb_vma_unlock_read(dst_vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
goto out_unlock;
}
if (mode != MCOPY_ATOMIC_CONTINUE &&
!huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
err = -EEXIST;
+ hugetlb_vma_unlock_read(dst_vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
goto out_unlock;
}
@@ -408,8 +429,8 @@ retry:
dst_addr, src_addr, mode, &page,
wp_copy);
+ hugetlb_vma_unlock_read(dst_vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
cond_resched();
@@ -611,7 +632,7 @@ retry:
break;
}
- dst_pmdval = pmd_read_atomic(dst_pmd);
+ dst_pmdval = pmdp_get_lockless(dst_pmd);
/*
* If the dst_pmd is mapped as THP don't
* override it and just be strict.
@@ -644,11 +665,11 @@ retry:
mmap_read_unlock(dst_mm);
BUG_ON(!page);
- page_kaddr = kmap(page);
+ page_kaddr = kmap_local_page(page);
err = copy_from_user(page_kaddr,
(const void __user *) src_addr,
PAGE_SIZE);
- kunmap(page);
+ kunmap_local(page_kaddr);
if (unlikely(err)) {
err = -EFAULT;
goto out;
diff --git a/mm/util.c b/mm/util.c
index c9439c66d8cf..b56c92fb910f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -272,38 +272,6 @@ void *memdup_user_nul(const void __user *src, size_t len)
}
EXPORT_SYMBOL(memdup_user_nul);
-void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev)
-{
- struct vm_area_struct *next;
-
- vma->vm_prev = prev;
- if (prev) {
- next = prev->vm_next;
- prev->vm_next = vma;
- } else {
- next = mm->mmap;
- mm->mmap = vma;
- }
- vma->vm_next = next;
- if (next)
- next->vm_prev = vma;
-}
-
-void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- struct vm_area_struct *prev, *next;
-
- next = vma->vm_next;
- prev = vma->vm_prev;
- if (prev)
- prev->vm_next = next;
- else
- mm->mmap = next;
- if (next)
- next->vm_prev = prev;
-}
-
/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
@@ -619,6 +587,10 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
if (ret || size <= PAGE_SIZE)
return ret;
+ /* non-sleeping allocations are not supported by vmalloc */
+ if (!gfpflags_allow_blocking(flags))
+ return NULL;
+
/* Don't even allow crazy sizes */
if (unlikely(size > INT_MAX)) {
WARN_ON_ONCE(!(flags & __GFP_NOWARN));
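
With this check, a kvmalloc_node() call that cannot block never reaches vmalloc: if the kmalloc attempt fails and the request is larger than a page, it now returns NULL instead of attempting a fallback that vmalloc cannot honour. Callers that rely on the vmalloc fallback for large buffers therefore need a blocking GFP mask, roughly as in this sketch:

#include <linux/mm.h>
#include <linux/slab.h>

/* Illustrative only: how the gfp mask decides whether the vmalloc
 * fallback is available for a large (multi-page) request. */
static void *alloc_big_table(size_t size, bool can_sleep)
{
	if (can_sleep)
		/* may fall back to vmalloc if kmalloc of 'size' fails */
		return kvmalloc(size, GFP_KERNEL);

	/*
	 * GFP_ATOMIC (non-blocking): no vmalloc fallback, so this is
	 * effectively just kmalloc() and may return NULL for large sizes.
	 */
	return kvmalloc(size, GFP_ATOMIC);
}

/* kvfree() works for either backing allocation: */
static void free_big_table(void *p)
{
	kvfree(p);
}
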
@@ -745,32 +717,6 @@ void *page_rmapping(struct page *page)
return folio_raw_mapping(page_folio(page));
}
-/**
- * folio_mapped - Is this folio mapped into userspace?
- * @folio: The folio.
- *
- * Return: True if any page in this folio is referenced by user page tables.
- */
-bool folio_mapped(struct folio *folio)
-{
- long i, nr;
-
- if (!folio_test_large(folio))
- return atomic_read(&folio->_mapcount) >= 0;
- if (atomic_read(folio_mapcount_ptr(folio)) >= 0)
- return true;
- if (folio_test_hugetlb(folio))
- return false;
-
- nr = folio_nr_pages(folio);
- for (i = 0; i < nr; i++) {
- if (atomic_read(&folio_page(folio, i)->_mapcount) >= 0)
- return true;
- }
- return false;
-}
-EXPORT_SYMBOL(folio_mapped);
-
struct anon_vma *folio_anon_vma(struct folio *folio)
{
unsigned long mapping = (unsigned long)folio->mapping;
@@ -811,59 +757,6 @@ struct address_space *folio_mapping(struct folio *folio)
}
EXPORT_SYMBOL(folio_mapping);
-/* Slow path of page_mapcount() for compound pages */
-int __page_mapcount(struct page *page)
-{
- int ret;
-
- ret = atomic_read(&page->_mapcount) + 1;
- /*
- * For file THP page->_mapcount contains total number of mapping
- * of the page: no need to look into compound_mapcount.
- */
- if (!PageAnon(page) && !PageHuge(page))
- return ret;
- page = compound_head(page);
- ret += atomic_read(compound_mapcount_ptr(page)) + 1;
- if (PageDoubleMap(page))
- ret--;
- return ret;
-}
-EXPORT_SYMBOL_GPL(__page_mapcount);
-
-/**
- * folio_mapcount() - Calculate the number of mappings of this folio.
- * @folio: The folio.
- *
- * A large folio tracks both how many times the entire folio is mapped,
- * and how many times each individual page in the folio is mapped.
- * This function calculates the total number of times the folio is
- * mapped.
- *
- * Return: The number of times this folio is mapped.
- */
-int folio_mapcount(struct folio *folio)
-{
- int i, compound, nr, ret;
-
- if (likely(!folio_test_large(folio)))
- return atomic_read(&folio->_mapcount) + 1;
-
- compound = folio_entire_mapcount(folio);
- nr = folio_nr_pages(folio);
- if (folio_test_hugetlb(folio))
- return compound;
- ret = compound;
- for (i = 0; i < nr; i++)
- ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1;
- /* File pages has compound_mapcount included in _mapcount */
- if (!folio_test_anon(folio))
- return ret - compound * nr;
- if (folio_test_double_map(folio))
- ret -= nr;
- return ret;
-}
-
/**
* folio_copy - Copy the contents of one folio to another.
* @dst: Folio to copy to.
@@ -1052,6 +945,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
error:
+ pr_warn_ratelimited("%s: pid: %d, comm: %s, no enough memory for the allocation\n",
+ __func__, current->pid, current->comm);
vm_unacct_memory(pages);
return -ENOMEM;
diff --git a/mm/vmacache.c b/mm/vmacache.c
deleted file mode 100644
index 01a6e6688ec1..000000000000
--- a/mm/vmacache.c
+++ /dev/null
@@ -1,117 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2014 Davidlohr Bueso.
- */
-#include <linux/sched/signal.h>
-#include <linux/sched/task.h>
-#include <linux/mm.h>
-#include <linux/vmacache.h>
-
-/*
- * Hash based on the pmd of addr if configured with MMU, which provides a good
- * hit rate for workloads with spatial locality. Otherwise, use pages.
- */
-#ifdef CONFIG_MMU
-#define VMACACHE_SHIFT PMD_SHIFT
-#else
-#define VMACACHE_SHIFT PAGE_SHIFT
-#endif
-#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK)
-
-/*
- * This task may be accessing a foreign mm via (for example)
- * get_user_pages()->find_vma(). The vmacache is task-local and this
- * task's vmacache pertains to a different mm (ie, its own). There is
- * nothing we can do here.
- *
- * Also handle the case where a kernel thread has adopted this mm via
- * kthread_use_mm(). That kernel thread's vmacache is not applicable to this mm.
- */
-static inline bool vmacache_valid_mm(struct mm_struct *mm)
-{
- return current->mm == mm && !(current->flags & PF_KTHREAD);
-}
-
-void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
-{
- if (vmacache_valid_mm(newvma->vm_mm))
- current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma;
-}
-
-static bool vmacache_valid(struct mm_struct *mm)
-{
- struct task_struct *curr;
-
- if (!vmacache_valid_mm(mm))
- return false;
-
- curr = current;
- if (mm->vmacache_seqnum != curr->vmacache.seqnum) {
- /*
- * First attempt will always be invalid, initialize
- * the new cache for this task here.
- */
- curr->vmacache.seqnum = mm->vmacache_seqnum;
- vmacache_flush(curr);
- return false;
- }
- return true;
-}
-
-struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
-{
- int idx = VMACACHE_HASH(addr);
- int i;
-
- count_vm_vmacache_event(VMACACHE_FIND_CALLS);
-
- if (!vmacache_valid(mm))
- return NULL;
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache.vmas[idx];
-
- if (vma) {
-#ifdef CONFIG_DEBUG_VM_VMACACHE
- if (WARN_ON_ONCE(vma->vm_mm != mm))
- break;
-#endif
- if (vma->vm_start <= addr && vma->vm_end > addr) {
- count_vm_vmacache_event(VMACACHE_FIND_HITS);
- return vma;
- }
- }
- if (++idx == VMACACHE_SIZE)
- idx = 0;
- }
-
- return NULL;
-}
-
-#ifndef CONFIG_MMU
-struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
-{
- int idx = VMACACHE_HASH(start);
- int i;
-
- count_vm_vmacache_event(VMACACHE_FIND_CALLS);
-
- if (!vmacache_valid(mm))
- return NULL;
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache.vmas[idx];
-
- if (vma && vma->vm_start == start && vma->vm_end == end) {
- count_vm_vmacache_event(VMACACHE_FIND_HITS);
- return vma;
- }
- if (++idx == VMACACHE_SIZE)
- idx = 0;
- }
-
- return NULL;
-}
-#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index dd6cdb201195..ca71de7c9d77 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -43,6 +43,9 @@
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/vmalloc.h>
+
#include "internal.h"
#include "pgalloc-track.h"
@@ -320,6 +323,9 @@ int ioremap_page_range(unsigned long addr, unsigned long end,
err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
ioremap_max_page_shift);
flush_cache_vmap(addr, end);
+ if (!err)
+ kmsan_ioremap_page_range(addr, end, phys_addr, prot,
+ ioremap_max_page_shift);
return err;
}
@@ -416,7 +422,7 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
*
* This is an internal function only. Do not use outside mm/.
*/
-void vunmap_range_noflush(unsigned long start, unsigned long end)
+void __vunmap_range_noflush(unsigned long start, unsigned long end)
{
unsigned long next;
pgd_t *pgd;
@@ -438,6 +444,12 @@ void vunmap_range_noflush(unsigned long start, unsigned long end)
arch_sync_kernel_mappings(start, end);
}
+void vunmap_range_noflush(unsigned long start, unsigned long end)
+{
+ kmsan_vunmap_range_noflush(start, end);
+ __vunmap_range_noflush(start, end);
+}
+
/**
* vunmap_range - unmap kernel virtual addresses
* @addr: start of the VM area to unmap
@@ -575,7 +587,7 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
*
* This is an internal function only. Do not use outside mm/.
*/
-int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift)
{
unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
@@ -590,7 +602,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
int err;
err = vmap_range_noflush(addr, addr + (1UL << page_shift),
- __pa(page_address(pages[i])), prot,
+ page_to_phys(pages[i]), prot,
page_shift);
if (err)
return err;
@@ -601,6 +613,13 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
return 0;
}
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+ kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+ return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+}
+
/**
* vmap_pages_range - map pages to a kernel virtual address
* @addr: start of the VM area to map
@@ -1300,12 +1319,12 @@ find_vmap_lowest_match(struct rb_root *root, unsigned long size,
#include <linux/random.h>
static struct vmap_area *
-find_vmap_lowest_linear_match(unsigned long size,
+find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
unsigned long align, unsigned long vstart)
{
struct vmap_area *va;
- list_for_each_entry(va, &free_vmap_area_list, list) {
+ list_for_each_entry(va, head, list) {
if (!is_within_this_va(va, size, align, vstart))
continue;
@@ -1316,7 +1335,8 @@ find_vmap_lowest_linear_match(unsigned long size,
}
static void
-find_vmap_lowest_match_check(unsigned long size, unsigned long align)
+find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
+ unsigned long size, unsigned long align)
{
struct vmap_area *va_1, *va_2;
unsigned long vstart;
@@ -1325,8 +1345,8 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align)
get_random_bytes(&rnd, sizeof(rnd));
vstart = VMALLOC_START + rnd;
- va_1 = find_vmap_lowest_match(size, align, vstart, false);
- va_2 = find_vmap_lowest_linear_match(size, align, vstart);
+ va_1 = find_vmap_lowest_match(root, size, align, vstart, false);
+ va_2 = find_vmap_lowest_linear_match(head, size, align, vstart);
if (va_1 != va_2)
pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
@@ -1513,7 +1533,7 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
- find_vmap_lowest_match_check(size, align);
+ find_vmap_lowest_match_check(root, head, size, align);
#endif
return nva_start_addr;
@@ -1603,6 +1623,8 @@ retry:
size, align, vstart, vend);
spin_unlock(&free_vmap_area_lock);
+ trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
+
/*
* If an allocation fails, the "vend" address is
* returned. Therefore trigger the overflow path.
@@ -1708,6 +1730,7 @@ static void purge_fragmented_blocks_allcpus(void);
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
unsigned long resched_threshold;
+ unsigned int num_purged_areas = 0;
struct list_head local_purge_list;
struct vmap_area *va, *n_va;
@@ -1719,7 +1742,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
spin_unlock(&purge_vmap_area_lock);
if (unlikely(list_empty(&local_purge_list)))
- return false;
+ goto out;
start = min(start,
list_first_entry(&local_purge_list,
@@ -1754,12 +1777,16 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
va->va_start, va->va_end);
atomic_long_sub(nr, &vmap_lazy_nr);
+ num_purged_areas++;
if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
cond_resched_lock(&free_vmap_area_lock);
}
spin_unlock(&free_vmap_area_lock);
- return true;
+
+out:
+ trace_purge_vmap_area_lazy(start, end, num_purged_areas);
+ return num_purged_areas > 0;
}
/*
@@ -1794,6 +1821,8 @@ static void drain_vmap_area_work(struct work_struct *work)
*/
static void free_vmap_area_noflush(struct vmap_area *va)
{
+ unsigned long nr_lazy_max = lazy_max_pages();
+ unsigned long va_start = va->va_start;
unsigned long nr_lazy;
spin_lock(&vmap_area_lock);
@@ -1811,8 +1840,10 @@ static void free_vmap_area_noflush(struct vmap_area *va)
&purge_vmap_area_root, &purge_vmap_area_list);
spin_unlock(&purge_vmap_area_lock);
+ trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
+
/* After this point, we may free va at any time */
- if (unlikely(nr_lazy > lazy_max_pages()))
+ if (unlikely(nr_lazy > nr_lazy_max))
schedule_work(&drain_vmap_work);
}
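
The local copies added in free_vmap_area_noflush() exist because, once the area is merged into the purge tree and the lock is dropped, the vmap_area may be freed by a concurrent purge at any moment; anything needed afterwards (here, the start address and the lazy threshold for the tracepoint and the wakeup decision) has to be snapshotted first. The same idiom, reduced to a toy sketch with hypothetical names:

#include <linux/list.h>
#include <linux/printk.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct lazy_item {
	struct list_head node;
	unsigned long start;
};

static LIST_HEAD(lazy_list);			/* drained and freed elsewhere */
static DEFINE_SPINLOCK(lazy_lock);
static struct work_struct lazy_drain_work;	/* assumed initialised elsewhere */

static void queue_lazy(struct lazy_item *item, unsigned long nr_lazy,
		       unsigned long threshold)
{
	/* snapshot what we still need *before* publishing the item */
	unsigned long start = item->start;

	spin_lock(&lazy_lock);
	list_add_tail(&item->node, &lazy_list);
	spin_unlock(&lazy_lock);

	/* from here on, 'item' may already have been freed by the drainer */
	pr_debug("queued lazy area at 0x%lx (%lu lazy, max %lu)\n",
		 start, nr_lazy, threshold);

	if (nr_lazy > threshold)
		schedule_work(&lazy_drain_work);
}
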
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b2b1431352dc..bd6637fcd8f9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -43,12 +43,18 @@
#include <linux/migrate.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
+#include <linux/memory-tiers.h>
#include <linux/oom.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>
+#include <linux/pagewalk.h>
+#include <linux/shmem_fs.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/khugepaged.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -85,7 +91,7 @@ struct scan_control {
unsigned long anon_cost;
unsigned long file_cost;
- /* Can active pages be deactivated as part of reclaim? */
+ /* Can active folios be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
unsigned int may_deactivate:2;
@@ -95,10 +101,10 @@ struct scan_control {
/* Writepage batching in laptop mode; RECLAIM_WRITE */
unsigned int may_writepage:1;
- /* Can mapped pages be reclaimed? */
+ /* Can mapped folios be reclaimed? */
unsigned int may_unmap:1;
- /* Can pages be swapped as part of reclaim? */
+ /* Can folios be swapped as part of reclaim? */
unsigned int may_swap:1;
/* Proactive reclaim invoked by userspace through memory.reclaim */
@@ -123,19 +129,25 @@ struct scan_control {
/* There is easily reclaimable cold cache in the current node */
unsigned int cache_trim_mode:1;
- /* The file pages on the current node are dangerously low */
+ /* The file folios on the current node are dangerously low */
unsigned int file_is_tiny:1;
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;
+#ifdef CONFIG_LRU_GEN
+ /* help kswapd make better choices among multiple memcgs */
+ unsigned int memcgs_need_aging:1;
+ unsigned long last_reclaimed;
+#endif
+
/* Allocation order */
s8 order;
/* Scan (total_size >> priority) pages at once */
s8 priority;
- /* The highest zone to isolate pages for reclaim from */
+ /* The highest zone to isolate folios for reclaim from */
s8 reclaim_idx;
/* This context's GFP mask */
@@ -443,7 +455,7 @@ static bool cgroup_reclaim(struct scan_control *sc)
*
* The normal page dirty throttling mechanism in balance_dirty_pages() is
* completely broken with the legacy memcg and direct stalling in
- * shrink_page_list() is used for throttling instead, which lacks all the
+ * shrink_folio_list() is used for throttling instead, which lacks all the
* niceties such as fairness, adaptive pausing, bandwidth proportional
* allocation and configurability.
*
@@ -564,9 +576,9 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
}
/*
- * This misses isolated pages which are not accounted for to save counters.
+ * This misses isolated folios which are not accounted for to save counters.
* As the data only determines if reclaim or compaction continues, it is
- * not expected that isolated pages will be a dominating factor.
+ * not expected that isolated folios will be a dominating factor.
*/
unsigned long zone_reclaimable_pages(struct zone *zone)
{
@@ -1009,39 +1021,60 @@ out:
return freed;
}
-static void drop_slab_node(int nid)
+static unsigned long drop_slab_node(int nid)
{
- unsigned long freed;
- int shift = 0;
+ unsigned long freed = 0;
+ struct mem_cgroup *memcg = NULL;
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
- struct mem_cgroup *memcg = NULL;
+ freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
- if (fatal_signal_pending(current))
- return;
+ return freed;
+}
+void drop_slab(void)
+{
+ int nid;
+ int shift = 0;
+ unsigned long freed;
+
+ do {
freed = 0;
- memcg = mem_cgroup_iter(NULL, NULL, NULL);
- do {
- freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
- } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+ for_each_online_node(nid) {
+ if (fatal_signal_pending(current))
+ return;
+
+ freed += drop_slab_node(nid);
+ }
} while ((freed >> shift++) > 1);
}
-void drop_slab(void)
+static int reclaimer_offset(void)
{
- int nid;
+ BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+ PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
+ BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+ PGSCAN_DIRECT - PGSCAN_KSWAPD);
+ BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
+ PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
+ BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
+ PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
- for_each_online_node(nid)
- drop_slab_node(nid);
+ if (current_is_kswapd())
+ return 0;
+ if (current_is_khugepaged())
+ return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
+ return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}
static inline int is_page_cache_freeable(struct folio *folio)
{
/*
- * A freeable page cache page is referenced only by the caller
- * that isolated the page, the page cache and optional buffer
- * heads at page->private.
+ * A freeable page cache folio is referenced only by the caller
+ * that isolated the folio, the page cache and optional filesystem
+ * private data at folio->private.
*/
return folio_ref_count(folio) - folio_test_private(folio) ==
1 + folio_nr_pages(folio);
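
reclaimer_offset() relies on the *_KSWAPD, *_DIRECT and *_KHUGEPAGED events keeping the same relative distances in the vm_event_item enum, so one offset can be added to any of the PGSTEAL/PGSCAN/PGDEMOTE base counters; the BUILD_BUG_ON()s turn that layout assumption into a compile-time failure if the enum is ever reordered. The same trick in a reduced, hypothetical form:

#include <linux/build_bug.h>
#include <linux/types.h>

/* Hypothetical counters laid out like the kswapd/direct/khugepaged events. */
enum demo_event {
	DEMO_STEAL_KSWAPD, DEMO_STEAL_DIRECT, DEMO_STEAL_KHUGEPAGED,
	DEMO_SCAN_KSWAPD, DEMO_SCAN_DIRECT, DEMO_SCAN_KHUGEPAGED,
	NR_DEMO_EVENTS,
};

static unsigned long demo_counters[NR_DEMO_EVENTS];

static int demo_reclaimer_offset(bool is_kswapd, bool is_khugepaged)
{
	/* both groups must keep the same spacing for one shared offset */
	BUILD_BUG_ON(DEMO_STEAL_DIRECT - DEMO_STEAL_KSWAPD !=
		     DEMO_SCAN_DIRECT - DEMO_SCAN_KSWAPD);
	BUILD_BUG_ON(DEMO_STEAL_KHUGEPAGED - DEMO_STEAL_KSWAPD !=
		     DEMO_SCAN_KHUGEPAGED - DEMO_SCAN_KSWAPD);

	if (is_kswapd)
		return 0;
	if (is_khugepaged)
		return DEMO_STEAL_KHUGEPAGED - DEMO_STEAL_KSWAPD;
	return DEMO_STEAL_DIRECT - DEMO_STEAL_KSWAPD;
}

So demo_counters[DEMO_SCAN_KSWAPD + demo_reclaimer_offset(...)] picks the right variant of either counter group through one shared helper, which is what the PGSTEAL_KSWAPD/PGSCAN_KSWAPD call sites below do.
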
@@ -1081,8 +1114,8 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
return true;
/*
- * If there are a lot of dirty/writeback pages then do not
- * throttle as throttling will occur when the pages cycle
+ * If there are a lot of dirty/writeback folios then do not
+ * throttle as throttling will occur when the folios cycle
* towards the end of the LRU if still under writeback.
*/
for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -1125,7 +1158,7 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
* short. Failing to make progress or waiting on writeback are
* potentially long-lived events so use a longer timeout. This is shaky
* logic as a failure to make progress could be due to anything from
- * writeback to a slow device to excessive references pages at the tail
+ * writeback to a slow device to excessive referenced folios at the tail
* of the inactive LRU.
*/
switch(reason) {
@@ -1171,8 +1204,8 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
}
/*
- * Account for pages written if tasks are throttled waiting on dirty
- * pages to clean. If enough pages have been cleaned since throttling
+ * Account for folios written if tasks are throttled waiting on dirty
+ * folios to clean. If enough folios have been cleaned since throttling
* started then wakeup the throttled tasks.
*/
void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
@@ -1198,18 +1231,18 @@ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
/* possible outcome of pageout() */
typedef enum {
- /* failed to write page out, page is locked */
+ /* failed to write folio out, folio is locked */
PAGE_KEEP,
- /* move page to the active list, page is locked */
+ /* move folio to the active list, folio is locked */
PAGE_ACTIVATE,
- /* page has been sent to the disk successfully, page is unlocked */
+ /* folio has been sent to the disk successfully, folio is unlocked */
PAGE_SUCCESS,
- /* page is clean and locked */
+ /* folio is clean and locked */
PAGE_CLEAN,
} pageout_t;
/*
- * pageout is called by shrink_page_list() for each dirty page.
+ * pageout is called by shrink_folio_list() for each dirty folio.
* Calls ->writepage().
*/
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
@@ -1283,7 +1316,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
/*
- * Same as remove_mapping, but if the page is removed from the mapping, it
+ * Same as remove_mapping, but if the folio is removed from the mapping, it
* gets returned with a refcount of 0.
*/
static int __remove_mapping(struct address_space *mapping, struct folio *folio,
@@ -1299,34 +1332,34 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
/*
- * The non racy check for a busy page.
+ * The non racy check for a busy folio.
*
* Must be careful with the order of the tests. When someone has
- * a ref to the page, it may be possible that they dirty it then
- * drop the reference. So if PageDirty is tested before page_count
- * here, then the following race may occur:
+ * a ref to the folio, it may be possible that they dirty it then
+ * drop the reference. So if the dirty flag is tested before the
+ * refcount here, then the following race may occur:
*
* get_user_pages(&page);
* [user mapping goes away]
* write_to(page);
- * !PageDirty(page) [good]
- * SetPageDirty(page);
- * put_page(page);
- * !page_count(page) [good, discard it]
+ * !folio_test_dirty(folio) [good]
+ * folio_set_dirty(folio);
+ * folio_put(folio);
+ * !refcount(folio) [good, discard it]
*
* [oops, our write_to data is lost]
*
* Reversing the order of the tests ensures such a situation cannot
- * escape unnoticed. The smp_rmb is needed to ensure the page->flags
- * load is not satisfied before that of page->_refcount.
+ * escape unnoticed. The smp_rmb is needed to ensure the folio->flags
+ * load is not satisfied before that of folio->_refcount.
*
- * Note that if SetPageDirty is always performed via set_page_dirty,
+ * Note that if the dirty flag is always set via folio_mark_dirty,
* and thus under the i_pages lock, then this ordering is not required.
*/
refcount = 1 + folio_nr_pages(folio);
if (!folio_ref_freeze(folio, refcount))
goto cannot_free;
- /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
+ /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */
if (unlikely(folio_test_dirty(folio))) {
folio_ref_unfreeze(folio, refcount);
goto cannot_free;
@@ -1334,12 +1367,13 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (folio_test_swapcache(folio)) {
swp_entry_t swap = folio_swap_entry(folio);
- mem_cgroup_swapout(folio, swap);
+
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
__delete_from_swap_cache(folio, swap, shadow);
+ mem_cgroup_swapout(folio, swap);
xa_unlock_irq(&mapping->i_pages);
- put_swap_page(&folio->page, swap);
+ put_swap_folio(folio, swap);
} else {
void (*free_folio)(struct folio *);
@@ -1355,7 +1389,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
* back.
*
* We also don't store shadows for DAX mappings because the
- * only page cache pages found in these are zero pages
+ * only page cache folios found in these are zero pages
* covering holes, and because we don't want to mix DAX
* exceptional entries and shadow exceptional entries in the
* same address_space.
@@ -1423,14 +1457,14 @@ void folio_putback_lru(struct folio *folio)
folio_put(folio); /* drop ref from isolate */
}
-enum page_references {
- PAGEREF_RECLAIM,
- PAGEREF_RECLAIM_CLEAN,
- PAGEREF_KEEP,
- PAGEREF_ACTIVATE,
+enum folio_references {
+ FOLIOREF_RECLAIM,
+ FOLIOREF_RECLAIM_CLEAN,
+ FOLIOREF_KEEP,
+ FOLIOREF_ACTIVATE,
};
-static enum page_references folio_check_references(struct folio *folio,
+static enum folio_references folio_check_references(struct folio *folio,
struct scan_control *sc)
{
int referenced_ptes, referenced_folio;
@@ -1445,11 +1479,11 @@ static enum page_references folio_check_references(struct folio *folio,
* Let the folio, now marked Mlocked, be moved to the unevictable list.
*/
if (vm_flags & VM_LOCKED)
- return PAGEREF_ACTIVATE;
+ return FOLIOREF_ACTIVATE;
/* rmap lock contention: rotate */
if (referenced_ptes == -1)
- return PAGEREF_KEEP;
+ return FOLIOREF_KEEP;
if (referenced_ptes) {
/*
@@ -1469,34 +1503,34 @@ static enum page_references folio_check_references(struct folio *folio,
folio_set_referenced(folio);
if (referenced_folio || referenced_ptes > 1)
- return PAGEREF_ACTIVATE;
+ return FOLIOREF_ACTIVATE;
/*
* Activate file-backed executable folios after first usage.
*/
if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio))
- return PAGEREF_ACTIVATE;
+ return FOLIOREF_ACTIVATE;
- return PAGEREF_KEEP;
+ return FOLIOREF_KEEP;
}
/* Reclaim if clean, defer dirty folios to writeback */
if (referenced_folio && folio_is_file_lru(folio))
- return PAGEREF_RECLAIM_CLEAN;
+ return FOLIOREF_RECLAIM_CLEAN;
- return PAGEREF_RECLAIM;
+ return FOLIOREF_RECLAIM;
}
-/* Check if a page is dirty or under writeback */
+/* Check if a folio is dirty or under writeback */
static void folio_check_dirty_writeback(struct folio *folio,
bool *dirty, bool *writeback)
{
struct address_space *mapping;
/*
- * Anonymous pages are not handled by flushers and must be written
+ * Anonymous folios are not handled by flushers and must be written
* from reclaim context. Do not stall reclaim based on them.
- * MADV_FREE anonymous pages are put into inactive file list too.
+ * MADV_FREE anonymous folios are put into inactive file list too.
* They could be mistakenly treated as file lru. So further anon
* test is needed.
*/
@@ -1520,49 +1554,73 @@ static void folio_check_dirty_writeback(struct folio *folio,
mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
}
-static struct page *alloc_demote_page(struct page *page, unsigned long node)
+static struct page *alloc_demote_page(struct page *page, unsigned long private)
{
- struct migration_target_control mtc = {
- /*
- * Allocate from 'node', or fail quickly and quietly.
- * When this happens, 'page' will likely just be discarded
- * instead of migrated.
- */
- .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
- __GFP_THISNODE | __GFP_NOWARN |
- __GFP_NOMEMALLOC | GFP_NOWAIT,
- .nid = node
- };
+ struct page *target_page;
+ nodemask_t *allowed_mask;
+ struct migration_target_control *mtc;
- return alloc_migration_target(page, (unsigned long)&mtc);
+ mtc = (struct migration_target_control *)private;
+
+ allowed_mask = mtc->nmask;
+ /*
+ * Make sure we allocate from the target node first, also trying to
+ * demote or reclaim pages from the target node via kswapd if we are
+ * low on free memory on the target node. If we don't do this and we
+ * have free memory on the slower (lower) memtier, we would start
+ * allocating pages from slower (lower) memory tiers without even forcing
+ * a demotion of cold pages from the target memtier. This can result
+ * in the kernel placing hot pages in slower (lower) memory tiers.
+ */
+ mtc->nmask = NULL;
+ mtc->gfp_mask |= __GFP_THISNODE;
+ target_page = alloc_migration_target(page, (unsigned long)mtc);
+ if (target_page)
+ return target_page;
+
+ mtc->gfp_mask &= ~__GFP_THISNODE;
+ mtc->nmask = allowed_mask;
+
+ return alloc_migration_target(page, (unsigned long)mtc);
}
/*
- * Take pages on @demote_list and attempt to demote them to
- * another node. Pages which are not demoted are left on
- * @demote_pages.
+ * Take folios on @demote_folios and attempt to demote them to another node.
+ * Folios which are not demoted are left on @demote_folios.
*/
-static unsigned int demote_page_list(struct list_head *demote_pages,
+static unsigned int demote_folio_list(struct list_head *demote_folios,
struct pglist_data *pgdat)
{
int target_nid = next_demotion_node(pgdat->node_id);
unsigned int nr_succeeded;
+ nodemask_t allowed_mask;
- if (list_empty(demote_pages))
+ struct migration_target_control mtc = {
+ /*
+ * Allocate from the target node, or fail quickly and quietly.
+ * When this happens, 'page' will likely just be discarded
+ * instead of migrated.
+ */
+ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
+ __GFP_NOMEMALLOC | GFP_NOWAIT,
+ .nid = target_nid,
+ .nmask = &allowed_mask
+ };
+
+ if (list_empty(demote_folios))
return 0;
if (target_nid == NUMA_NO_NODE)
return 0;
+ node_get_allowed_targets(pgdat, &allowed_mask);
+
/* Demotion ignores all cpuset and mempolicy settings */
- migrate_pages(demote_pages, alloc_demote_page, NULL,
- target_nid, MIGRATE_ASYNC, MR_DEMOTION,
- &nr_succeeded);
+ migrate_pages(demote_folios, alloc_demote_page, NULL,
+ (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
+ &nr_succeeded);
- if (current_is_kswapd())
- __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
- else
- __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+ __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
return nr_succeeded;
}
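
alloc_demote_page() now makes two attempts: first a __GFP_THISNODE allocation restricted to the designated demotion node (with the nodemask cleared), and only if that fails does it retry with the original gfp mask and the full set of allowed lower-tier nodes supplied by node_get_allowed_targets(). A condensed, hypothetical sketch of that two-pass shape:

#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/nodemask.h>
#include "internal.h"	/* struct migration_target_control (mm-internal) */

/* Hypothetical condensation of the two-pass target allocation above. */
static struct page *pick_demotion_target(struct page *page,
					 struct migration_target_control *mtc)
{
	nodemask_t *fallback_mask = mtc->nmask;
	struct page *target;

	/* Pass 1: only the preferred demotion node; may kick kswapd there. */
	mtc->nmask = NULL;
	mtc->gfp_mask |= __GFP_THISNODE;
	target = alloc_migration_target(page, (unsigned long)mtc);
	if (target)
		return target;

	/* Pass 2: any allowed lower-tier node from the precomputed mask. */
	mtc->gfp_mask &= ~__GFP_THISNODE;
	mtc->nmask = fallback_mask;
	return alloc_migration_target(page, (unsigned long)mtc);
}
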
@@ -1584,17 +1642,15 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
}
/*
- * shrink_page_list() returns the number of reclaimed pages
+ * shrink_folio_list() returns the number of reclaimed pages
*/
-static unsigned int shrink_page_list(struct list_head *page_list,
- struct pglist_data *pgdat,
- struct scan_control *sc,
- struct reclaim_stat *stat,
- bool ignore_references)
-{
- LIST_HEAD(ret_pages);
- LIST_HEAD(free_pages);
- LIST_HEAD(demote_pages);
+static unsigned int shrink_folio_list(struct list_head *folio_list,
+ struct pglist_data *pgdat, struct scan_control *sc,
+ struct reclaim_stat *stat, bool ignore_references)
+{
+ LIST_HEAD(ret_folios);
+ LIST_HEAD(free_folios);
+ LIST_HEAD(demote_folios);
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
@@ -1605,16 +1661,16 @@ static unsigned int shrink_page_list(struct list_head *page_list,
do_demote_pass = can_demote(pgdat->node_id, sc);
retry:
- while (!list_empty(page_list)) {
+ while (!list_empty(folio_list)) {
struct address_space *mapping;
struct folio *folio;
- enum page_references references = PAGEREF_RECLAIM;
+ enum folio_references references = FOLIOREF_RECLAIM;
bool dirty, writeback;
unsigned int nr_pages;
cond_resched();
- folio = lru_to_folio(page_list);
+ folio = lru_to_folio(folio_list);
list_del(&folio->lru);
if (!folio_trylock(folio))
@@ -1633,6 +1689,11 @@ retry:
if (!sc->may_unmap && folio_mapped(folio))
goto keep_locked;
+ /* folio_update_gen() tried to promote this page? */
+ if (lru_gen_enabled() && !ignore_references &&
+ folio_mapped(folio) && folio_test_referenced(folio))
+ goto keep_locked;
+
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested. kswapd will stall and start writing
@@ -1733,7 +1794,7 @@ retry:
folio_unlock(folio);
folio_wait_writeback(folio);
/* then go back and try same folio again */
- list_add_tail(&folio->lru, page_list);
+ list_add_tail(&folio->lru, folio_list);
continue;
}
}
@@ -1742,13 +1803,13 @@ retry:
references = folio_check_references(folio, sc);
switch (references) {
- case PAGEREF_ACTIVATE:
+ case FOLIOREF_ACTIVATE:
goto activate_locked;
- case PAGEREF_KEEP:
+ case FOLIOREF_KEEP:
stat->nr_ref_keep += nr_pages;
goto keep_locked;
- case PAGEREF_RECLAIM:
- case PAGEREF_RECLAIM_CLEAN:
+ case FOLIOREF_RECLAIM:
+ case FOLIOREF_RECLAIM_CLEAN:
; /* try to reclaim the folio below */
}
@@ -1758,7 +1819,7 @@ retry:
*/
if (do_demote_pass &&
(thp_migration_supported() || !folio_test_large(folio))) {
- list_add(&folio->lru, &demote_pages);
+ list_add(&folio->lru, &demote_folios);
folio_unlock(folio);
continue;
}
@@ -1785,7 +1846,7 @@ retry:
*/
if (!folio_entire_mapcount(folio) &&
split_folio_to_list(folio,
- page_list))
+ folio_list))
goto activate_locked;
}
if (!add_to_swap(folio)) {
@@ -1793,7 +1854,7 @@ retry:
goto activate_locked_split;
/* Fallback to swap normal pages */
if (split_folio_to_list(folio,
- page_list))
+ folio_list))
goto activate_locked;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
count_vm_event(THP_SWPOUT_FALLBACK);
@@ -1805,7 +1866,7 @@ retry:
} else if (folio_test_swapbacked(folio) &&
folio_test_large(folio)) {
/* Split shmem folio */
- if (split_folio_to_list(folio, page_list))
+ if (split_folio_to_list(folio, folio_list))
goto keep_locked;
}
@@ -1870,7 +1931,7 @@ retry:
goto activate_locked;
}
- if (references == PAGEREF_RECLAIM_CLEAN)
+ if (references == FOLIOREF_RECLAIM_CLEAN)
goto keep_locked;
if (!may_enter_fs(folio, sc->gfp_mask))
goto keep_locked;
@@ -1983,13 +2044,13 @@ free_it:
nr_reclaimed += nr_pages;
/*
- * Is there need to periodically free_page_list? It would
+ * Is there need to periodically free_folio_list? It would
* appear not as the counts should be low
*/
if (unlikely(folio_test_large(folio)))
destroy_large_folio(folio);
else
- list_add(&folio->lru, &free_pages);
+ list_add(&folio->lru, &free_folios);
continue;
activate_locked_split:
@@ -2004,9 +2065,8 @@ activate_locked_split:
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
if (folio_test_swapcache(folio) &&
- (mem_cgroup_swap_full(&folio->page) ||
- folio_test_mlocked(folio)))
- try_to_free_swap(&folio->page);
+ (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio)))
+ folio_free_swap(folio);
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
if (!folio_test_mlocked(folio)) {
int type = folio_is_file_lru(folio);
@@ -2017,29 +2077,48 @@ activate_locked:
keep_locked:
folio_unlock(folio);
keep:
- list_add(&folio->lru, &ret_pages);
+ list_add(&folio->lru, &ret_folios);
VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
folio_test_unevictable(folio), folio);
}
- /* 'page_list' is always empty here */
+ /* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
- nr_reclaimed += demote_page_list(&demote_pages, pgdat);
- /* Folios that could not be demoted are still in @demote_pages */
- if (!list_empty(&demote_pages)) {
- /* Folios which weren't demoted go back on @page_list for retry: */
- list_splice_init(&demote_pages, page_list);
- do_demote_pass = false;
- goto retry;
+ nr_reclaimed += demote_folio_list(&demote_folios, pgdat);
+ /* Folios that could not be demoted are still in @demote_folios */
+ if (!list_empty(&demote_folios)) {
+ /* Folios which weren't demoted go back on @folio_list */
+ list_splice_init(&demote_folios, folio_list);
+
+ /*
+ * goto retry to reclaim the undemoted folios in folio_list if
+ * desired.
+ *
+ * Reclaiming directly from top tier nodes is not often desired
+ * because it breaks the LRU ordering: in general, memory
+ * should be reclaimed from lower tier nodes and demoted from
+ * top tier nodes.
+ *
+ * However, disabling reclaim from top tier nodes entirely
+ * would cause OOMs in edge scenarios where lower tier memory
+ * is unreclaimable for whatever reason, e.g. memory being
+ * mlocked or too hot to reclaim. We can disable reclaim
+ * from top tier nodes in proactive reclaim, though, as that is
+ * not real memory pressure.
+ */
+ if (!sc->proactive) {
+ do_demote_pass = false;
+ goto retry;
+ }
}
pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
- mem_cgroup_uncharge_list(&free_pages);
+ mem_cgroup_uncharge_list(&free_folios);
try_to_unmap_flush();
- free_unref_page_list(&free_pages);
+ free_unref_page_list(&free_folios);
- list_splice(&ret_pages, page_list);
+ list_splice(&ret_folios, folio_list);
count_vm_events(PGACTIVATE, pgactivate);
if (plug)
@@ -2048,7 +2127,7 @@ keep:
}
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
- struct list_head *folio_list)
+ struct list_head *folio_list)
{
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
@@ -2076,7 +2155,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
* change in the future.
*/
noreclaim_flag = memalloc_noreclaim_save();
- nr_reclaimed = shrink_page_list(&clean_folios, zone->zone_pgdat, &sc,
+ nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
&stat, true);
memalloc_noreclaim_restore(noreclaim_flag);
@@ -2135,7 +2214,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
*
* returns how many pages were moved onto *@dst.
*/
-static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
+static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
struct lruvec *lruvec, struct list_head *dst,
unsigned long *nr_scanned, struct scan_control *sc,
enum lru_list lru)
@@ -2242,8 +2321,8 @@ move:
*
* Context:
*
- * (1) Must be called with an elevated refcount on the page. This is a
- * fundamental difference from isolate_lru_pages() (which is called
+ * (1) Must be called with an elevated refcount on the folio. This is a
+ * fundamental difference from isolate_lru_folios() (which is called
* without a stable reference).
* (2) The lru_lock must not be held.
* (3) Interrupts must be enabled.
@@ -2315,13 +2394,13 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
}
/*
- * move_pages_to_lru() moves folios from private @list to appropriate LRU list.
+ * move_folios_to_lru() moves folios from private @list to appropriate LRU list.
* On return, @list is reused as a list of folios to be freed by the caller.
*
* Returns the number of pages moved to the given lruvec.
*/
-static unsigned int move_pages_to_lru(struct lruvec *lruvec,
- struct list_head *list)
+static unsigned int move_folios_to_lru(struct lruvec *lruvec,
+ struct list_head *list)
{
int nr_pages, nr_moved = 0;
LIST_HEAD(folios_to_free);
@@ -2341,7 +2420,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec,
/*
* The folio_set_lru needs to be kept here for list integrity.
* Otherwise:
- * #0 move_pages_to_lru #1 release_pages
+ * #0 move_folios_to_lru #1 release_pages
* if (!folio_put_testzero())
* if (folio_put_testzero())
* !lru //skip lru_lock
@@ -2398,11 +2477,11 @@ static int current_may_throttle(void)
* shrink_inactive_list() is a helper for shrink_node(). It returns the number
* of reclaimed pages
*/
-static unsigned long
-shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
- struct scan_control *sc, enum lru_list lru)
+static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
+ struct lruvec *lruvec, struct scan_control *sc,
+ enum lru_list lru)
{
- LIST_HEAD(page_list);
+ LIST_HEAD(folio_list);
unsigned long nr_scanned;
unsigned int nr_reclaimed = 0;
unsigned long nr_taken;
@@ -2429,11 +2508,11 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_lock_irq(&lruvec->lru_lock);
- nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
+ nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+ item = PGSCAN_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
@@ -2444,36 +2523,48 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);
+ nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
spin_lock_irq(&lruvec->lru_lock);
- move_pages_to_lru(lruvec, &page_list);
+ move_folios_to_lru(lruvec, &folio_list);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+ item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
- lru_note_cost(lruvec, file, stat.nr_pageout);
- mem_cgroup_uncharge_list(&page_list);
- free_unref_page_list(&page_list);
+ lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
+ mem_cgroup_uncharge_list(&folio_list);
+ free_unref_page_list(&folio_list);
/*
- * If dirty pages are scanned that are not queued for IO, it
+ * If dirty folios are scanned that are not queued for IO, it
* implies that flushers are not doing their job. This can
- * happen when memory pressure pushes dirty pages to the end of
+ * happen when memory pressure pushes dirty folios to the end of
* the LRU before the dirty limits are breached and the dirty
* data has expired. It can also happen when the proportion of
- * dirty pages grows not through writes but through memory
+ * dirty folios grows not through writes but through memory
* pressure reclaiming all the clean cache. And in some cases,
* the flushers simply cannot keep up with the allocation
* rate. Nudge the flusher threads in case they are asleep.
*/
- if (stat.nr_unqueued_dirty == nr_taken)
+ if (stat.nr_unqueued_dirty == nr_taken) {
wakeup_flusher_threads(WB_REASON_VMSCAN);
+ /*
+ * For cgroupv1, dirty throttling is achieved by waking up
+ * the kernel flusher here and later waiting on folios
+ * which are in writeback to finish (see shrink_folio_list()).
+ *
+ * The flusher may not be able to issue writeback quickly
+ * enough for cgroupv1 writeback throttling to work
+ * on a large system.
+ */
+ if (!writeback_throttling_sane(sc))
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
+ }
sc->nr.dirty += stat.nr_dirty;
sc->nr.congested += stat.nr_congested;
@@ -2526,7 +2617,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
spin_lock_irq(&lruvec->lru_lock);
- nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
+ nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
@@ -2550,8 +2641,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
if (unlikely(buffer_heads_over_limit)) {
- if (folio_get_private(folio) && folio_trylock(folio)) {
- if (folio_get_private(folio))
+ if (folio_test_private(folio) && folio_trylock(folio)) {
+ if (folio_test_private(folio))
filemap_release_folio(folio, 0);
folio_unlock(folio);
}
@@ -2586,8 +2677,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
*/
spin_lock_irq(&lruvec->lru_lock);
- nr_activate = move_pages_to_lru(lruvec, &l_active);
- nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
+ nr_activate = move_folios_to_lru(lruvec, &l_active);
+ nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
/* Keep all free folios in l_active list */
list_splice(&l_inactive, &l_active);
@@ -2597,13 +2688,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&lruvec->lru_lock);
+ if (nr_rotated)
+ lru_note_cost(lruvec, file, 0, nr_rotated);
mem_cgroup_uncharge_list(&l_active);
free_unref_page_list(&l_active);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
}
-static unsigned int reclaim_page_list(struct list_head *page_list,
+static unsigned int reclaim_folio_list(struct list_head *folio_list,
struct pglist_data *pgdat)
{
struct reclaim_stat dummy_stat;
@@ -2617,9 +2710,9 @@ static unsigned int reclaim_page_list(struct list_head *page_list,
.no_demotion = 1,
};
- nr_reclaimed = shrink_page_list(page_list, pgdat, &sc, &dummy_stat, false);
- while (!list_empty(page_list)) {
- folio = lru_to_folio(page_list);
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
+ while (!list_empty(folio_list)) {
+ folio = lru_to_folio(folio_list);
list_del(&folio->lru);
folio_putback_lru(folio);
}
@@ -2649,11 +2742,11 @@ unsigned long reclaim_pages(struct list_head *folio_list)
continue;
}
- nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid));
+ nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
nid = folio_nid(lru_to_folio(folio_list));
} while (!list_empty(folio_list));
- nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid));
+ nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
memalloc_noreclaim_restore(noreclaim_flag);
@@ -2683,13 +2776,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
* but large enough to avoid thrashing the aggregate readahead window.
*
* Both inactive lists should also be large enough that each inactive
- * page has a chance to be referenced again before it is reclaimed.
+ * folio has a chance to be referenced again before it is reclaimed.
*
* If that fails and refaulting is observed, the inactive list grows.
*
- * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
+ * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios
* on this LRU, maintained by the pageout code. An inactive_ratio
- * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
+ * of 3 means 3:1 or 25% of the folios are kept on the inactive list.
*
* total target max
* memory ratio inactive
@@ -2728,12 +2821,118 @@ enum scan_balance {
SCAN_FILE,
};
+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
+{
+ unsigned long file;
+ struct lruvec *target_lruvec;
+
+ if (lru_gen_enabled())
+ return;
+
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+ /*
+ * Flush the memory cgroup stats, so that we read accurate per-memcg
+ * lruvec stats for heuristics.
+ */
+ mem_cgroup_flush_stats();
+
+ /*
+ * Determine the scan balance between anon and file LRUs.
+ */
+ spin_lock_irq(&target_lruvec->lru_lock);
+ sc->anon_cost = target_lruvec->anon_cost;
+ sc->file_cost = target_lruvec->file_cost;
+ spin_unlock_irq(&target_lruvec->lru_lock);
+
+ /*
+ * Target desirable inactive:active list ratios for the anon
+ * and file LRU lists.
+ */
+ if (!sc->force_deactivate) {
+ unsigned long refaults;
+
+ /*
+ * When refaults are being observed, it means a new
+ * workingset is being established. Deactivate to get
+ * rid of any stale active pages quickly.
+ */
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_ANON);
+ if (refaults != target_lruvec->refaults[WORKINGSET_ANON] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+ sc->may_deactivate |= DEACTIVATE_ANON;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_ANON;
+
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_FILE);
+ if (refaults != target_lruvec->refaults[WORKINGSET_FILE] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+ sc->may_deactivate |= DEACTIVATE_FILE;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_FILE;
+ } else
+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+
+ /*
+ * If we have plenty of inactive file pages that aren't
+ * thrashing, try to reclaim those first before touching
+ * anonymous pages.
+ */
+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+ sc->cache_trim_mode = 1;
+ else
+ sc->cache_trim_mode = 0;
+
+ /*
+ * Prevent the reclaimer from falling into the cache trap: as
+ * cache pages start out inactive, every cache fault will tip
+ * the scan balance towards the file LRU. And as the file LRU
+ * shrinks, so does the window for rotation from references.
+ * This means we have a runaway feedback loop where a tiny
+ * thrashing file LRU becomes infinitely more attractive than
+ * anon pages. Try to detect this based on file LRU size.
+ */
+ if (!cgroup_reclaim(sc)) {
+ unsigned long total_high_wmark = 0;
+ unsigned long free, anon;
+ int z;
+
+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+ file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_FILE);
+
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = &pgdat->node_zones[z];
+
+ if (!managed_zone(zone))
+ continue;
+
+ total_high_wmark += high_wmark_pages(zone);
+ }
+
+ /*
+ * Consider anon: if that's low too, this isn't a
+ * runaway file reclaim problem, but rather just
+ * extreme pressure. Reclaim as per usual then.
+ */
+ anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ sc->file_is_tiny =
+ file + free <= total_high_wmark &&
+ !(sc->may_deactivate & DEACTIVATE_ANON) &&
+ anon >> sc->priority;
+ }
+}
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned.
*
- * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
- * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
+ * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan
+ * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan
*/
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long *nr)
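
The file_is_tiny test in prepare_scan_count() above fires only when the remaining file LRU plus free memory would not even refill the summed high watermarks, anon is not already being force-deactivated, and there is a meaningful amount of inactive anon left at the current priority. A stand-alone toy recalculation with made-up page counts (not taken from any real system):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative restatement of the file_is_tiny condition. */
static bool file_is_tiny(unsigned long file, unsigned long free,
			 unsigned long total_high_wmark,
			 unsigned long inactive_anon, int priority,
			 bool may_deactivate_anon)
{
	return file + free <= total_high_wmark &&
	       !may_deactivate_anon &&
	       (inactive_anon >> priority) != 0;
}

int main(void)
{
	/* made-up counts: 8 MiB of file LRU + 20 MiB free vs. 48 MiB of
	 * summed high watermarks, 4 GiB of inactive anon, priority 12 */
	bool tiny = file_is_tiny(2048, 5120, 12288, 1048576, 12, false);

	printf("file_is_tiny = %d\n", tiny);	/* prints 1 */
	return 0;
}
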
@@ -2748,7 +2947,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long ap, fp;
enum lru_list lru;
- /* If we have no swap space, do not bother scanning anon pages. */
+ /* If we have no swap space, do not bother scanning anon folios. */
if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
scan_balance = SCAN_FILE;
goto out;
@@ -2947,6 +3146,2762 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
return can_demote(pgdat->node_id, sc);
}
+#ifdef CONFIG_LRU_GEN
+
+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap])
+#else
+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
+#endif
+
+/******************************************************************************
+ * shorthand helpers
+ ******************************************************************************/
+
+#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
+
+#define DEFINE_MAX_SEQ(lruvec) \
+ unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
+
+#define DEFINE_MIN_SEQ(lruvec) \
+ unsigned long min_seq[ANON_AND_FILE] = { \
+ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \
+ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
+ }
+
+#define for_each_gen_type_zone(gen, type, zone) \
+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+
+static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+
+#ifdef CONFIG_MEMCG
+ if (memcg) {
+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
+
+ /* see the comment in mem_cgroup_lruvec() */
+ if (!lruvec->pgdat)
+ lruvec->pgdat = pgdat;
+
+ return lruvec;
+ }
+#endif
+ VM_WARN_ON_ONCE(!mem_cgroup_disabled());
+
+ return &pgdat->__lruvec;
+}
+
+static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
+{
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ if (!can_demote(pgdat->node_id, sc) &&
+ mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
+ return 0;
+
+ return mem_cgroup_swappiness(memcg);
+}
+
+static int get_nr_gens(struct lruvec *lruvec, int type)
+{
+ return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
+}
+
+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
+{
+ /* see the comment on lru_gen_struct */
+ return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
+ get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
+ get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
+}
+
+/******************************************************************************
+ * mm_struct list
+ ******************************************************************************/
+
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
+{
+ static struct lru_gen_mm_list mm_list = {
+ .fifo = LIST_HEAD_INIT(mm_list.fifo),
+ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
+ };
+
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ return &memcg->mm_list;
+#endif
+ VM_WARN_ON_ONCE(!mem_cgroup_disabled());
+
+ return &mm_list;
+}
+
+void lru_gen_add_mm(struct mm_struct *mm)
+{
+ int nid;
+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+ VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
+#ifdef CONFIG_MEMCG
+ VM_WARN_ON_ONCE(mm->lru_gen.memcg);
+ mm->lru_gen.memcg = memcg;
+#endif
+ spin_lock(&mm_list->lock);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ /* the first addition since the last iteration */
+ if (lruvec->mm_state.tail == &mm_list->fifo)
+ lruvec->mm_state.tail = &mm->lru_gen.list;
+ }
+
+ list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
+
+ spin_unlock(&mm_list->lock);
+}
+
+void lru_gen_del_mm(struct mm_struct *mm)
+{
+ int nid;
+ struct lru_gen_mm_list *mm_list;
+ struct mem_cgroup *memcg = NULL;
+
+ if (list_empty(&mm->lru_gen.list))
+ return;
+
+#ifdef CONFIG_MEMCG
+ memcg = mm->lru_gen.memcg;
+#endif
+ mm_list = get_mm_list(memcg);
+
+ spin_lock(&mm_list->lock);
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ /* where the last iteration ended (exclusive) */
+ if (lruvec->mm_state.tail == &mm->lru_gen.list)
+ lruvec->mm_state.tail = lruvec->mm_state.tail->next;
+
+ /* where the current iteration continues (inclusive) */
+ if (lruvec->mm_state.head != &mm->lru_gen.list)
+ continue;
+
+ lruvec->mm_state.head = lruvec->mm_state.head->next;
+ /* the deletion ends the current iteration */
+ if (lruvec->mm_state.head == &mm_list->fifo)
+ WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
+ }
+
+ list_del_init(&mm->lru_gen.list);
+
+ spin_unlock(&mm_list->lock);
+
+#ifdef CONFIG_MEMCG
+ mem_cgroup_put(mm->lru_gen.memcg);
+ mm->lru_gen.memcg = NULL;
+#endif
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+ struct mem_cgroup *memcg;
+ struct task_struct *task = rcu_dereference_protected(mm->owner, true);
+
+ VM_WARN_ON_ONCE(task->mm != mm);
+ lockdep_assert_held(&task->alloc_lock);
+
+ /* for mm_update_next_owner() */
+ if (mem_cgroup_disabled())
+ return;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(task);
+ rcu_read_unlock();
+ if (memcg == mm->lru_gen.memcg)
+ return;
+
+ VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
+ VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
+
+ lru_gen_del_mm(mm);
+ lru_gen_add_mm(mm);
+}
+#endif
+
+/*
+ * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
+ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
+ * bits in a bitmap, k is the number of hash functions and n is the number of
+ * inserted items.
+ *
+ * Page table walkers use one of the two filters to reduce their search space.
+ * To get rid of non-leaf entries that no longer have enough leaf entries, the
+ * aging uses the double-buffering technique to flip to the other filter each
+ * time it produces a new generation. For non-leaf entries that have enough
+ * leaf entries, the aging carries them over to the next generation in
+ * walk_pmd_range(); the eviction also reports them when walking the rmap
+ * in lru_gen_look_around().
+ *
+ * For future optimizations:
+ * 1. It's not necessary to keep both filters all the time. The spare one can be
+ * freed after the RCU grace period and reallocated if needed again.
+ * 2. When reallocating, it's worth scaling its size according to the number
+ * of inserted entries in the other filter, to reduce the memory overhead on
+ * small systems and false positives on large systems.
+ * 3. Jenkins' hash function is an alternative to Knuth's.
+ */
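+/* the rates quoted above follow the standard approximation (1 - e^(-k*n/m))^k */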
+#define BLOOM_FILTER_SHIFT 15
+
+static inline int filter_gen_from_seq(unsigned long seq)
+{
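+ /* even and odd seqs use different filters, implementing the double buffering above */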
+ return seq % NR_BLOOM_FILTERS;
+}
+
+static void get_item_key(void *item, int *key)
+{
+ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
+
+ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
+
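+ /* split the hash into two independent 15-bit keys, i.e. the k=2 hash functions */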
+ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
+ key[1] = hash >> BLOOM_FILTER_SHIFT;
+}
+
+static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
+{
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ filter = lruvec->mm_state.filters[gen];
+ if (filter) {
+ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
+ return;
+ }
+
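+ /* lazily allocate the filter; if allocation fails, filtering is simply skipped */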
+ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
+}
+
+static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+ int key[2];
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+ if (!filter)
+ return;
+
+ get_item_key(item, key);
+
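+ /* test first to avoid dirtying the cacheline when the bits are already set */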
+ if (!test_bit(key[0], filter))
+ set_bit(key[0], filter);
+ if (!test_bit(key[1], filter))
+ set_bit(key[1], filter);
+}
+
+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+ int key[2];
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+ if (!filter)
+ return true;
+
+ get_item_key(item, key);
+
+ return test_bit(key[0], filter) && test_bit(key[1], filter);
+}
+
+static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
+{
+ int i;
+ int hist;
+
+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
+
+ if (walk) {
+ hist = lru_hist_from_seq(walk->max_seq);
+
+ for (i = 0; i < NR_MM_STATS; i++) {
+ WRITE_ONCE(lruvec->mm_state.stats[hist][i],
+ lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
+ walk->mm_stats[i] = 0;
+ }
+ }
+
+ if (NR_HIST_GENS > 1 && last) {
+ hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
+
+ for (i = 0; i < NR_MM_STATS; i++)
+ WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
+ }
+}
+
+static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
+{
+ int type;
+ unsigned long size = 0;
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+ int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
+
+ if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
+ return true;
+
+ clear_bit(key, &mm->lru_gen.bitmap);
+
+ for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
+ size += type ? get_mm_counter(mm, MM_FILEPAGES) :
+ get_mm_counter(mm, MM_ANONPAGES) +
+ get_mm_counter(mm, MM_SHMEMPAGES);
+ }
+
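+ /* not worth walking an mm whose pages of interest don't fill even one batch */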
+ if (size < MIN_LRU_BATCH)
+ return true;
+
+ return !mmget_not_zero(mm);
+}
+
+static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
+ struct mm_struct **iter)
+{
+ bool first = false;
+ bool last = true;
+ struct mm_struct *mm = NULL;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+ struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+
+ /*
+ * There are four interesting cases for this page table walker:
+ * 1. It tries to start a new iteration of mm_list with a stale max_seq;
+ * there is nothing left to do.
+ * 2. It's the first of the current generation, and it needs to reset
+ * the Bloom filter for the next generation.
+ * 3. It reaches the end of mm_list, and it needs to increment
+ * mm_state->seq; the iteration is done.
+ * 4. It's the last of the current generation, and it needs to reset the
+ * mm stats counters for the next generation.
+ */
+ spin_lock(&mm_list->lock);
+
+ VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
+ VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
+ VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
+
+ if (walk->max_seq <= mm_state->seq) {
+ if (!*iter)
+ last = false;
+ goto done;
+ }
+
+ if (!mm_state->nr_walkers) {
+ VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
+
+ mm_state->head = mm_list->fifo.next;
+ first = true;
+ }
+
+ while (!mm && mm_state->head != &mm_list->fifo) {
+ mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
+
+ mm_state->head = mm_state->head->next;
+
+ /* force scan for those added after the last iteration */
+ if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
+ mm_state->tail = mm_state->head;
+ walk->force_scan = true;
+ }
+
+ if (should_skip_mm(mm, walk))
+ mm = NULL;
+ }
+
+ if (mm_state->head == &mm_list->fifo)
+ WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+done:
+ if (*iter && !mm)
+ mm_state->nr_walkers--;
+ if (!*iter && mm)
+ mm_state->nr_walkers++;
+
+ if (mm_state->nr_walkers)
+ last = false;
+
+ if (*iter || last)
+ reset_mm_stats(lruvec, walk, last);
+
+ spin_unlock(&mm_list->lock);
+
+ if (mm && first)
+ reset_bloom_filter(lruvec, walk->max_seq + 1);
+
+ if (*iter)
+ mmput_async(*iter);
+
+ *iter = mm;
+
+ return last;
+}
+
+static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
+{
+ bool success = false;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+ struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+
+ spin_lock(&mm_list->lock);
+
+ VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
+
+ if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
+ VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
+
+ WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+ reset_mm_stats(lruvec, NULL, true);
+ success = true;
+ }
+
+ spin_unlock(&mm_list->lock);
+
+ return success;
+}
+
+/******************************************************************************
+ * refault feedback loop
+ ******************************************************************************/
+
+/*
+ * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
+ *
+ * The P term is refaulted/(evicted+protected) from a tier in the generation
+ * currently being evicted; the I term is the exponential moving average of the
+ * P term over the generations previously evicted, using the smoothing factor
+ * 1/2; the D term isn't supported.
+ *
+ * The setpoint (SP) is always the first tier of one type; the process variable
+ * (PV) is either any tier of the other type or any other tier of the same
+ * type.
+ *
+ * The error is the difference between the SP and the PV; the correction is to
+ * turn off protection when SP>PV or turn on protection when SP<PV.
+ *
+ * For future optimizations:
+ * 1. The D term may discount the other two terms over time so that long-lived
+ * generations can resist stale information.
+ */
+struct ctrl_pos {
+ unsigned long refaulted;
+ unsigned long total;
+ int gain;
+};
+
+static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
+ struct ctrl_pos *pos)
+{
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
+
+ pos->refaulted = lrugen->avg_refaulted[type][tier] +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ pos->total = lrugen->avg_total[type][tier] +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ pos->total += lrugen->protected[hist][type][tier - 1];
+ pos->gain = gain;
+}
+
+static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
+{
+ int hist, tier;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
+ unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
+
+ lockdep_assert_held(&lruvec->lru_lock);
+
+ if (!carryover && !clear)
+ return;
+
+ hist = lru_hist_from_seq(seq);
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ if (carryover) {
+ unsigned long sum;
+
+ sum = lrugen->avg_refaulted[type][tier] +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
+
+ sum = lrugen->avg_total[type][tier] +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ sum += lrugen->protected[hist][type][tier - 1];
+ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
+ }
+
+ if (clear) {
+ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
+ atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
+ if (tier)
+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
+ }
+ }
+}
+
+static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
+{
+ /*
+ * Return true if the PV has a limited number of refaults or a lower
+ * refaulted/total than the SP.
+ */
+ return pv->refaulted < MIN_LRU_BATCH ||
+ pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
+ (sp->refaulted + 1) * pv->total * pv->gain;
+}
+
+/******************************************************************************
+ * the aging
+ ******************************************************************************/
+
+/* promote pages accessed through page tables */
+static int folio_update_gen(struct folio *folio, int gen)
+{
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(!rcu_read_lock_held());
+
+ do {
+ /* lru_gen_del_folio() has isolated this page? */
+ if (!(old_flags & LRU_GEN_MASK)) {
+ /* for shrink_folio_list() */
+ new_flags = old_flags | BIT(PG_referenced);
+ continue;
+ }
+
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+
+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
+/* protect pages accessed multiple times through file descriptors */
+static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ int type = folio_is_file_lru(folio);
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+
+ VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
+
+ do {
+ new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+ /* folio_update_gen() has promoted this page? */
+ if (new_gen >= 0 && new_gen != old_gen)
+ return new_gen;
+
+ new_gen = (old_gen + 1) % MAX_NR_GENS;
+
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
+ /* for folio_end_writeback() */
+ if (reclaiming)
+ new_flags |= BIT(PG_reclaim);
+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+
+ lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+
+ return new_gen;
+}
+
+static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
+ int old_gen, int new_gen)
+{
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ int delta = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
+
+ walk->batched++;
+
+ walk->nr_pages[old_gen][type][zone] -= delta;
+ walk->nr_pages[new_gen][type][zone] += delta;
+}
+
+static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
+{
+ int gen, type, zone;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ walk->batched = 0;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
+ int delta = walk->nr_pages[gen][type][zone];
+
+ if (!delta)
+ continue;
+
+ walk->nr_pages[gen][type][zone] = 0;
+ WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
+ lrugen->nr_pages[gen][type][zone] + delta);
+
+ if (lru_gen_is_active(lruvec, gen))
+ lru += LRU_ACTIVE;
+ __update_lru_size(lruvec, lru, zone, delta);
+ }
+}
+
+static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
+{
+ struct address_space *mapping;
+ struct vm_area_struct *vma = args->vma;
+ struct lru_gen_mm_walk *walk = args->private;
+
+ if (!vma_is_accessible(vma))
+ return true;
+
+ if (is_vm_hugetlb_page(vma))
+ return true;
+
+ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
+ return true;
+
+ if (vma == get_gate_vma(vma->vm_mm))
+ return true;
+
+ if (vma_is_anonymous(vma))
+ return !walk->can_swap;
+
+ if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
+ return true;
+
+ mapping = vma->vm_file->f_mapping;
+ if (mapping_unevictable(mapping))
+ return true;
+
+ if (shmem_mapping(mapping))
+ return !walk->can_swap;
+
+ /* to exclude special mappings like dax, etc. */
+ return !mapping->a_ops->read_folio;
+}
+
+/*
+ * Some userspace memory allocators map many single-page VMAs. Instead of
+ * returning to the PGD table for each such VMA, finish an entire PMD
+ * table to reduce zigzags and improve cache performance.
+ */
+static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
+ unsigned long *vm_start, unsigned long *vm_end)
+{
+ unsigned long start = round_up(*vm_end, size);
+ unsigned long end = (start | ~mask) + 1;
+ VMA_ITERATOR(vmi, args->mm, start);
+
+ VM_WARN_ON_ONCE(mask & size);
+ VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
+
+ for_each_vma(vmi, args->vma) {
+ if (end && end <= args->vma->vm_start)
+ return false;
+
+ if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args))
+ continue;
+
+ *vm_start = max(start, args->vma->vm_start);
+ *vm_end = min(end - 1, args->vma->vm_end - 1) + 1;
+
+ return true;
+ }
+
+ return false;
+}
+
+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long pfn = pte_pfn(pte);
+
+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
+
+ if (!pte_present(pte) || is_zero_pfn(pfn))
+ return -1;
+
+ if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
+ return -1;
+
+ if (WARN_ON_ONCE(!pfn_valid(pfn)))
+ return -1;
+
+ return pfn;
+}
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
+static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long pfn = pmd_pfn(pmd);
+
+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
+
+ if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
+ return -1;
+
+ if (WARN_ON_ONCE(pmd_devmap(pmd)))
+ return -1;
+
+ if (WARN_ON_ONCE(!pfn_valid(pfn)))
+ return -1;
+
+ return pfn;
+}
+#endif
+
+static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
+ struct pglist_data *pgdat, bool can_swap)
+{
+ struct folio *folio;
+
+ /* try to avoid unnecessary memory loads */
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ return NULL;
+
+ folio = pfn_folio(pfn);
+ if (folio_nid(folio) != pgdat->node_id)
+ return NULL;
+
+ if (folio_memcg_rcu(folio) != memcg)
+ return NULL;
+
+ /* file VMAs can contain anon pages from COW */
+ if (!folio_is_file_lru(folio) && !can_swap)
+ return NULL;
+
+ return folio;
+}
+
+static bool suitable_to_scan(int total, int young)
+{
+ int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
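+ /* e.g., 64-byte cachelines and 8-byte PTEs give n=8 */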
+
+ /* suitable if the average number of young PTEs per cacheline is >=1 */
+ return young * n >= total;
+}
+
+static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
+ struct mm_walk *args)
+{
+ int i;
+ pte_t *pte;
+ spinlock_t *ptl;
+ unsigned long addr;
+ int total = 0;
+ int young = 0;
+ struct lru_gen_mm_walk *walk = args->private;
+ struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
+
+ VM_WARN_ON_ONCE(pmd_leaf(*pmd));
+
+ ptl = pte_lockptr(args->mm, pmd);
+ if (!spin_trylock(ptl))
+ return false;
+
+ arch_enter_lazy_mmu_mode();
+
+ pte = pte_offset_map(pmd, start & PMD_MASK);
+restart:
+ for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
+ unsigned long pfn;
+ struct folio *folio;
+
+ total++;
+ walk->mm_stats[MM_LEAF_TOTAL]++;
+
+ pfn = get_pte_pfn(pte[i], args->vma, addr);
+ if (pfn == -1)
+ continue;
+
+ if (!pte_young(pte[i])) {
+ walk->mm_stats[MM_LEAF_OLD]++;
+ continue;
+ }
+
+ folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ if (!folio)
+ continue;
+
+ if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
+ VM_WARN_ON_ONCE(true);
+
+ young++;
+ walk->mm_stats[MM_LEAF_YOUNG]++;
+
+ if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen)
+ update_batch_size(walk, folio, old_gen, new_gen);
+ }
+
+ if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
+ goto restart;
+
+ pte_unmap(pte);
+
+ arch_leave_lazy_mmu_mode();
+ spin_unlock(ptl);
+
+ return suitable_to_scan(total, young);
+}
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
+static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
+ struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
+{
+ int i;
+ pmd_t *pmd;
+ spinlock_t *ptl;
+ struct lru_gen_mm_walk *walk = args->private;
+ struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
+
+ VM_WARN_ON_ONCE(pud_leaf(*pud));
+
+ /* try to batch at most 1+MIN_LRU_BATCH+1 entries */
+ if (*start == -1) {
+ *start = next;
+ return;
+ }
+
+ i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start);
+ if (i && i <= MIN_LRU_BATCH) {
+ __set_bit(i - 1, bitmap);
+ return;
+ }
+
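+ /* flush the batch: clear the accessed bit in the recorded PMD entries under the PMD lock */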
+ pmd = pmd_offset(pud, *start);
+
+ ptl = pmd_lockptr(args->mm, pmd);
+ if (!spin_trylock(ptl))
+ goto done;
+
+ arch_enter_lazy_mmu_mode();
+
+ do {
+ unsigned long pfn;
+ struct folio *folio;
+ unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start;
+
+ pfn = get_pmd_pfn(pmd[i], vma, addr);
+ if (pfn == -1)
+ goto next;
+
+ if (!pmd_trans_huge(pmd[i])) {
+ if (arch_has_hw_nonleaf_pmd_young() &&
+ get_cap(LRU_GEN_NONLEAF_YOUNG))
+ pmdp_test_and_clear_young(vma, addr, pmd + i);
+ goto next;
+ }
+
+ folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ if (!folio)
+ goto next;
+
+ if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
+ goto next;
+
+ walk->mm_stats[MM_LEAF_YOUNG]++;
+
+ if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen)
+ update_batch_size(walk, folio, old_gen, new_gen);
+next:
+ i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
+ } while (i <= MIN_LRU_BATCH);
+
+ arch_leave_lazy_mmu_mode();
+ spin_unlock(ptl);
+done:
+ *start = -1;
+ bitmap_zero(bitmap, MIN_LRU_BATCH);
+}
+#else
+static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
+ struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
+{
+}
+#endif
+
+static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
+ struct mm_walk *args)
+{
+ int i;
+ pmd_t *pmd;
+ unsigned long next;
+ unsigned long addr;
+ struct vm_area_struct *vma;
+ unsigned long pos = -1;
+ struct lru_gen_mm_walk *walk = args->private;
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
+
+ VM_WARN_ON_ONCE(pud_leaf(*pud));
+
+ /*
+ * Finish an entire PMD in two passes: the first only reaches PTE
+ * tables to avoid taking the PMD lock; the second, if necessary, takes
+ * the PMD lock to clear the accessed bit in PMD entries.
+ */
+ pmd = pmd_offset(pud, start & PUD_MASK);
+restart:
+ /* walk_pte_range() may call get_next_vma() */
+ vma = args->vma;
+ for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
+ pmd_t val = pmdp_get_lockless(pmd + i);
+
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(val) || is_huge_zero_pmd(val)) {
+ walk->mm_stats[MM_LEAF_TOTAL]++;
+ continue;
+ }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(val)) {
+ unsigned long pfn = pmd_pfn(val);
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+
+ walk->mm_stats[MM_LEAF_TOTAL]++;
+
+ if (!pmd_young(val)) {
+ walk->mm_stats[MM_LEAF_OLD]++;
+ continue;
+ }
+
+ /* try to avoid unnecessary memory loads */
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ continue;
+
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+ continue;
+ }
+#endif
+ walk->mm_stats[MM_NONLEAF_TOTAL]++;
+
+ if (arch_has_hw_nonleaf_pmd_young() &&
+ get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+ if (!pmd_young(val))
+ continue;
+
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+ }
+
+ if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
+ continue;
+
+ walk->mm_stats[MM_NONLEAF_FOUND]++;
+
+ if (!walk_pte_range(&val, addr, next, args))
+ continue;
+
+ walk->mm_stats[MM_NONLEAF_ADDED]++;
+
+ /* carry over to the next generation */
+ update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
+ }
+
+ walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos);
+
+ if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
+ goto restart;
+}
+
+static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
+ struct mm_walk *args)
+{
+ int i;
+ pud_t *pud;
+ unsigned long addr;
+ unsigned long next;
+ struct lru_gen_mm_walk *walk = args->private;
+
+ VM_WARN_ON_ONCE(p4d_leaf(*p4d));
+
+ pud = pud_offset(p4d, start & P4D_MASK);
+restart:
+ for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
+ pud_t val = READ_ONCE(pud[i]);
+
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
+ continue;
+
+ walk_pmd_range(&val, addr, next, args);
+
+ /* a racy check to curtail the waiting time */
+ if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
+ return 1;
+
+ if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
+ end = (addr | ~PUD_MASK) + 1;
+ goto done;
+ }
+ }
+
+ if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
+ goto restart;
+
+ end = round_up(end, P4D_SIZE);
+done:
+ if (!end || !args->vma)
+ return 1;
+
+ walk->next_addr = max(end, args->vma->vm_start);
+
+ return -EAGAIN;
+}
+
+static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
+{
+ static const struct mm_walk_ops mm_walk_ops = {
+ .test_walk = should_skip_vma,
+ .p4d_entry = walk_pud_range,
+ };
+
+ int err;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+ walk->next_addr = FIRST_USER_ADDRESS;
+
+ do {
+ err = -EBUSY;
+
+ /* folio_update_gen() requires stable folio_memcg() */
+ if (!mem_cgroup_trylock_pages(memcg))
+ break;
+
+ /* the caller might be holding the lock for write */
+ if (mmap_read_trylock(mm)) {
+ err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
+
+ mmap_read_unlock(mm);
+ }
+
+ mem_cgroup_unlock_pages();
+
+ if (walk->batched) {
+ spin_lock_irq(&lruvec->lru_lock);
+ reset_batch_size(lruvec, walk);
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ cond_resched();
+ } while (err == -EAGAIN);
+}
+
+static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
+{
+ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
+
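+ /* kswapd uses the preallocated per-node walk; direct reclaim allocates one on demand */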
+ if (pgdat && current_is_kswapd()) {
+ VM_WARN_ON_ONCE(walk);
+
+ walk = &pgdat->mm_walk;
+ } else if (!pgdat && !walk) {
+ VM_WARN_ON_ONCE(current_is_kswapd());
+
+ walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ }
+
+ current->reclaim_state->mm_walk = walk;
+
+ return walk;
+}
+
+static void clear_mm_walk(void)
+{
+ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
+
+ VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages)));
+ VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats)));
+
+ current->reclaim_state->mm_walk = NULL;
+
+ if (!current_is_kswapd())
+ kfree(walk);
+}
+
+static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
+{
+ int zone;
+ int remaining = MAX_LRU_BATCH;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ if (type == LRU_GEN_ANON && !can_swap)
+ goto done;
+
+ /* prevent cold/hot inversion if force_scan is true */
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ struct list_head *head = &lrugen->lists[old_gen][type][zone];
+
+ while (!list_empty(head)) {
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+
+ new_gen = folio_inc_gen(lruvec, folio, false);
+ list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+done:
+ reset_ctrl_pos(lruvec, type, true);
+ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
+
+ return true;
+}
+
+static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
+{
+ int gen, type, zone;
+ bool success = false;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ DEFINE_MIN_SEQ(lruvec);
+
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+
+ /* find the oldest populated generation */
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
+ gen = lru_gen_from_seq(min_seq[type]);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ if (!list_empty(&lrugen->lists[gen][type][zone]))
+ goto next;
+ }
+
+ min_seq[type]++;
+ }
+next:
+ ;
+ }
+
+ /* see the comment on lru_gen_struct */
+ if (can_swap) {
+ min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
+ min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
+ }
+
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ if (min_seq[type] == lrugen->min_seq[type])
+ continue;
+
+ reset_ctrl_pos(lruvec, type, true);
+ WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
+ success = true;
+ }
+
+ return success;
+}
+
+static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
+{
+ int prev, next;
+ int type, zone;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+
+ for (type = ANON_AND_FILE - 1; type >= 0; type--) {
+ if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
+ continue;
+
+ VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
+
+ while (!inc_min_seq(lruvec, type, can_swap)) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
+ }
+
+ /*
+ * Update the active/inactive LRU sizes for compatibility. Both sides of
+ * the current max_seq need to be covered, since max_seq+1 can overlap
+ * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
+ * overlap, cold/hot inversion happens.
+ */
+ prev = lru_gen_from_seq(lrugen->max_seq - 1);
+ next = lru_gen_from_seq(lrugen->max_seq + 1);
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
+ long delta = lrugen->nr_pages[prev][type][zone] -
+ lrugen->nr_pages[next][type][zone];
+
+ if (!delta)
+ continue;
+
+ __update_lru_size(lruvec, lru, zone, delta);
+ __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
+ }
+ }
+
+ for (type = 0; type < ANON_AND_FILE; type++)
+ reset_ctrl_pos(lruvec, type, false);
+
+ WRITE_ONCE(lrugen->timestamps[next], jiffies);
+ /* make sure preceding modifications appear */
+ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
+
+ spin_unlock_irq(&lruvec->lru_lock);
+}
+
+static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+ struct scan_control *sc, bool can_swap, bool force_scan)
+{
+ bool success;
+ struct lru_gen_mm_walk *walk;
+ struct mm_struct *mm = NULL;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
+
+ /* see the comment in iterate_mm_list() */
+ if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
+ success = false;
+ goto done;
+ }
+
+ /*
+ * If the hardware doesn't automatically set the accessed bit, fall back
+ * to lru_gen_look_around(), which only clears the accessed bit in a
+ * handful of PTEs. Spreading the work out over a period of time is
+ * usually less efficient, but it avoids bursty page faults.
+ */
+ if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
+ success = iterate_mm_list_nowalk(lruvec, max_seq);
+ goto done;
+ }
+
+ walk = set_mm_walk(NULL);
+ if (!walk) {
+ success = iterate_mm_list_nowalk(lruvec, max_seq);
+ goto done;
+ }
+
+ walk->lruvec = lruvec;
+ walk->max_seq = max_seq;
+ walk->can_swap = can_swap;
+ walk->force_scan = force_scan;
+
+ do {
+ success = iterate_mm_list(lruvec, walk, &mm);
+ if (mm)
+ walk_mm(lruvec, mm, walk);
+
+ cond_resched();
+ } while (mm);
+done:
+ if (!success) {
+ if (sc->priority <= DEF_PRIORITY - 2)
+ wait_event_killable(lruvec->mm_state.wait,
+ max_seq < READ_ONCE(lrugen->max_seq));
+
+ return max_seq < READ_ONCE(lrugen->max_seq);
+ }
+
+ VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
+
+ inc_max_seq(lruvec, can_swap, force_scan);
+ /* either this sees any waiters or they will see updated max_seq */
+ if (wq_has_sleeper(&lruvec->mm_state.wait))
+ wake_up_all(&lruvec->mm_state.wait);
+
+ return true;
+}
+
+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
+ struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+{
+ int gen, type, zone;
+ unsigned long old = 0;
+ unsigned long young = 0;
+ unsigned long total = 0;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
+ unsigned long size = 0;
+
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ total += size;
+ if (seq == max_seq)
+ young += size;
+ else if (seq + MIN_NR_GENS == max_seq)
+ old += size;
+ }
+ }
+
+ /* try to scrape all its memory if this memcg was deleted */
+ *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+
+ /*
+ * The aging tries to be lazy to reduce the overhead, while the eviction
+ * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
+ * ideal number of generations is MIN_NR_GENS+1.
+ */
+ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
+ return true;
+ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
+ return false;
+
+ /*
+ * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
+ * of the total number of pages for each generation. A reasonable range
+ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
+ * aging cares about the upper bound of hot pages, while the eviction
+ * cares about the lower bound of cold pages.
+ */
+ if (young * MIN_NR_GENS > total)
+ return true;
+ if (old * (MIN_NR_GENS + 2) < total)
+ return true;
+
+ return false;
+}
+
+static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
+{
+ bool need_aging;
+ unsigned long nr_to_scan;
+ int swappiness = get_swappiness(lruvec, sc);
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
+
+ mem_cgroup_calculate_protection(NULL, memcg);
+
+ if (mem_cgroup_below_min(NULL, memcg))
+ return false;
+
+ need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
+
+ if (min_ttl) {
+ int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ if (time_is_after_jiffies(birth + min_ttl))
+ return false;
+
+ /* the size is likely too small to be helpful */
+ if (!nr_to_scan && sc->priority != DEF_PRIORITY)
+ return false;
+ }
+
+ if (need_aging)
+ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
+
+ return true;
+}
+
+/* to protect the working set of the last N jiffies */
+static unsigned long lru_gen_min_ttl __read_mostly;
+
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ struct mem_cgroup *memcg;
+ bool success = false;
+ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
+
+ VM_WARN_ON_ONCE(!current_is_kswapd());
+
+ sc->last_reclaimed = sc->nr_reclaimed;
+
+ /*
+ * To reduce the chance of going into the aging path, which can be
+ * costly, optimistically skip it if the flag below was cleared in the
+ * eviction path. This improves the overall performance when multiple
+ * memcgs are available.
+ */
+ if (!sc->memcgs_need_aging) {
+ sc->memcgs_need_aging = true;
+ return;
+ }
+
+ set_mm_walk(pgdat);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ if (age_lruvec(lruvec, sc, min_ttl))
+ success = true;
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ clear_mm_walk();
+
+ /* check the order to exclude compaction-induced reclaim */
+ if (success || !min_ttl || sc->order)
+ return;
+
+ /*
+ * The main goal is to OOM kill if every generation from all memcgs is
+ * younger than min_ttl. However, another possibility is all memcgs are
+ * either below min or empty.
+ */
+ if (mutex_trylock(&oom_lock)) {
+ struct oom_control oc = {
+ .gfp_mask = sc->gfp_mask,
+ };
+
+ out_of_memory(&oc);
+
+ mutex_unlock(&oom_lock);
+ }
+}
+
+/*
+ * This function exploits spatial locality when shrink_folio_list() walks the
+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
+ * the scan was done cacheline efficiently, it adds the PMD entry pointing to
+ * the PTE table to the Bloom filter. This forms a feedback loop between the
+ * eviction and the aging.
+ */
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+ int i;
+ pte_t *pte;
+ unsigned long start;
+ unsigned long end;
+ unsigned long addr;
+ struct lru_gen_mm_walk *walk;
+ int young = 0;
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
+ struct folio *folio = pfn_folio(pvmw->pfn);
+ struct mem_cgroup *memcg = folio_memcg(folio);
+ struct pglist_data *pgdat = folio_pgdat(folio);
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ DEFINE_MAX_SEQ(lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
+
+ lockdep_assert_held(pvmw->ptl);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
+
+ if (spin_is_contended(pvmw->ptl))
+ return;
+
+ /* avoid taking the LRU lock under the PTL when possible */
+ walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
+
+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
+ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
+
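+ /* clamp the look-around window to MIN_LRU_BATCH pages around the faulting address */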
+ if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
+ if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ end = start + MIN_LRU_BATCH * PAGE_SIZE;
+ else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ start = end - MIN_LRU_BATCH * PAGE_SIZE;
+ else {
+ start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
+ end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
+ }
+ }
+
+ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
+
+ rcu_read_lock();
+ arch_enter_lazy_mmu_mode();
+
+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+ unsigned long pfn;
+
+ pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
+ if (pfn == -1)
+ continue;
+
+ if (!pte_young(pte[i]))
+ continue;
+
+ folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap);
+ if (!folio)
+ continue;
+
+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
+ VM_WARN_ON_ONCE(true);
+
+ young++;
+
+ if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ old_gen = folio_lru_gen(folio);
+ if (old_gen < 0)
+ folio_set_referenced(folio);
+ else if (old_gen != new_gen)
+ __set_bit(i, bitmap);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ rcu_read_unlock();
+
+ /* feedback from rmap walkers to page table walkers */
+ if (suitable_to_scan(i, young))
+ update_bloom_filter(lruvec, max_seq, pvmw->pmd);
+
+ if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ folio = pfn_folio(pte_pfn(pte[i]));
+ folio_activate(folio);
+ }
+ return;
+ }
+
+ /* folio_update_gen() requires stable folio_memcg() */
+ if (!mem_cgroup_trylock_pages(memcg))
+ return;
+
+ if (!walk) {
+ spin_lock_irq(&lruvec->lru_lock);
+ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
+ }
+
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ folio = pfn_folio(pte_pfn(pte[i]));
+ if (folio_memcg_rcu(folio) != memcg)
+ continue;
+
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen < 0 || old_gen == new_gen)
+ continue;
+
+ if (walk)
+ update_batch_size(walk, folio, old_gen, new_gen);
+ else
+ lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+ }
+
+ if (!walk)
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ mem_cgroup_unlock_pages();
+}
+
+/******************************************************************************
+ * the eviction
+ ******************************************************************************/
+
+static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
+{
+ bool success;
+ int gen = folio_lru_gen(folio);
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ int delta = folio_nr_pages(folio);
+ int refs = folio_lru_refs(folio);
+ int tier = lru_tier_from_refs(refs);
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
+
+ /* unevictable */
+ if (!folio_evictable(folio)) {
+ success = lru_gen_del_folio(lruvec, folio, true);
+ VM_WARN_ON_ONCE_FOLIO(!success, folio);
+ folio_set_unevictable(folio);
+ lruvec_add_folio(lruvec, folio);
+ __count_vm_events(UNEVICTABLE_PGCULLED, delta);
+ return true;
+ }
+
+ /* dirty lazyfree */
+ if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) {
+ success = lru_gen_del_folio(lruvec, folio, true);
+ VM_WARN_ON_ONCE_FOLIO(!success, folio);
+ folio_set_swapbacked(folio);
+ lruvec_add_folio_tail(lruvec, folio);
+ return true;
+ }
+
+ /* promoted */
+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
+ list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
+ return true;
+ }
+
+ /* protected */
+ if (tier > tier_idx) {
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
+
+ gen = folio_inc_gen(lruvec, folio, false);
+ list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
+
+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
+ lrugen->protected[hist][type][tier - 1] + delta);
+ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
+ return true;
+ }
+
+ /* waiting for writeback */
+ if (folio_test_locked(folio) || folio_test_writeback(folio) ||
+ (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
+ gen = folio_inc_gen(lruvec, folio, true);
+ list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
+ return true;
+ }
+
+ return false;
+}
+
+static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc)
+{
+ bool success;
+
+ /* unmapping inhibited */
+ if (!sc->may_unmap && folio_mapped(folio))
+ return false;
+
+ /* swapping inhibited */
+ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
+ (folio_test_dirty(folio) ||
+ (folio_test_anon(folio) && !folio_test_swapcache(folio))))
+ return false;
+
+ /* raced with release_pages() */
+ if (!folio_try_get(folio))
+ return false;
+
+ /* raced with another isolation */
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return false;
+ }
+
+ /* see the comment on MAX_NR_TIERS */
+ if (!folio_test_referenced(folio))
+ set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
+
+ /* for shrink_folio_list() */
+ folio_clear_reclaim(folio);
+ folio_clear_referenced(folio);
+
+ success = lru_gen_del_folio(lruvec, folio, true);
+ VM_WARN_ON_ONCE_FOLIO(!success, folio);
+
+ return true;
+}
+
+static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
+ int type, int tier, struct list_head *list)
+{
+ int gen, zone;
+ enum vm_event_item item;
+ int sorted = 0;
+ int scanned = 0;
+ int isolated = 0;
+ int remaining = MAX_LRU_BATCH;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+ VM_WARN_ON_ONCE(!list_empty(list));
+
+ if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
+ return 0;
+
+ gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ for (zone = sc->reclaim_idx; zone >= 0; zone--) {
+ LIST_HEAD(moved);
+ int skipped = 0;
+ struct list_head *head = &lrugen->lists[gen][type][zone];
+
+ while (!list_empty(head)) {
+ struct folio *folio = lru_to_folio(head);
+ int delta = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+
+ scanned += delta;
+
+ if (sort_folio(lruvec, folio, tier))
+ sorted += delta;
+ else if (isolate_folio(lruvec, folio, sc)) {
+ list_add(&folio->lru, list);
+ isolated += delta;
+ } else {
+ list_move(&folio->lru, &moved);
+ skipped += delta;
+ }
+
+ if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH)
+ break;
+ }
+
+ if (skipped) {
+ list_splice(&moved, head);
+ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
+ }
+
+ if (!remaining || isolated >= MIN_LRU_BATCH)
+ break;
+ }
+
+ item = PGSCAN_KSWAPD + reclaimer_offset();
+ if (!cgroup_reclaim(sc)) {
+ __count_vm_events(item, isolated);
+ __count_vm_events(PGREFILL, sorted);
+ }
+ __count_memcg_events(memcg, item, isolated);
+ __count_memcg_events(memcg, PGREFILL, sorted);
+ __count_vm_events(PGSCAN_ANON + type, isolated);
+
+ /*
+ * There might not be eligible pages due to reclaim_idx, may_unmap and
+ * may_writepage. Check the remaining to prevent livelock if it's not
+ * making progress.
+ */
+ return isolated || !remaining ? scanned : 0;
+}
+
+static int get_tier_idx(struct lruvec *lruvec, int type)
+{
+ int tier;
+ struct ctrl_pos sp, pv;
+
+ /*
+ * To leave a margin for fluctuations, use a larger gain factor (1:2).
+ * This value is chosen because any other tier would have at least twice
+ * as many refaults as the first tier.
+ */
+ read_ctrl_pos(lruvec, type, 0, 1, &sp);
+ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+ read_ctrl_pos(lruvec, type, tier, 2, &pv);
+ if (!positive_ctrl_err(&sp, &pv))
+ break;
+ }
+
+ return tier - 1;
+}
+
+static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
+{
+ int type, tier;
+ struct ctrl_pos sp, pv;
+ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
+
+ /*
+ * Compare the first tier of anon with that of file to determine which
+ * type to scan. Also need to compare other tiers of the selected type
+ * with the first tier of the other type to determine the last tier (of
+ * the selected type) to evict.
+ */
+ read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
+ read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
+ type = positive_ctrl_err(&sp, &pv);
+
+ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
+ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+ read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
+ if (!positive_ctrl_err(&sp, &pv))
+ break;
+ }
+
+ *tier_idx = tier - 1;
+
+ return type;
+}
+
+static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ int *type_scanned, struct list_head *list)
+{
+ int i;
+ int type;
+ int scanned;
+ int tier = -1;
+ DEFINE_MIN_SEQ(lruvec);
+
+ /*
+ * Try to make the obvious choice first. When anon and file are both
+ * available from the same generation, interpret swappiness 1 as file
+ * first and 200 as anon first.
+ */
+ if (!swappiness)
+ type = LRU_GEN_FILE;
+ else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
+ type = LRU_GEN_ANON;
+ else if (swappiness == 1)
+ type = LRU_GEN_FILE;
+ else if (swappiness == 200)
+ type = LRU_GEN_ANON;
+ else
+ type = get_type_to_scan(lruvec, swappiness, &tier);
+
+ for (i = !swappiness; i < ANON_AND_FILE; i++) {
+ if (tier < 0)
+ tier = get_tier_idx(lruvec, type);
+
+ scanned = scan_folios(lruvec, sc, type, tier, list);
+ if (scanned)
+ break;
+
+ type = !type;
+ tier = -1;
+ }
+
+ *type_scanned = type;
+
+ return scanned;
+}
+
+static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ bool *need_swapping)
+{
+ int type;
+ int scanned;
+ int reclaimed;
+ LIST_HEAD(list);
+ LIST_HEAD(clean);
+ struct folio *folio;
+ struct folio *next;
+ enum vm_event_item item;
+ struct reclaim_stat stat;
+ struct lru_gen_mm_walk *walk;
+ bool skip_retry = false;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ scanned = isolate_folios(lruvec, sc, swappiness, &type, &list);
+
+ scanned += try_to_inc_min_seq(lruvec, swappiness);
+
+ if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
+ scanned = 0;
+
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ if (list_empty(&list))
+ return scanned;
+retry:
+ reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+ sc->nr_reclaimed += reclaimed;
+
+ list_for_each_entry_safe_reverse(folio, next, &list, lru) {
+ if (!folio_evictable(folio)) {
+ list_del(&folio->lru);
+ folio_putback_lru(folio);
+ continue;
+ }
+
+ if (folio_test_reclaim(folio) &&
+ (folio_test_dirty(folio) || folio_test_writeback(folio))) {
+ /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
+ if (folio_test_workingset(folio))
+ folio_set_referenced(folio);
+ continue;
+ }
+
+ if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
+ folio_mapped(folio) || folio_test_locked(folio) ||
+ folio_test_dirty(folio) || folio_test_writeback(folio)) {
+ /* don't add rejected folios to the oldest generation */
+ set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
+ BIT(PG_active));
+ continue;
+ }
+
+ /* retry folios that may have missed folio_rotate_reclaimable() */
+ list_move(&folio->lru, &clean);
+ sc->nr_scanned -= folio_nr_pages(folio);
+ }
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ move_folios_to_lru(lruvec, &list);
+
+ walk = current->reclaim_state->mm_walk;
+ if (walk && walk->batched)
+ reset_batch_size(lruvec, walk);
+
+ item = PGSTEAL_KSWAPD + reclaimer_offset();
+ if (!cgroup_reclaim(sc))
+ __count_vm_events(item, reclaimed);
+ __count_memcg_events(memcg, item, reclaimed);
+ __count_vm_events(PGSTEAL_ANON + type, reclaimed);
+
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ mem_cgroup_uncharge_list(&list);
+ free_unref_page_list(&list);
+
+ INIT_LIST_HEAD(&list);
+ list_splice_init(&clean, &list);
+
+ if (!list_empty(&list)) {
+ skip_retry = true;
+ goto retry;
+ }
+
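+ /* tell the caller anon pages were just evicted so it can guard against over-swapping */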
+ if (need_swapping && type == LRU_GEN_ANON)
+ *need_swapping = true;
+
+ return scanned;
+}
+
+/*
+ * For future optimizations:
+ * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
+ * reclaim.
+ */
+static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
+ bool can_swap, bool *need_aging)
+{
+ unsigned long nr_to_scan;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) ||
+ (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) &&
+ !sc->memcg_low_reclaim))
+ return 0;
+
+ *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
+ if (!*need_aging)
+ return nr_to_scan;
+
+ /* skip the aging path at the default priority */
+ if (sc->priority == DEF_PRIORITY)
+ goto done;
+
+ /* leave the work to lru_gen_age_node() */
+ if (current_is_kswapd())
+ return 0;
+
+ if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
+ return nr_to_scan;
+done:
+ return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
+}
+
+static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
+ struct scan_control *sc, bool need_swapping)
+{
+ int i;
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (!current_is_kswapd()) {
+ /* age each memcg at most once to ensure fairness */
+ if (max_seq - seq > 1)
+ return true;
+
+ /* over-swapping can increase allocation latency */
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
+ return true;
+
+ /* give this thread a chance to exit and free its memory */
+ if (fatal_signal_pending(current)) {
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+ return true;
+ }
+
+ if (cgroup_reclaim(sc))
+ return false;
+ } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
+ return false;
+
+ /* keep scanning at low priorities to ensure fairness */
+ if (sc->priority > DEF_PRIORITY - 2)
+ return false;
+
+ /*
+ * A minimum amount of work was done under global memory pressure. For
+ * kswapd, it may be overshooting. For direct reclaim, the allocation
+ * may succeed if all suitable zones are somewhat safe. In either case,
+ * it's better to stop now, and restart later if necessary.
+ */
+ for (i = 0; i <= sc->reclaim_idx; i++) {
+ unsigned long wmark;
+ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
+ if (wmark > zone_page_state(zone, NR_FREE_PAGES))
+ return false;
+ }
+
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+
+ return true;
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ struct blk_plug plug;
+ bool need_aging = false;
+ bool need_swapping = false;
+ unsigned long scanned = 0;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ DEFINE_MAX_SEQ(lruvec);
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(lruvec_pgdat(lruvec));
+
+ while (true) {
+ int delta;
+ int swappiness;
+ unsigned long nr_to_scan;
+
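+ /* for global reclaim, allow anon scanning as a last resort even if may_swap is clear */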
+ if (sc->may_swap)
+ swappiness = get_swappiness(lruvec, sc);
+ else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
+ swappiness = 1;
+ else
+ swappiness = 0;
+
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
+ if (!nr_to_scan)
+ goto done;
+
+ delta = evict_folios(lruvec, sc, swappiness, &need_swapping);
+ if (!delta)
+ goto done;
+
+ scanned += delta;
+ if (scanned >= nr_to_scan)
+ break;
+
+ if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
+ break;
+
+ cond_resched();
+ }
+
+ /* see the comment in lru_gen_age_node() */
+ if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
+ sc->memcgs_need_aging = false;
+done:
+ clear_mm_walk();
+
+ blk_finish_plug(&plug);
+}
+
+/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ if (lrugen->enabled) {
+ enum lru_list lru;
+
+ for_each_evictable_lru(lru) {
+ if (!list_empty(&lruvec->lists[lru]))
+ return false;
+ }
+ } else {
+ int gen, type, zone;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ if (!list_empty(&lrugen->lists[gen][type][zone]))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool fill_evictable(struct lruvec *lruvec)
+{
+ enum lru_list lru;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_evictable_lru(lru) {
+ int type = is_file_lru(lru);
+ bool active = is_active_lru(lru);
+ struct list_head *head = &lruvec->lists[lru];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio);
+
+ lruvec_del_folio(lruvec, folio);
+ success = lru_gen_add_folio(lruvec, folio, false);
+ VM_WARN_ON_ONCE(!success);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool drain_evictable(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+
+ success = lru_gen_del_folio(lruvec, folio, false);
+ VM_WARN_ON_ONCE(!success);
+ lruvec_add_folio(lruvec, folio);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void lru_gen_change_state(bool enabled)
+{
+ static DEFINE_MUTEX(state_mutex);
+
+ struct mem_cgroup *memcg;
+
+ cgroup_lock();
+ cpus_read_lock();
+ get_online_mems();
+ mutex_lock(&state_mutex);
+
+ if (enabled == lru_gen_enabled())
+ goto unlock;
+
+ if (enabled)
+ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+ else
+ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+ VM_WARN_ON_ONCE(!state_is_valid(lruvec));
+
+ lruvec->lrugen.enabled = enabled;
+
+ while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+ mutex_unlock(&state_mutex);
+ put_online_mems();
+ cpus_read_unlock();
+ cgroup_unlock();
+}
+
+/******************************************************************************
+ * sysfs interface
+ ******************************************************************************/
+
+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
+}
+
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned int msecs;
+
+ if (kstrtouint(buf, 0, &msecs))
+ return -EINVAL;
+
+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl
+);
+
+static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ unsigned int caps = 0;
+
+ if (get_cap(LRU_GEN_CORE))
+ caps |= BIT(LRU_GEN_CORE);
+
+ if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
+ caps |= BIT(LRU_GEN_MM_WALK);
+
+ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
+
+ return sysfs_emit(buf, "0x%04x\n", caps);
+}
+
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
+static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ int i;
+ unsigned int caps;
+
+ if (tolower(*buf) == 'n')
+ caps = 0;
+ else if (tolower(*buf) == 'y')
+ caps = -1;
+ else if (kstrtouint(buf, 0, &caps))
+ return -EINVAL;
+
+ for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
+ bool enabled = caps & BIT(i);
+
+ if (i == LRU_GEN_CORE)
+ lru_gen_change_state(enabled);
+ else if (enabled)
+ static_branch_enable(&lru_gen_caps[i]);
+ else
+ static_branch_disable(&lru_gen_caps[i]);
+ }
+
+ return len;
+}
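+
+/*
+ * Illustrative values for the "enabled" file (bit order follows enum
+ * lru_gen_caps): "y"/"n" set or clear all capabilities, while e.g.
+ * "0x0001" would enable only LRU_GEN_CORE and "0x0003" LRU_GEN_CORE
+ * plus LRU_GEN_MM_WALK.
+ */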
+
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
+ enabled, 0644, show_enabled, store_enabled
+);
+
+static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_min_ttl_attr.attr,
+ &lru_gen_enabled_attr.attr,
+ NULL
+};
+
+static struct attribute_group lru_gen_attr_group = {
+ .name = "lru_gen",
+ .attrs = lru_gen_attrs,
+};
+
+/******************************************************************************
+ * debugfs interface
+ ******************************************************************************/
+
+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct mem_cgroup *memcg;
+ loff_t nr_to_skip = *pos;
+
+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
+ if (!m->private)
+ return ERR_PTR(-ENOMEM);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ if (!nr_to_skip--)
+ return get_lruvec(memcg, nid);
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ return NULL;
+}
+
+static void lru_gen_seq_stop(struct seq_file *m, void *v)
+{
+ if (!IS_ERR_OR_NULL(v))
+ mem_cgroup_iter_break(NULL, lruvec_memcg(v));
+
+ kvfree(m->private);
+ m->private = NULL;
+}
+
+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int nid = lruvec_pgdat(v)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(v);
+
+ ++*pos;
+
+ nid = next_memory_node(nid);
+ if (nid == MAX_NUMNODES) {
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ if (!memcg)
+ return NULL;
+
+ nid = first_memory_node;
+ }
+
+ return get_lruvec(memcg, nid);
+}
+
+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
+ unsigned long max_seq, unsigned long *min_seq,
+ unsigned long seq)
+{
+ int i;
+ int type, tier;
+ int hist = lru_hist_from_seq(seq);
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ seq_printf(m, " %10d", tier);
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ const char *s = " ";
+ unsigned long n[3] = {};
+
+ if (seq == max_seq) {
+ s = "RT ";
+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
+ s = "rep";
+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+ }
+
+ for (i = 0; i < 3; i++)
+ seq_printf(m, " %10lu%c", n[i], s[i]);
+ }
+ seq_putc(m, '\n');
+ }
+
+ seq_puts(m, " ");
+ for (i = 0; i < NR_MM_STATS; i++) {
+ const char *s = " ";
+ unsigned long n = 0;
+
+ if (seq == max_seq && NR_HIST_GENS == 1) {
+ s = "LOYNFA";
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ } else if (seq != max_seq && NR_HIST_GENS > 1) {
+ s = "loynfa";
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ }
+
+ seq_printf(m, " %10lu%c", n, s[i]);
+ }
+ seq_putc(m, '\n');
+}
+
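+/*
+ * For each memcg and node, the read side prints a "memcg <id> <path>"
+ * header, a " node <nid>" line, and one line per generation with its
+ * sequence number, age in milliseconds and anon/file page counts; the
+ * "full" variant also dumps the per-tier refault/eviction counters and
+ * the mm walker stats via lru_gen_seq_show_full() above.
+ */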
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
+static int lru_gen_seq_show(struct seq_file *m, void *v)
+{
+ unsigned long seq;
+ bool full = !debugfs_real_fops(m->file)->write;
+ struct lruvec *lruvec = v;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int nid = lruvec_pgdat(lruvec)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (nid == first_memory_node) {
+ const char *path = memcg ? m->private : "";
+
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
+#endif
+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
+ }
+
+ seq_printf(m, " node %5d\n", nid);
+
+ if (!full)
+ seq = min_seq[LRU_GEN_ANON];
+ else if (max_seq >= MAX_NR_GENS)
+ seq = max_seq - MAX_NR_GENS + 1;
+ else
+ seq = 0;
+
+ for (; seq <= max_seq; seq++) {
+ int type, zone;
+ int gen = lru_gen_from_seq(seq);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ unsigned long size = 0;
+ char mark = full && seq < min_seq[type] ? 'x' : ' ';
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ seq_printf(m, " %10lu%c", size, mark);
+ }
+
+ seq_putc(m, '\n');
+
+ if (full)
+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations lru_gen_seq_ops = {
+ .start = lru_gen_seq_start,
+ .stop = lru_gen_seq_stop,
+ .next = lru_gen_seq_next,
+ .show = lru_gen_seq_show,
+};
+
+static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ bool can_swap, bool force_scan)
+{
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < max_seq)
+ return 0;
+
+ if (seq > max_seq)
+ return -EINVAL;
+
+ if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
+ return -ERANGE;
+
+ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
+
+ return 0;
+}
+
+static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ int swappiness, unsigned long nr_to_reclaim)
+{
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (seq + MIN_NR_GENS > max_seq)
+ return -EINVAL;
+
+ sc->nr_reclaimed = 0;
+
+ while (!signal_pending(current)) {
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < min_seq[!swappiness])
+ return 0;
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ return 0;
+
+ if (!evict_folios(lruvec, sc, swappiness, NULL))
+ return 0;
+
+ cond_resched();
+ }
+
+ return -EINTR;
+}
+
+static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
+ struct scan_control *sc, int swappiness, unsigned long opt)
+{
+ struct lruvec *lruvec;
+ int err = -EINVAL;
+ struct mem_cgroup *memcg = NULL;
+
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
+ return -EINVAL;
+
+ if (!mem_cgroup_disabled()) {
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(memcg_id);
+#ifdef CONFIG_MEMCG
+ if (memcg && !css_tryget(&memcg->css))
+ memcg = NULL;
+#endif
+ rcu_read_unlock();
+
+ if (!memcg)
+ return -EINVAL;
+ }
+
+ if (memcg_id != mem_cgroup_id(memcg))
+ goto done;
+
+ lruvec = get_lruvec(memcg, nid);
+
+ if (swappiness < 0)
+ swappiness = get_swappiness(lruvec, sc);
+ else if (swappiness > 200)
+ goto done;
+
+ switch (cmd) {
+ case '+':
+ err = run_aging(lruvec, seq, sc, swappiness, opt);
+ break;
+ case '-':
+ err = run_eviction(lruvec, seq, sc, swappiness, opt);
+ break;
+ }
+done:
+ mem_cgroup_put(memcg);
+
+ return err;
+}
+
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
+ size_t len, loff_t *pos)
+{
+ void *buf;
+ char *cur, *next;
+ unsigned int flags;
+ struct blk_plug plug;
+ int err = -EINVAL;
+ struct scan_control sc = {
+ .may_writepage = true,
+ .may_unmap = true,
+ .may_swap = true,
+ .reclaim_idx = MAX_NR_ZONES - 1,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ buf = kvmalloc(len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, src, len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ flags = memalloc_noreclaim_save();
+ blk_start_plug(&plug);
+ if (!set_mm_walk(NULL)) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ next = buf;
+ next[len] = '\0';
+
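+ /*
+ * Each command is "cmd memcg_id node_id seq [swappiness [opt]]", e.g.
+ * (hypothetical IDs) "+ 3 0 1024" creates a new generation if max_seq
+ * is currently 1024, and "- 3 0 1021 50 128" evicts generations up to
+ * seq 1021 with swappiness 50, reclaiming at most 128 pages.
+ */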
+ while ((cur = strsep(&next, ",;\n"))) {
+ int n;
+ int end;
+ char cmd;
+ unsigned int memcg_id;
+ unsigned int nid;
+ unsigned long seq;
+ unsigned int swappiness = -1;
+ unsigned long opt = -1;
+
+ cur = skip_spaces(cur);
+ if (!*cur)
+ continue;
+
+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, &swappiness, &end, &opt, &end);
+ if (n < 4 || cur[end]) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
+ if (err)
+ break;
+ }
+done:
+ clear_mm_walk();
+ blk_finish_plug(&plug);
+ memalloc_noreclaim_restore(flags);
+ set_task_reclaim_state(current, NULL);
+
+ kvfree(buf);
+
+ return err ? : len;
+}
+
+static int lru_gen_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &lru_gen_seq_ops);
+}
+
+static const struct file_operations lru_gen_rw_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .write = lru_gen_seq_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations lru_gen_ro_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/******************************************************************************
+ * initialization
+ ******************************************************************************/
+
+void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+ int i;
+ int gen, type, zone;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ lrugen->max_seq = MIN_NR_GENS + 1;
+ lrugen->enabled = lru_gen_enabled();
+
+ for (i = 0; i <= MIN_NR_GENS + 1; i++)
+ lrugen->timestamps[i] = jiffies;
+
+ for_each_gen_type_zone(gen, type, zone)
+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+
+ lruvec->mm_state.seq = MIN_NR_GENS;
+ init_waitqueue_head(&lruvec->mm_state.wait);
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+ INIT_LIST_HEAD(&memcg->mm_list.fifo);
+ spin_lock_init(&memcg->mm_list.lock);
+}
+
+void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+{
+ int i;
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
+ sizeof(lruvec->lrugen.nr_pages)));
+
+ for (i = 0; i < NR_BLOOM_FILTERS; i++) {
+ bitmap_free(lruvec->mm_state.filters[i]);
+ lruvec->mm_state.filters[i] = NULL;
+ }
+ }
+}
+#endif
+
+static int __init init_lru_gen(void)
+{
+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+
+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+ pr_err("lru_gen: failed to create sysfs group\n");
+
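+ /*
+ * With a NULL parent these land at the debugfs root, typically
+ * /sys/kernel/debug/lru_gen (read-write) and
+ * /sys/kernel/debug/lru_gen_full (read-only).
+ */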
+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+
+ return 0;
+}
+late_initcall(init_lru_gen);
+
+#else /* !CONFIG_LRU_GEN */
+
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
@@ -2955,8 +5910,13 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
enum lru_list lru;
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ bool proportional_reclaim;
struct blk_plug plug;
- bool scan_adjusted;
+
+ if (lru_gen_enabled()) {
+ lru_gen_shrink_lruvec(lruvec, sc);
+ return;
+ }
get_scan_count(lruvec, sc, nr);
@@ -2974,8 +5934,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* abort proportional reclaim if either the file or anon lru has already
* dropped to zero at the first pass.
*/
- scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
- sc->priority == DEF_PRIORITY);
+ proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
+ sc->priority == DEF_PRIORITY);
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2995,7 +5955,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
cond_resched();
- if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
+ if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
continue;
/*
@@ -3046,8 +6006,6 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
nr_scanned = targets[lru] - nr[lru];
nr[lru] = targets[lru] * (100 - percentage) / 100;
nr[lru] -= min(nr[lru], nr_scanned);
-
- scan_adjusted = true;
}
blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
@@ -3154,13 +6112,13 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
mem_cgroup_calculate_protection(target_memcg, memcg);
- if (mem_cgroup_below_min(memcg)) {
+ if (mem_cgroup_below_min(target_memcg, memcg)) {
/*
* Hard protection.
* If there is no reclaimable memory, OOM.
*/
continue;
- } else if (mem_cgroup_below_low(memcg)) {
+ } else if (mem_cgroup_below_low(target_memcg, memcg)) {
/*
* Soft protection.
* Respect the protection only as long as
@@ -3197,109 +6155,16 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
unsigned long nr_reclaimed, nr_scanned;
struct lruvec *target_lruvec;
bool reclaimable = false;
- unsigned long file;
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again:
- /*
- * Flush the memory cgroup stats, so that we read accurate per-memcg
- * lruvec stats for heuristics.
- */
- mem_cgroup_flush_stats();
-
memset(&sc->nr, 0, sizeof(sc->nr));
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- /*
- * Determine the scan balance between anon and file LRUs.
- */
- spin_lock_irq(&target_lruvec->lru_lock);
- sc->anon_cost = target_lruvec->anon_cost;
- sc->file_cost = target_lruvec->file_cost;
- spin_unlock_irq(&target_lruvec->lru_lock);
-
- /*
- * Target desirable inactive:active list ratios for the anon
- * and file LRU lists.
- */
- if (!sc->force_deactivate) {
- unsigned long refaults;
-
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_ANON);
- if (refaults != target_lruvec->refaults[0] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
- sc->may_deactivate |= DEACTIVATE_ANON;
- else
- sc->may_deactivate &= ~DEACTIVATE_ANON;
-
- /*
- * When refaults are being observed, it means a new
- * workingset is being established. Deactivate to get
- * rid of any stale active pages quickly.
- */
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_FILE);
- if (refaults != target_lruvec->refaults[1] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
- sc->may_deactivate |= DEACTIVATE_FILE;
- else
- sc->may_deactivate &= ~DEACTIVATE_FILE;
- } else
- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
-
- /*
- * If we have plenty of inactive file pages that aren't
- * thrashing, try to reclaim those first before touching
- * anonymous pages.
- */
- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
- sc->cache_trim_mode = 1;
- else
- sc->cache_trim_mode = 0;
-
- /*
- * Prevent the reclaimer from falling into the cache trap: as
- * cache pages start out inactive, every cache fault will tip
- * the scan balance towards the file LRU. And as the file LRU
- * shrinks, so does the window for rotation from references.
- * This means we have a runaway feedback loop where a tiny
- * thrashing file LRU becomes infinitely more attractive than
- * anon pages. Try to detect this based on file LRU size.
- */
- if (!cgroup_reclaim(sc)) {
- unsigned long total_high_wmark = 0;
- unsigned long free, anon;
- int z;
-
- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
- file = node_page_state(pgdat, NR_ACTIVE_FILE) +
- node_page_state(pgdat, NR_INACTIVE_FILE);
-
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = &pgdat->node_zones[z];
- if (!managed_zone(zone))
- continue;
-
- total_high_wmark += high_wmark_pages(zone);
- }
-
- /*
- * Consider anon: if that's low too, this isn't a
- * runaway file reclaim problem, but rather just
- * extreme pressure. Reclaim as per usual then.
- */
- anon = node_page_state(pgdat, NR_INACTIVE_ANON);
-
- sc->file_is_tiny =
- file + free <= total_high_wmark &&
- !(sc->may_deactivate & DEACTIVATE_ANON) &&
- anon >> sc->priority;
- }
+ prepare_scan_count(pgdat, sc);
shrink_node_memcgs(pgdat, sc);
@@ -3557,11 +6422,14 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
struct lruvec *target_lruvec;
unsigned long refaults;
+ if (lru_gen_enabled())
+ return;
+
target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
- target_lruvec->refaults[0] = refaults;
+ target_lruvec->refaults[WORKINGSET_ANON] = refaults;
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
- target_lruvec->refaults[1] = refaults;
+ target_lruvec->refaults[WORKINGSET_FILE] = refaults;
}
/*
@@ -3886,7 +6754,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
- unsigned int reclaim_options)
+ unsigned int reclaim_options,
+ nodemask_t *nodemask)
{
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
@@ -3901,6 +6770,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
+ .nodemask = nodemask,
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
@@ -3923,12 +6793,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
}
#endif
-static void age_active_anon(struct pglist_data *pgdat,
- struct scan_control *sc)
+static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
struct lruvec *lruvec;
+ if (lru_gen_enabled()) {
+ lru_gen_age_node(pgdat, sc);
+ return;
+ }
+
if (!can_age_anon_pages(pgdat, sc))
return;
@@ -4248,12 +7122,11 @@ restart:
sc.may_swap = !nr_boost_reclaim;
/*
- * Do some background aging of the anon list, to give
- * pages a chance to be referenced before reclaiming. All
- * pages are rotated regardless of classzone as this is
- * about consistent aging.
+ * Do some background aging, to give pages a chance to be
+ * referenced before reclaiming. All pages are rotated
+ * regardless of classzone as this is about consistent aging.
*/
- age_active_anon(pgdat, &sc);
+ kswapd_age_node(pgdat, &sc);
/*
* If we're getting trouble reclaiming, start doing writepage
@@ -4643,16 +7516,17 @@ void kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
- if (pgdat->kswapd)
- return;
-
- pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
- if (IS_ERR(pgdat->kswapd)) {
- /* failure at boot is fatal */
- BUG_ON(system_state < SYSTEM_RUNNING);
- pr_err("Failed to start kswapd on node %d\n", nid);
- pgdat->kswapd = NULL;
+ pgdat_kswapd_lock(pgdat);
+ if (!pgdat->kswapd) {
+ pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+ if (IS_ERR(pgdat->kswapd)) {
+ /* failure at boot is fatal */
+ BUG_ON(system_state < SYSTEM_RUNNING);
+ pr_err("Failed to start kswapd on node %d\n", nid);
+ pgdat->kswapd = NULL;
+ }
}
+ pgdat_kswapd_unlock(pgdat);
}
/*
@@ -4661,12 +7535,16 @@ void kswapd_run(int nid)
*/
void kswapd_stop(int nid)
{
- struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
+ pg_data_t *pgdat = NODE_DATA(nid);
+ struct task_struct *kswapd;
+ pgdat_kswapd_lock(pgdat);
+ kswapd = pgdat->kswapd;
if (kswapd) {
kthread_stop(kswapd);
- NODE_DATA(nid)->kswapd = NULL;
+ pgdat->kswapd = NULL;
}
+ pgdat_kswapd_unlock(pgdat);
}
static int __init kswapd_init(void)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 90af9a8572f5..1ea6a5ce1c41 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -28,7 +28,6 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
-#include <linux/migrate.h>
#include "internal.h"
@@ -355,8 +354,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
* CPU migrations and preemption potentially corrupts a counter so
* disable preemption.
*/
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
x = delta + __this_cpu_read(*p);
@@ -368,8 +366,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
}
__this_cpu_write(*p, x);
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_zone_page_state);
@@ -393,8 +390,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
}
/* See __mod_node_page_state */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
x = delta + __this_cpu_read(*p);
@@ -406,8 +402,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
}
__this_cpu_write(*p, x);
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_node_page_state);
@@ -441,8 +436,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
s8 v, t;
/* See __mod_node_page_state */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -453,8 +447,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
__this_cpu_write(*p, -overstep);
}
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -466,8 +459,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
/* See __mod_node_page_state */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -478,8 +470,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
__this_cpu_write(*p, -overstep);
}
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -501,8 +492,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
s8 v, t;
/* See __mod_node_page_state */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -513,8 +503,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
__this_cpu_write(*p, overstep);
}
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -526,8 +515,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
/* See __mod_node_page_state */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -538,8 +526,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
__this_cpu_write(*p, overstep);
}
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -1247,11 +1234,13 @@ const char * const vmstat_text[] = {
"nr_shadow_call_stack",
#endif
"nr_page_table_pages",
+ "nr_sec_page_table_pages",
#ifdef CONFIG_SWAP
"nr_swapcached",
#endif
#ifdef CONFIG_NUMA_BALANCING
"pgpromote_success",
+ "pgpromote_candidate",
#endif
/* enum writeback_stat_item counters */
@@ -1282,10 +1271,13 @@ const char * const vmstat_text[] = {
"pgreuse",
"pgsteal_kswapd",
"pgsteal_direct",
+ "pgsteal_khugepaged",
"pgdemote_kswapd",
"pgdemote_direct",
+ "pgdemote_khugepaged",
"pgscan_kswapd",
"pgscan_direct",
+ "pgscan_khugepaged",
"pgscan_direct_throttle",
"pgscan_anon",
"pgscan_file",
@@ -1389,10 +1381,6 @@ const char * const vmstat_text[] = {
"nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */
-#ifdef CONFIG_DEBUG_VM_VMACACHE
- "vmacache_find_calls",
- "vmacache_find_hits",
-#endif
#ifdef CONFIG_SWAP
"swap_ra",
"swap_ra_hit",
@@ -2067,7 +2055,6 @@ static int vmstat_cpu_online(unsigned int cpu)
if (!node_state(cpu_to_node(cpu), N_CPU)) {
node_set_state(cpu_to_node(cpu), N_CPU);
- set_migration_target_nodes();
}
return 0;
@@ -2092,7 +2079,6 @@ static int vmstat_cpu_dead(unsigned int cpu)
return 0;
node_clear_state(node, N_CPU);
- set_migration_target_nodes();
return 0;
}
@@ -2125,7 +2111,6 @@ void __init init_mm_internals(void)
start_shepherd_timer();
#endif
- migrate_on_reclaim_init();
#ifdef CONFIG_PROC_FS
proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
diff --git a/mm/workingset.c b/mm/workingset.c
index a5e84862fc86..1a86645b7b3c 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_mostly;
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
bool workingset)
{
- eviction >>= bucket_order;
eviction &= EVICTION_MASK;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
@@ -212,10 +211,107 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*memcgidp = memcgid;
*pgdat = NODE_DATA(nid);
- *evictionp = entry << bucket_order;
+ *evictionp = entry;
*workingsetp = workingset;
}
+#ifdef CONFIG_LRU_GEN
+
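+/*
+ * Under lru_gen, a shadow entry encodes the evicted folio's generation
+ * number and reference count rather than the nonresident LRU age:
+ * roughly token = (min_seq << LRU_REFS_WIDTH) | (refs - 1), which
+ * pack_shadow() then combines with the memcg ID and node ID (hence the
+ * BUILD_BUG_ON on the available bits in lru_gen_eviction() below).
+ */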
+static void *lru_gen_eviction(struct folio *folio)
+{
+ int hist;
+ unsigned long token;
+ unsigned long min_seq;
+ struct lruvec *lruvec;
+ struct lru_gen_struct *lrugen;
+ int type = folio_is_file_lru(folio);
+ int delta = folio_nr_pages(folio);
+ int refs = folio_lru_refs(folio);
+ int tier = lru_tier_from_refs(refs);
+ struct mem_cgroup *memcg = folio_memcg(folio);
+ struct pglist_data *pgdat = folio_pgdat(folio);
+
+ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ lrugen = &lruvec->lrugen;
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
+ token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
+
+ hist = lru_hist_from_seq(min_seq);
+ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
+
+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
+}
+
+static void lru_gen_refault(struct folio *folio, void *shadow)
+{
+ int hist, tier, refs;
+ int memcg_id;
+ bool workingset;
+ unsigned long token;
+ unsigned long min_seq;
+ struct lruvec *lruvec;
+ struct lru_gen_struct *lrugen;
+ struct mem_cgroup *memcg;
+ struct pglist_data *pgdat;
+ int type = folio_is_file_lru(folio);
+ int delta = folio_nr_pages(folio);
+
+ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
+
+ if (pgdat != folio_pgdat(folio))
+ return;
+
+ rcu_read_lock();
+
+ memcg = folio_memcg_rcu(folio);
+ if (memcg_id != mem_cgroup_id(memcg))
+ goto unlock;
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ lrugen = &lruvec->lrugen;
+
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
+ if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
+ goto unlock;
+
+ hist = lru_hist_from_seq(min_seq);
+ /* see the comment in folio_lru_refs() */
+ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
+ tier = lru_tier_from_refs(refs);
+
+ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
+
+ /*
+ * Count the following two cases as stalls:
+ * 1. For pages accessed through page tables, hotter pages pushed out
+ * hot pages which refaulted immediately.
+ * 2. For pages accessed multiple times through file descriptors,
+ * the number of accesses might have been out of range.
+ */
+ if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
+ folio_set_workingset(folio);
+ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
+ }
+unlock:
+ rcu_read_unlock();
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static void *lru_gen_eviction(struct folio *folio)
+{
+ return NULL;
+}
+
+static void lru_gen_refault(struct folio *folio, void *shadow)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
/**
* workingset_age_nonresident - age non-resident entries as LRU ages
* @lruvec: the lruvec that was aged
@@ -264,10 +360,14 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ if (lru_gen_enabled())
+ return lru_gen_eviction(folio);
+
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
eviction = atomic_long_read(&lruvec->nonresident_age);
+ eviction >>= bucket_order;
workingset_age_nonresident(lruvec, folio_nr_pages(folio));
return pack_shadow(memcgid, pgdat, eviction,
folio_test_workingset(folio));
@@ -298,7 +398,13 @@ void workingset_refault(struct folio *folio, void *shadow)
int memcgid;
long nr;
+ if (lru_gen_enabled()) {
+ lru_gen_refault(folio, shadow);
+ return;
+ }
+
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+ eviction <<= bucket_order;
rcu_read_lock();
/*
@@ -386,8 +492,11 @@ void workingset_refault(struct folio *folio, void *shadow)
/* Folio was active prior to eviction */
if (workingset) {
folio_set_workingset(folio);
- /* XXX: Move to lru_cache_add() when it supports new vs putback */
- lru_note_cost_folio(folio);
+ /*
+ * XXX: Move to folio_add_lru() when it supports new vs
+ * putback
+ */
+ lru_note_cost_refault(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
}
out:
diff --git a/mm/z3fold.c b/mm/z3fold.c
index cf71da10d04e..a4de0c317ac7 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -68,9 +68,6 @@
* Structures
*****************/
struct z3fold_pool;
-struct z3fold_ops {
- int (*evict)(struct z3fold_pool *pool, unsigned long handle);
-};
enum buddy {
HEADLESS = 0,
@@ -138,8 +135,6 @@ struct z3fold_header {
* @stale: list of pages marked for freeing
* @pages_nr: number of z3fold pages in the pool.
* @c_handle: cache for z3fold_buddy_slots allocation
- * @ops: pointer to a structure of user defined operations specified at
- * pool creation time.
* @zpool: zpool driver
* @zpool_ops: zpool operations structure with an evict callback
* @compact_wq: workqueue for page layout background optimization
@@ -158,7 +153,6 @@ struct z3fold_pool {
struct list_head stale;
atomic64_t pages_nr;
struct kmem_cache *c_handle;
- const struct z3fold_ops *ops;
struct zpool *zpool;
const struct zpool_ops *zpool_ops;
struct workqueue_struct *compact_wq;
@@ -907,13 +901,11 @@ out_fail:
* z3fold_create_pool() - create a new z3fold pool
* @name: pool name
* @gfp: gfp flags when allocating the z3fold pool structure
- * @ops: user-defined operations for the z3fold pool
*
* Return: pointer to the new z3fold pool or NULL if the metadata allocation
* failed.
*/
-static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
- const struct z3fold_ops *ops)
+static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp)
{
struct z3fold_pool *pool = NULL;
int i, cpu;
@@ -949,7 +941,6 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
if (!pool->release_wq)
goto out_wq;
INIT_WORK(&pool->work, free_pages_work);
- pool->ops = ops;
return pool;
out_wq:
@@ -1230,10 +1221,6 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE);
spin_lock(&pool->lock);
- if (!pool->ops || !pool->ops->evict || retries == 0) {
- spin_unlock(&pool->lock);
- return -EINVAL;
- }
for (i = 0; i < retries; i++) {
if (list_empty(&pool->lru)) {
spin_unlock(&pool->lock);
@@ -1319,17 +1306,17 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
}
/* Issue the eviction callback(s) */
if (middle_handle) {
- ret = pool->ops->evict(pool, middle_handle);
+ ret = pool->zpool_ops->evict(pool->zpool, middle_handle);
if (ret)
goto next;
}
if (first_handle) {
- ret = pool->ops->evict(pool, first_handle);
+ ret = pool->zpool_ops->evict(pool->zpool, first_handle);
if (ret)
goto next;
}
if (last_handle) {
- ret = pool->ops->evict(pool, last_handle);
+ ret = pool->zpool_ops->evict(pool->zpool, last_handle);
if (ret)
goto next;
}
@@ -1593,26 +1580,13 @@ static const struct movable_operations z3fold_mops = {
* zpool
****************/
-static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
-{
- if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
- return pool->zpool_ops->evict(pool->zpool, handle);
- else
- return -ENOENT;
-}
-
-static const struct z3fold_ops z3fold_zpool_ops = {
- .evict = z3fold_zpool_evict
-};
-
static void *z3fold_zpool_create(const char *name, gfp_t gfp,
const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
struct z3fold_pool *pool;
- pool = z3fold_create_pool(name, gfp,
- zpool_ops ? &z3fold_zpool_ops : NULL);
+ pool = z3fold_create_pool(name, gfp);
if (pool) {
pool->zpool = zpool;
pool->zpool_ops = zpool_ops;
diff --git a/mm/zbud.c b/mm/zbud.c
index 6348932430b8..3acd26193920 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -74,10 +74,6 @@
struct zbud_pool;
-struct zbud_ops {
- int (*evict)(struct zbud_pool *pool, unsigned long handle);
-};
-
/**
* struct zbud_pool - stores metadata for each zbud pool
* @lock: protects all pool fields and first|last_chunk fields of any
@@ -90,8 +86,6 @@ struct zbud_ops {
* @lru: list tracking the zbud pages in LRU order by most recently
* added buddy.
* @pages_nr: number of zbud pages in the pool.
- * @ops: pointer to a structure of user defined operations specified at
- * pool creation time.
* @zpool: zpool driver
* @zpool_ops: zpool operations structure with an evict callback
*
@@ -110,7 +104,6 @@ struct zbud_pool {
};
struct list_head lru;
u64 pages_nr;
- const struct zbud_ops *ops;
struct zpool *zpool;
const struct zpool_ops *zpool_ops;
};
@@ -212,12 +205,11 @@ static int num_free_chunks(struct zbud_header *zhdr)
/**
* zbud_create_pool() - create a new zbud pool
* @gfp: gfp flags when allocating the zbud pool structure
- * @ops: user-defined operations for the zbud pool
*
* Return: pointer to the new zbud pool or NULL if the metadata allocation
* failed.
*/
-static struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
+static struct zbud_pool *zbud_create_pool(gfp_t gfp)
{
struct zbud_pool *pool;
int i;
@@ -231,7 +223,6 @@ static struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
INIT_LIST_HEAD(&pool->buddied);
INIT_LIST_HEAD(&pool->lru);
pool->pages_nr = 0;
- pool->ops = ops;
return pool;
}
@@ -419,8 +410,7 @@ static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
unsigned long first_handle = 0, last_handle = 0;
spin_lock(&pool->lock);
- if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
- retries == 0) {
+ if (list_empty(&pool->lru)) {
spin_unlock(&pool->lock);
return -EINVAL;
}
@@ -444,12 +434,12 @@ static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
/* Issue the eviction callback(s) */
if (first_handle) {
- ret = pool->ops->evict(pool, first_handle);
+ ret = pool->zpool_ops->evict(pool->zpool, first_handle);
if (ret)
goto next;
}
if (last_handle) {
- ret = pool->ops->evict(pool, last_handle);
+ ret = pool->zpool_ops->evict(pool->zpool, last_handle);
if (ret)
goto next;
}
@@ -524,25 +514,13 @@ static u64 zbud_get_pool_size(struct zbud_pool *pool)
* zpool
****************/
-static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
-{
- if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
- return pool->zpool_ops->evict(pool->zpool, handle);
- else
- return -ENOENT;
-}
-
-static const struct zbud_ops zbud_zpool_ops = {
- .evict = zbud_zpool_evict
-};
-
static void *zbud_zpool_create(const char *name, gfp_t gfp,
const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
struct zbud_pool *pool;
- pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
+ pool = zbud_create_pool(gfp);
if (pool) {
pool->zpool = zpool;
pool->zpool_ops = zpool_ops;
diff --git a/mm/zpool.c b/mm/zpool.c
index 68facc193496..571f5c5031dd 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -21,9 +21,6 @@
struct zpool {
struct zpool_driver *driver;
void *pool;
- const struct zpool_ops *ops;
- bool evictable;
- bool can_sleep_mapped;
};
static LIST_HEAD(drivers_head);
@@ -177,9 +174,6 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
zpool->driver = driver;
zpool->pool = driver->create(name, gfp, ops, zpool);
- zpool->ops = ops;
- zpool->evictable = driver->shrink && ops && ops->evict;
- zpool->can_sleep_mapped = driver->sleep_mapped;
if (!zpool->pool) {
pr_err("couldn't create %s pool\n", type);
@@ -380,18 +374,25 @@ u64 zpool_get_total_size(struct zpool *zpool)
*/
bool zpool_evictable(struct zpool *zpool)
{
- return zpool->evictable;
+ return zpool->driver->shrink;
}
/**
* zpool_can_sleep_mapped - Test if zpool can sleep when do mapped.
* @zpool: The zpool to test
*
+ * Some allocators enter a non-preemptible context in their ->map() callback
+ * (e.g. they disable pagefaults) and exit it in ->unmap(), which limits what
+ * we can do with the mapped object. For instance, we cannot wait for the
+ * asynchronous crypto API to decompress such an object, nor take mutexes,
+ * since either would call into the scheduler. This function tells us whether
+ * the pool uses such an allocator.
+ *
* Returns: true if zpool can sleep; false otherwise.
*/
bool zpool_can_sleep_mapped(struct zpool *zpool)
{
- return zpool->can_sleep_mapped;
+ return zpool->driver->sleep_mapped;
}
MODULE_LICENSE("GPL");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 907c9b1e1e61..9445bee6b014 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -33,8 +33,7 @@
/*
* lock ordering:
* page_lock
- * pool->migrate_lock
- * class->lock
+ * pool->lock
* zspage->lock
*/
@@ -192,7 +191,6 @@ static const int fullness_threshold_frac = 4;
static size_t huge_class_size;
struct size_class {
- spinlock_t lock;
struct list_head fullness_list[NR_ZS_FULLNESS];
/*
* Size of objects stored in this class. Must be multiple
@@ -241,14 +239,20 @@ struct zs_pool {
/* Compact classes */
struct shrinker shrinker;
+#ifdef CONFIG_ZPOOL
+ /* List tracking the zspages in LRU order by most recently added object */
+ struct list_head lru;
+ struct zpool *zpool;
+ const struct zpool_ops *zpool_ops;
+#endif
+
#ifdef CONFIG_ZSMALLOC_STAT
struct dentry *stat_dentry;
#endif
#ifdef CONFIG_COMPACTION
struct work_struct free_work;
#endif
- /* protect page/zspage migration */
- rwlock_t migrate_lock;
+ spinlock_t lock;
};
struct zspage {
@@ -263,10 +267,17 @@ struct zspage {
unsigned int freeobj;
struct page *first_page;
struct list_head list; /* fullness list */
+
+#ifdef CONFIG_ZPOOL
+ /* links the zspage to the lru list in the pool */
+ struct list_head lru;
+ bool under_reclaim;
+ /* list of unfreed handles whose objects have been reclaimed */
+ unsigned long *deferred_handles;
+#endif
+
struct zs_pool *pool;
-#ifdef CONFIG_COMPACTION
rwlock_t lock;
-#endif
};
struct mapping_area {
@@ -287,10 +298,11 @@ static bool ZsHugePage(struct zspage *zspage)
return zspage->huge;
}
-#ifdef CONFIG_COMPACTION
static void migrate_lock_init(struct zspage *zspage);
static void migrate_read_lock(struct zspage *zspage);
static void migrate_read_unlock(struct zspage *zspage);
+
+#ifdef CONFIG_COMPACTION
static void migrate_write_lock(struct zspage *zspage);
static void migrate_write_lock_nested(struct zspage *zspage);
static void migrate_write_unlock(struct zspage *zspage);
@@ -298,9 +310,6 @@ static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
#else
-static void migrate_lock_init(struct zspage *zspage) {}
-static void migrate_read_lock(struct zspage *zspage) {}
-static void migrate_read_unlock(struct zspage *zspage) {}
static void migrate_write_lock(struct zspage *zspage) {}
static void migrate_write_lock_nested(struct zspage *zspage) {}
static void migrate_write_unlock(struct zspage *zspage) {}
@@ -355,7 +364,7 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
kmem_cache_free(pool->zspage_cachep, zspage);
}
-/* class->lock(which owns the handle) synchronizes races */
+/* pool->lock(which owns the handle) synchronizes races */
static void record_obj(unsigned long handle, unsigned long obj)
{
*(unsigned long *)handle = obj;
@@ -374,7 +383,14 @@ static void *zs_zpool_create(const char *name, gfp_t gfp,
* different contexts and its caller must provide a valid
* gfp mask.
*/
- return zs_create_pool(name);
+ struct zs_pool *pool = zs_create_pool(name);
+
+ if (pool) {
+ pool->zpool = zpool;
+ pool->zpool_ops = zpool_ops;
+ }
+
+ return pool;
}
static void zs_zpool_destroy(void *pool)
@@ -387,7 +403,7 @@ static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
{
*handle = zs_malloc(pool, size, gfp);
- if (IS_ERR((void *)(*handle)))
+ if (IS_ERR_VALUE(*handle))
return PTR_ERR((void *)*handle);
return 0;
}
@@ -396,6 +412,27 @@ static void zs_zpool_free(void *pool, unsigned long handle)
zs_free(pool, handle);
}
+static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries);
+
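+/*
+ * Writeback entry point wired up as the zpool ->shrink op: evict whole
+ * zspages in LRU order until the requested number of reclaims succeeds
+ * or zs_reclaim_page() fails; each call scans up to 8 zspages from the
+ * LRU tail.
+ */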
+static int zs_zpool_shrink(void *pool, unsigned int pages,
+ unsigned int *reclaimed)
+{
+ unsigned int total = 0;
+ int ret = -EINVAL;
+
+ while (total < pages) {
+ ret = zs_reclaim_page(pool, 8);
+ if (ret < 0)
+ break;
+ total++;
+ }
+
+ if (reclaimed)
+ *reclaimed = total;
+
+ return ret;
+}
+
static void *zs_zpool_map(void *pool, unsigned long handle,
enum zpool_mapmode mm)
{
@@ -434,6 +471,7 @@ static struct zpool_driver zs_zpool_driver = {
.malloc_support_movable = true,
.malloc = zs_zpool_malloc,
.free = zs_zpool_free,
+ .shrink = zs_zpool_shrink,
.map = zs_zpool_map,
.unmap = zs_zpool_unmap,
.total_size = zs_zpool_total_size,
@@ -452,7 +490,7 @@ static __maybe_unused int is_first_page(struct page *page)
return PagePrivate(page);
}
-/* Protected by class->lock */
+/* Protected by pool->lock */
static inline int get_zspage_inuse(struct zspage *zspage)
{
return zspage->inuse;
@@ -472,12 +510,12 @@ static inline struct page *get_first_page(struct zspage *zspage)
return first_page;
}
-static inline int get_first_obj_offset(struct page *page)
+static inline unsigned int get_first_obj_offset(struct page *page)
{
return page->page_type;
}
-static inline void set_first_obj_offset(struct page *page, int offset)
+static inline void set_first_obj_offset(struct page *page, unsigned int offset)
{
page->page_type = offset;
}
@@ -597,13 +635,13 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
if (class->index != i)
continue;
- spin_lock(&class->lock);
+ spin_lock(&pool->lock);
class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
obj_used = zs_stat_get(class, OBJ_USED);
freeable = zs_can_compact(class);
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
objs_per_zspage = class->objs_per_zspage;
pages_used = obj_allocated / objs_per_zspage *
@@ -907,6 +945,25 @@ unlock:
return 0;
}
+#ifdef CONFIG_ZPOOL
+/*
+ * Free all the deferred handles whose objects are freed in zs_free.
+ */
+static void free_handles(struct zs_pool *pool, struct zspage *zspage)
+{
+ unsigned long handle = (unsigned long)zspage->deferred_handles;
+
+ while (handle) {
+ unsigned long nxt_handle = handle_to_obj(handle);
+
+ cache_free_handle(pool, handle);
+ handle = nxt_handle;
+ }
+}
+#else
+static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {}
+#endif
+
static void __free_zspage(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
@@ -916,11 +973,14 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
get_zspage_mapping(zspage, &class_idx, &fg);
- assert_spin_locked(&class->lock);
+ assert_spin_locked(&pool->lock);
VM_BUG_ON(get_zspage_inuse(zspage));
VM_BUG_ON(fg != ZS_EMPTY);
+ /* Free all deferred handles from zs_free */
+ free_handles(pool, zspage);
+
next = page = get_first_page(zspage);
do {
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -956,6 +1016,9 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class,
}
remove_zspage(class, zspage, ZS_EMPTY);
+#ifdef CONFIG_ZPOOL
+ list_del(&zspage->lru);
+#endif
__free_zspage(pool, class, zspage);
}
@@ -1001,6 +1064,12 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
off %= PAGE_SIZE;
}
+#ifdef CONFIG_ZPOOL
+ INIT_LIST_HEAD(&zspage->lru);
+ zspage->under_reclaim = false;
+ zspage->deferred_handles = NULL;
+#endif
+
set_freeobj(zspage, 0);
}
@@ -1205,6 +1274,27 @@ static bool zspage_full(struct size_class *class, struct zspage *zspage)
return get_zspage_inuse(zspage) == class->objs_per_zspage;
}
+/**
+ * zs_lookup_class_index() - Returns index of the zsmalloc &size_class
+ * that hold objects of the provided size.
+ * @pool: zsmalloc pool to use
+ * @size: object size
+ *
+ * Context: Any context.
+ *
+ * Return: the index of the zsmalloc &size_class that hold objects of the
+ * provided size.
+ */
+unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size)
+{
+ struct size_class *class;
+
+ class = pool->size_class[get_size_class_index(size)];
+
+ return class->index;
+}
+EXPORT_SYMBOL_GPL(zs_lookup_class_index);
+
unsigned long zs_get_total_pages(struct zs_pool *pool)
{
return atomic_long_read(&pool->pages_allocated);
@@ -1247,19 +1337,44 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
BUG_ON(in_interrupt());
/* It guarantees it can get zspage from handle safely */
- read_lock(&pool->migrate_lock);
+ spin_lock(&pool->lock);
obj = handle_to_obj(handle);
obj_to_location(obj, &page, &obj_idx);
zspage = get_zspage(page);
+#ifdef CONFIG_ZPOOL
+ /*
+ * Move the zspage to front of pool's LRU.
+ *
+ * Note that this is swap-specific, so by definition there are no ongoing
+ * accesses to the memory while the page is swapped out that would make
+ * it "hot". A new entry is hot, then ages to the tail until it gets either
+ * written back or swaps back in.
+ *
+ * Furthermore, map is also called during writeback. We must not put an
+ * isolated page on the LRU mid-reclaim.
+ *
+ * As a result, only update the LRU when the page is mapped for write,
+ * i.e. when it's first instantiated.
+ *
+ * This is a deviation from the other backends, which perform this update
+ * in the allocation function (zbud_alloc, z3fold_alloc).
+ */
+ if (mm == ZS_MM_WO) {
+ if (!list_empty(&zspage->lru))
+ list_del(&zspage->lru);
+ list_add(&zspage->lru, &pool->lru);
+ }
+#endif
+
/*
- * migration cannot move any zpages in this zspage. Here, class->lock
+ * migration cannot move any zpages in this zspage. Here, pool->lock
* is too heavy since callers would take some time until they calls
* zs_unmap_object API so delegate the locking from class to zspage
* which is smaller granularity.
*/
migrate_read_lock(zspage);
- read_unlock(&pool->migrate_lock);
+ spin_unlock(&pool->lock);
class = zspage_class(pool, zspage);
off = (class->size * obj_idx) & ~PAGE_MASK;
@@ -1412,8 +1527,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
size += ZS_HANDLE_SIZE;
class = pool->size_class[get_size_class_index(size)];
- /* class->lock effectively protects the zpage migration */
- spin_lock(&class->lock);
+ /* pool->lock effectively protects the zpage migration */
+ spin_lock(&pool->lock);
zspage = find_get_zspage(class);
if (likely(zspage)) {
obj = obj_malloc(pool, zspage, handle);
@@ -1421,12 +1536,12 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
fix_fullness_group(class, zspage);
record_obj(handle, obj);
class_stat_inc(class, OBJ_USED, 1);
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
return handle;
}
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
zspage = alloc_zspage(pool, class, gfp);
if (!zspage) {
@@ -1434,7 +1549,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
return (unsigned long)ERR_PTR(-ENOMEM);
}
- spin_lock(&class->lock);
+ spin_lock(&pool->lock);
obj = obj_malloc(pool, zspage, handle);
newfg = get_fullness_group(class, zspage);
insert_zspage(class, zspage, newfg);
@@ -1447,7 +1562,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
/* We completely set up zspage so mark them as movable */
SetZsPageMovable(pool, zspage);
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
return handle;
}
@@ -1491,26 +1606,38 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
return;
/*
- * The pool->migrate_lock protects the race with zpage's migration
+ * The pool->lock protects the race with zpage's migration
* so it's safe to get the page from handle.
*/
- read_lock(&pool->migrate_lock);
+ spin_lock(&pool->lock);
obj = handle_to_obj(handle);
obj_to_page(obj, &f_page);
zspage = get_zspage(f_page);
class = zspage_class(pool, zspage);
- spin_lock(&class->lock);
- read_unlock(&pool->migrate_lock);
obj_free(class->size, obj);
class_stat_dec(class, OBJ_USED, 1);
+
+#ifdef CONFIG_ZPOOL
+ if (zspage->under_reclaim) {
+ /*
+ * Reclaim needs the handles during writeback. It'll free
+ * them along with the zspage when it's done with them.
+ *
+ * Record current deferred handle at the memory location
+ * whose address is given by handle.
+ */
+ record_obj(handle, (unsigned long)zspage->deferred_handles);
+ zspage->deferred_handles = (unsigned long *)handle;
+ spin_unlock(&pool->lock);
+ return;
+ }
+#endif
fullness = fix_fullness_group(class, zspage);
- if (fullness != ZS_EMPTY)
- goto out;
+ if (fullness == ZS_EMPTY)
+ free_zspage(pool, class, zspage);
- free_zspage(pool, class, zspage);
-out:
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
cache_free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);
@@ -1555,6 +1682,13 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
d_off += size;
d_size -= size;
+ /*
+ * Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic()
+ * calls must occur in reverse order of calls to kmap_atomic().
+ * So, to call kunmap_atomic(s_addr) we should first call
+ * kunmap_atomic(d_addr). For more details see
+ * Documentation/mm/highmem.rst.
+ */
if (s_off >= PAGE_SIZE) {
kunmap_atomic(d_addr);
kunmap_atomic(s_addr);
@@ -1585,7 +1719,7 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
static unsigned long find_alloced_obj(struct size_class *class,
struct page *page, int *obj_idx)
{
- int offset = 0;
+ unsigned int offset;
int index = *obj_idx;
unsigned long handle = 0;
void *addr = kmap_atomic(page);
@@ -1702,7 +1836,7 @@ static enum fullness_group putback_zspage(struct size_class *class,
return fullness;
}
-#ifdef CONFIG_COMPACTION
+#if defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION)
/*
* To prevent zspage destroy during migration, zspage freeing should
* hold locks of all pages in the zspage.
@@ -1744,6 +1878,24 @@ static void lock_zspage(struct zspage *zspage)
}
migrate_read_unlock(zspage);
}
+#endif /* defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION) */
+
+#ifdef CONFIG_ZPOOL
+/*
+ * Unlocks all the pages of the zspage.
+ *
+ * pool->lock must be held before this function is called
+ * to prevent the underlying pages from migrating.
+ */
+static void unlock_zspage(struct zspage *zspage)
+{
+ struct page *page = get_first_page(zspage);
+
+ do {
+ unlock_page(page);
+ } while ((page = get_next_page(page)) != NULL);
+}
+#endif /* CONFIG_ZPOOL */
static void migrate_lock_init(struct zspage *zspage)
{
@@ -1760,6 +1912,7 @@ static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
read_unlock(&zspage->lock);
}
+#ifdef CONFIG_COMPACTION
static void migrate_write_lock(struct zspage *zspage)
{
write_lock(&zspage->lock);
@@ -1839,7 +1992,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
struct zspage *zspage;
struct page *dummy;
void *s_addr, *d_addr, *addr;
- int offset;
+ unsigned int offset;
unsigned long handle;
unsigned long old_obj, new_obj;
unsigned int obj_idx;
@@ -1860,16 +2013,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
pool = zspage->pool;
/*
- * The pool migrate_lock protects the race between zpage migration
+ * The pool's lock protects the race between zpage migration
* and zs_free.
*/
- write_lock(&pool->migrate_lock);
+ spin_lock(&pool->lock);
class = zspage_class(pool, zspage);
- /*
- * the class lock protects zpage alloc/free in the zspage.
- */
- spin_lock(&class->lock);
/* the migrate_write_lock protects zpage access via zs_map_object */
migrate_write_lock(zspage);
@@ -1899,10 +2048,9 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
replace_sub_page(class, zspage, newpage, page);
/*
* Since we complete the data copy and set up new zspage structure,
- * it's okay to release migration_lock.
+ * it's okay to release the pool's lock.
*/
- write_unlock(&pool->migrate_lock);
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
dec_zspage_isolation(zspage);
migrate_write_unlock(zspage);
@@ -1957,9 +2105,9 @@ static void async_free_zspage(struct work_struct *work)
if (class->index != i)
continue;
- spin_lock(&class->lock);
+ spin_lock(&pool->lock);
list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages);
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
}
list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
@@ -1969,9 +2117,12 @@ static void async_free_zspage(struct work_struct *work)
get_zspage_mapping(zspage, &class_idx, &fullness);
VM_BUG_ON(fullness != ZS_EMPTY);
class = pool->size_class[class_idx];
- spin_lock(&class->lock);
+ spin_lock(&pool->lock);
+#ifdef CONFIG_ZPOOL
+ list_del(&zspage->lru);
+#endif
__free_zspage(pool, class, zspage);
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
}
};
@@ -2032,10 +2183,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
struct zspage *dst_zspage = NULL;
unsigned long pages_freed = 0;
- /* protect the race between zpage migration and zs_free */
- write_lock(&pool->migrate_lock);
- /* protect zpage allocation/free */
- spin_lock(&class->lock);
+ /*
+ * protect the race between zpage migration and zs_free
+ * as well as zpage allocation/free
+ */
+ spin_lock(&pool->lock);
while ((src_zspage = isolate_zspage(class, true))) {
/* protect someone accessing the zspage(i.e., zs_map_object) */
migrate_write_lock(src_zspage);
@@ -2060,7 +2212,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
putback_zspage(class, dst_zspage);
migrate_write_unlock(dst_zspage);
dst_zspage = NULL;
- if (rwlock_is_contended(&pool->migrate_lock))
+ if (spin_is_contended(&pool->lock))
break;
}
@@ -2077,11 +2229,9 @@ static unsigned long __zs_compact(struct zs_pool *pool,
pages_freed += class->pages_per_zspage;
} else
migrate_write_unlock(src_zspage);
- spin_unlock(&class->lock);
- write_unlock(&pool->migrate_lock);
+ spin_unlock(&pool->lock);
cond_resched();
- write_lock(&pool->migrate_lock);
- spin_lock(&class->lock);
+ spin_lock(&pool->lock);
}
if (src_zspage) {
@@ -2089,8 +2239,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
migrate_write_unlock(src_zspage);
}
- spin_unlock(&class->lock);
- write_unlock(&pool->migrate_lock);
+ spin_unlock(&pool->lock);
return pages_freed;
}
@@ -2103,8 +2252,6 @@ unsigned long zs_compact(struct zs_pool *pool)
for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
class = pool->size_class[i];
- if (!class)
- continue;
if (class->index != i)
continue;
pages_freed += __zs_compact(pool, class);
@@ -2149,8 +2296,6 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker,
for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
class = pool->size_class[i];
- if (!class)
- continue;
if (class->index != i)
continue;
@@ -2197,7 +2342,7 @@ struct zs_pool *zs_create_pool(const char *name)
return NULL;
init_deferred_free(pool);
- rwlock_init(&pool->migrate_lock);
+ spin_lock_init(&pool->lock);
pool->name = kstrdup(name, GFP_KERNEL);
if (!pool->name)
@@ -2268,7 +2413,6 @@ struct zs_pool *zs_create_pool(const char *name)
class->index = i;
class->pages_per_zspage = pages_per_zspage;
class->objs_per_zspage = objs_per_zspage;
- spin_lock_init(&class->lock);
pool->size_class[i] = class;
for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
fullness++)
@@ -2288,6 +2432,10 @@ struct zs_pool *zs_create_pool(const char *name)
*/
zs_register_shrinker(pool);
+#ifdef CONFIG_ZPOOL
+ INIT_LIST_HEAD(&pool->lru);
+#endif
+
return pool;
err:
@@ -2329,6 +2477,100 @@ void zs_destroy_pool(struct zs_pool *pool)
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
+#ifdef CONFIG_ZPOOL
+static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
+{
+ int i, obj_idx, ret = 0;
+ unsigned long handle;
+ struct zspage *zspage;
+ struct page *page;
+ enum fullness_group fullness;
+
+ /* Lock LRU and fullness list */
+ spin_lock(&pool->lock);
+ if (list_empty(&pool->lru)) {
+ spin_unlock(&pool->lock);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < retries; i++) {
+ struct size_class *class;
+
+ zspage = list_last_entry(&pool->lru, struct zspage, lru);
+ list_del(&zspage->lru);
+
+ /* zs_free may free objects, but not the zspage and handles */
+ zspage->under_reclaim = true;
+
+ class = zspage_class(pool, zspage);
+ fullness = get_fullness_group(class, zspage);
+
+ /* Lock out object allocations and object compaction */
+ remove_zspage(class, zspage, fullness);
+
+ spin_unlock(&pool->lock);
+ cond_resched();
+
+ /* Lock backing pages into place */
+ lock_zspage(zspage);
+
+ obj_idx = 0;
+ page = get_first_page(zspage);
+ while (1) {
+ handle = find_alloced_obj(class, page, &obj_idx);
+ if (!handle) {
+ page = get_next_page(page);
+ if (!page)
+ break;
+ obj_idx = 0;
+ continue;
+ }
+
+ /*
+ * This will write the object and call zs_free.
+ *
+ * zs_free will free the object, but the
+ * under_reclaim flag prevents it from freeing
+ * the zspage altogether. This is necessary so
+ * that we can continue working with the
+ * zspage potentially after the last object
+ * has been freed.
+ */
+ ret = pool->zpool_ops->evict(pool->zpool, handle);
+ if (ret)
+ goto next;
+
+ obj_idx++;
+ }
+
+next:
+ /* For freeing the zspage, or putting it back in the pool and LRU list. */
+ spin_lock(&pool->lock);
+ zspage->under_reclaim = false;
+
+ if (!get_zspage_inuse(zspage)) {
+ /*
+ * Fullness went stale as zs_free() won't touch it
+ * while the page is removed from the pool. Fix it
+ * up for the check in __free_zspage().
+ */
+ zspage->fullness = ZS_EMPTY;
+
+ __free_zspage(pool, class, zspage);
+ spin_unlock(&pool->lock);
+ return 0;
+ }
+
+ putback_zspage(class, zspage);
+ list_add(&zspage->lru, &pool->lru);
+ unlock_zspage(zspage);
+ }
+
+ spin_unlock(&pool->lock);
+ return -EAGAIN;
+}
+#endif /* CONFIG_ZPOOL */
+
static int __init zs_init(void)
{
int ret;
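Editor's note: the new zs_reclaim_page() above is the zsmalloc writeback entry point. Under pool->lock it detaches the coldest zspage from the tail of pool->lru, flags it under_reclaim and removes it from its fullness list; it then drops the lock, locks the backing pages and asks the zpool eviction callback to write each allocated object back; finally it retakes the lock and either frees the now-empty zspage or puts it back on the fullness and LRU lists. The skeletal single-pass outline below restates that control flow with the retry loop and the object walk elided; it is an editor's sketch, not part of the patch.

static int reclaim_one_sketch(struct zs_pool *pool)
{
	struct size_class *class;
	struct zspage *zspage;

	spin_lock(&pool->lock);
	if (list_empty(&pool->lru)) {
		spin_unlock(&pool->lock);
		return -EINVAL;
	}
	zspage = list_last_entry(&pool->lru, struct zspage, lru);
	list_del(&zspage->lru);
	zspage->under_reclaim = true;	/* zs_free() may drop objects, not the zspage */
	class = zspage_class(pool, zspage);
	remove_zspage(class, zspage, get_fullness_group(class, zspage));
	spin_unlock(&pool->lock);

	lock_zspage(zspage);		/* may sleep; pool->lock is no longer held */
	/* ... find_alloced_obj() walk, pool->zpool_ops->evict() per object ... */

	spin_lock(&pool->lock);
	zspage->under_reclaim = false;
	if (!get_zspage_inuse(zspage)) {
		zspage->fullness = ZS_EMPTY;		/* refresh the stale fullness */
		__free_zspage(pool, class, zspage);	/* also unlocks the pages */
		spin_unlock(&pool->lock);
		return 0;
	}
	putback_zspage(class, zspage);
	list_add(&zspage->lru, &pool->lru);
	unlock_zspage(zspage);
	spin_unlock(&pool->lock);
	return -EAGAIN;
}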
diff --git a/mm/zswap.c b/mm/zswap.c
index 104835b379ec..f6c89049cf70 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -958,7 +958,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
};
if (!zpool_can_sleep_mapped(pool)) {
- tmp = kmalloc(PAGE_SIZE, GFP_ATOMIC);
+ tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!tmp)
return -ENOMEM;
}
@@ -968,6 +968,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
swpentry = zhdr->swpentry; /* here */
tree = zswap_trees[swp_type(swpentry)];
offset = swp_offset(swpentry);
+ zpool_unmap_handle(pool, handle);
/* find and ref zswap entry */
spin_lock(&tree->lock);
@@ -975,20 +976,12 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
if (!entry) {
/* entry was invalidated */
spin_unlock(&tree->lock);
- zpool_unmap_handle(pool, handle);
kfree(tmp);
return 0;
}
spin_unlock(&tree->lock);
BUG_ON(offset != entry->offset);
- src = (u8 *)zhdr + sizeof(struct zswap_header);
- if (!zpool_can_sleep_mapped(pool)) {
- memcpy(tmp, src, entry->length);
- src = tmp;
- zpool_unmap_handle(pool, handle);
- }
-
/* try to allocate swap cache page */
switch (zswap_get_swap_cache_page(swpentry, &page)) {
case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
@@ -1006,6 +999,14 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
dlen = PAGE_SIZE;
+ zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
+ src = (u8 *)zhdr + sizeof(struct zswap_header);
+ if (!zpool_can_sleep_mapped(pool)) {
+ memcpy(tmp, src, entry->length);
+ src = tmp;
+ zpool_unmap_handle(pool, handle);
+ }
+
mutex_lock(acomp_ctx->mutex);
sg_init_one(&input, src, entry->length);
sg_init_table(&output, 1);
@@ -1015,6 +1016,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
dlen = acomp_ctx->req->dlen;
mutex_unlock(acomp_ctx->mutex);
+ if (!zpool_can_sleep_mapped(pool))
+ kfree(tmp);
+ else
+ zpool_unmap_handle(pool, handle);
+
BUG_ON(ret);
BUG_ON(dlen != PAGE_SIZE);
@@ -1026,7 +1032,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
SetPageReclaim(page);
/* start writeback */
- __swap_writepage(page, &wbc, end_swap_bio_write);
+ __swap_writepage(page, &wbc);
put_page(page);
zswap_written_back_pages++;
@@ -1045,7 +1051,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
- goto end;
+ return ret;
+
+fail:
+ if (!zpool_can_sleep_mapped(pool))
+ kfree(tmp);
/*
* if we get here due to ZSWAP_SWAPCACHE_EXIST
@@ -1054,17 +1064,10 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
* if we free the entry in the following put
* it is also okay to return !0
*/
-fail:
spin_lock(&tree->lock);
zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
-end:
- if (zpool_can_sleep_mapped(pool))
- zpool_unmap_handle(pool, handle);
- else
- kfree(tmp);
-
return ret;
}
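Editor's note: the zswap_writeback_entry() hunks above rework how the zpool handle is mapped. The handle is unmapped immediately after the swap entry has been read out, mapped a second time only around the decompression input, and the pre-allocated tmp buffer (now GFP_KERNEL, since this path is allowed to sleep) stages the data when the zpool cannot stay mapped across sleeping operations. A compressed sketch of that map/copy/unmap pattern, using the zpool API calls from the hunks; the function name writeback_shape and its reduced signature are mine, and error handling is elided.

static int writeback_shape(struct zpool *pool, unsigned long handle,
			   struct zswap_entry *entry, u8 *tmp)
{
	struct zswap_header *zhdr;
	u8 *src;

	/* First mapping: held only long enough to read the swap entry. */
	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
	/* ... copy zhdr->swpentry out ... */
	zpool_unmap_handle(pool, handle);

	/* ... tree lookup and swap cache page allocation (both may sleep) ... */

	/* Second mapping: only around the decompression input. */
	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
	src = (u8 *)zhdr + sizeof(struct zswap_header);
	if (!zpool_can_sleep_mapped(pool)) {
		memcpy(tmp, src, entry->length);	/* stage into the tmp buffer */
		src = tmp;
		zpool_unmap_handle(pool, handle);	/* unmap before sleeping */
	}
	/* ... decompress from src via the acomp API (may sleep) ... */
	if (zpool_can_sleep_mapped(pool))
		zpool_unmap_handle(pool, handle);
	return 0;
}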
@@ -1311,7 +1314,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
}
if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
- tmp = kmalloc(entry->length, GFP_ATOMIC);
+ tmp = kmalloc(entry->length, GFP_KERNEL);
if (!tmp) {
ret = -ENOMEM;
goto freeentry;