Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 209
-rw-r--r--  mm/Kconfig.debug | 11
-rw-r--r--  mm/Makefile | 15
-rw-r--r--  mm/backing-dev.c | 2
-rw-r--r--  mm/balloon_compaction.c | 29
-rw-r--r--  mm/bootmem_info.c | 4
-rw-r--r--  mm/cma.c | 908
-rw-r--r--  mm/cma.h | 51
-rw-r--r--  mm/cma_debug.c | 53
-rw-r--r--  mm/cma_sysfs.c | 20
-rw-r--r--  mm/compaction.c | 223
-rw-r--r--  mm/damon/Kconfig | 49
-rw-r--r--  mm/damon/Makefile | 2
-rw-r--r--  mm/damon/core.c | 774
-rw-r--r--  mm/damon/dbgfs.c | 1148
-rw-r--r--  mm/damon/lru_sort.c | 82
-rw-r--r--  mm/damon/modules-common.c | 2
-rw-r--r--  mm/damon/modules-common.h | 2
-rw-r--r--  mm/damon/ops-common.c | 301
-rw-r--r--  mm/damon/ops-common.h | 7
-rw-r--r--  mm/damon/paddr.c | 391
-rw-r--r--  mm/damon/reclaim.c | 78
-rw-r--r--  mm/damon/stat.c | 267
-rw-r--r--  mm/damon/sysfs-common.c | 2
-rw-r--r--  mm/damon/sysfs-common.h | 18
-rw-r--r--  mm/damon/sysfs-schemes.c | 986
-rw-r--r--  mm/damon/sysfs.c | 740
-rw-r--r--  mm/damon/tests/.kunitconfig | 7
-rw-r--r--  mm/damon/tests/core-kunit.h | 94
-rw-r--r--  mm/damon/tests/dbgfs-kunit.h | 173
-rw-r--r--  mm/damon/tests/vaddr-kunit.h | 4
-rw-r--r--  mm/damon/vaddr.c | 258
-rw-r--r--  mm/debug.c | 137
-rw-r--r--  mm/debug_page_alloc.c | 2
-rw-r--r--  mm/debug_vm_pgtable.c | 140
-rw-r--r--  mm/dmapool.c | 19
-rw-r--r--  mm/early_ioremap.c | 8
-rw-r--r--  mm/execmem.c | 203
-rw-r--r--  mm/filemap.c | 375
-rw-r--r--  mm/folio-compat.c | 14
-rw-r--r--  mm/gup.c | 549
-rw-r--r--  mm/hmm.c | 278
-rw-r--r--  mm/huge_memory.c | 1331
-rw-r--r--  mm/hugetlb.c | 1552
-rw-r--r--  mm/hugetlb_cgroup.c | 49
-rw-r--r--  mm/hugetlb_cma.c | 278
-rw-r--r--  mm/hugetlb_cma.h | 57
-rw-r--r--  mm/hugetlb_vmemmap.c | 209
-rw-r--r--  mm/hugetlb_vmemmap.h | 23
-rw-r--r--  mm/init-mm.c | 3
-rw-r--r--  mm/internal.h | 352
-rw-r--r--  mm/io-mapping.c | 29
-rw-r--r--  mm/ioremap.c | 4
-rw-r--r--  mm/kasan/Makefile | 3
-rw-r--r--  mm/kasan/common.c | 25
-rw-r--r--  mm/kasan/generic.c | 18
-rw-r--r--  mm/kasan/hw_tags.c | 5
-rw-r--r--  mm/kasan/init.c | 12
-rw-r--r--  mm/kasan/kasan.h | 18
-rw-r--r--  mm/kasan/kasan_test_c.c | 40
-rw-r--r--  mm/kasan/report.c | 15
-rw-r--r--  mm/kasan/shadow.c | 133
-rw-r--r--  mm/kasan/sw_tags.c | 3
-rw-r--r--  mm/kfence/core.c | 4
-rw-r--r--  mm/kfence/kfence_test.c | 3
-rw-r--r--  mm/kfence/report.c | 3
-rw-r--r--  mm/khugepaged.c | 207
-rw-r--r--  mm/kmemleak.c | 116
-rw-r--r--  mm/kmsan/core.c | 25
-rw-r--r--  mm/kmsan/hooks.c | 10
-rw-r--r--  mm/kmsan/init.c | 3
-rw-r--r--  mm/kmsan/instrumentation.c | 4
-rw-r--r--  mm/kmsan/kmsan.h | 1
-rw-r--r--  mm/kmsan/kmsan_test.c | 17
-rw-r--r--  mm/kmsan/report.c | 6
-rw-r--r--  mm/kmsan/shadow.c | 17
-rw-r--r--  mm/ksm.c | 202
-rw-r--r--  mm/list_lru.c | 49
-rw-r--r--  mm/maccess.c | 3
-rw-r--r--  mm/madvise.c | 1073
-rw-r--r--  mm/mapping_dirty_helpers.c | 6
-rw-r--r--  mm/memblock.c | 447
-rw-r--r--  mm/memcontrol-v1.c | 133
-rw-r--r--  mm/memcontrol-v1.h | 52
-rw-r--r--  mm/memcontrol.c | 1184
-rw-r--r--  mm/memfd.c | 195
-rw-r--r--  mm/memory-failure.c | 90
-rw-r--r--  mm/memory-tiers.c | 19
-rw-r--r--  mm/memory.c | 1234
-rw-r--r--  mm/memory_hotplug.c | 276
-rw-r--r--  mm/mempolicy.c | 669
-rw-r--r--  mm/mempool.c | 72
-rw-r--r--  mm/memremap.c | 102
-rw-r--r--  mm/migrate.c | 693
-rw-r--r--  mm/migrate_device.c | 138
-rw-r--r--  mm/mincore.c | 27
-rw-r--r--  mm/mlock.c | 12
-rw-r--r--  mm/mm_init.c | 278
-rw-r--r--  mm/mmap.c | 911
-rw-r--r--  mm/mmap_lock.c | 412
-rw-r--r--  mm/mmu_gather.c | 38
-rw-r--r--  mm/mmu_notifier.c | 2
-rw-r--r--  mm/mprotect.c | 322
-rw-r--r--  mm/mremap.c | 1753
-rw-r--r--  mm/mseal.c | 172
-rw-r--r--  mm/nommu.c | 162
-rw-r--r--  mm/numa.c | 12
-rw-r--r--  mm/numa_emulation.c | 49
-rw-r--r--  mm/numa_memblks.c | 30
-rw-r--r--  mm/oom_kill.c | 4
-rw-r--r--  mm/page-writeback.c | 142
-rw-r--r--  mm/page_alloc.c | 1645
-rw-r--r--  mm/page_counter.c | 8
-rw-r--r--  mm/page_ext.c | 30
-rw-r--r--  mm/page_frag_cache.c | 6
-rw-r--r--  mm/page_idle.c | 17
-rw-r--r--  mm/page_io.c | 79
-rw-r--r--  mm/page_isolation.c | 135
-rw-r--r--  mm/page_owner.c | 101
-rw-r--r--  mm/page_table_check.c | 78
-rw-r--r--  mm/page_vma_mapped.c | 21
-rw-r--r--  mm/pagewalk.c | 90
-rw-r--r--  mm/percpu-stats.c | 1
-rw-r--r--  mm/percpu.c | 110
-rw-r--r--  mm/pgtable-generic.c | 7
-rw-r--r--  mm/pt_reclaim.c | 71
-rw-r--r--  mm/ptdump.c | 67
-rw-r--r--  mm/readahead.c | 105
-rw-r--r--  mm/rmap.c | 1001
-rw-r--r--  mm/rodata_test.c | 7
-rw-r--r--  mm/secretmem.c | 62
-rw-r--r--  mm/shmem.c | 850
-rw-r--r--  mm/show_mem.c | 28
-rw-r--r--  mm/shrinker_debug.c | 24
-rw-r--r--  mm/slab.h | 74
-rw-r--r--  mm/slab_common.c | 920
-rw-r--r--  mm/slub.c | 678
-rw-r--r--  mm/sparse-vmemmap.c | 184
-rw-r--r--  mm/sparse.c | 112
-rw-r--r--  mm/swap.c | 181
-rw-r--r--  mm/swap.h | 57
-rw-r--r--  mm/swap_cgroup.c | 239
-rw-r--r--  mm/swap_slots.c | 353
-rw-r--r--  mm/swap_state.c | 114
-rw-r--r--  mm/swapfile.c | 1782
-rw-r--r--  mm/truncate.c | 145
-rw-r--r--  mm/usercopy.c | 18
-rw-r--r--  mm/userfaultfd.c | 349
-rw-r--r--  mm/util.c | 406
-rw-r--r--  mm/vma.c | 1294
-rw-r--r--  mm/vma.h | 275
-rw-r--r--  mm/vma_exec.c | 161
-rw-r--r--  mm/vma_init.c | 151
-rw-r--r--  mm/vma_internal.h | 1
-rw-r--r--  mm/vmalloc.c | 347
-rw-r--r--  mm/vmpressure.c | 2
-rw-r--r--  mm/vmscan.c | 1425
-rw-r--r--  mm/vmstat.c | 494
-rw-r--r--  mm/workingset.c | 69
-rw-r--r--  mm/z3fold.c | 1447
-rw-r--r--  mm/zbud.c | 455
-rw-r--r--  mm/zpdesc.h | 174
-rw-r--r--  mm/zpool.c | 105
-rw-r--r--  mm/zsmalloc.c | 966
-rw-r--r--  mm/zswap.c | 205
165 files changed, 24940 insertions, 16959 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 84000b016808..e443fe8cd6cf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -129,7 +129,6 @@ choice
prompt "Default allocator"
depends on ZSWAP
default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if MMU
- default ZSWAP_ZPOOL_DEFAULT_ZBUD
help
Selects the default allocator for the compressed cache for
swap pages.
@@ -140,21 +139,6 @@ choice
The selection made here can be overridden by using the kernel
command line 'zswap.zpool=' option.
-config ZSWAP_ZPOOL_DEFAULT_ZBUD
- bool "zbud"
- select ZBUD
- help
- Use the zbud allocator as the default allocator.
-
-config ZSWAP_ZPOOL_DEFAULT_Z3FOLD_DEPRECATED
- bool "z3foldi (DEPRECATED)"
- select Z3FOLD_DEPRECATED
- help
- Use the z3fold allocator as the default allocator.
-
- Deprecated and scheduled for removal in a few cycles,
- see CONFIG_Z3FOLD_DEPRECATED.
-
config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
bool "zsmalloc"
select ZSMALLOC
@@ -165,40 +149,9 @@ endchoice
config ZSWAP_ZPOOL_DEFAULT
string
depends on ZSWAP
- default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD
- default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD_DEPRECATED
default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
default ""
-config ZBUD
- tristate "2:1 compression allocator (zbud)"
- depends on ZSWAP
- help
- A special purpose allocator for storing compressed pages.
- It is designed to store up to two compressed pages per physical
- page. While this design limits storage density, it has simple and
- deterministic reclaim properties that make it preferable to a higher
- density approach when reclaim will be used.
-
-config Z3FOLD_DEPRECATED
- tristate "3:1 compression allocator (z3fold) (DEPRECATED)"
- depends on ZSWAP
- help
- Deprecated and scheduled for removal in a few cycles. If you have
- a good reason for using Z3FOLD over ZSMALLOC, please contact
- linux-mm@kvack.org and the zswap maintainers.
-
- A special purpose allocator for storing compressed pages.
- It is designed to store up to three compressed pages per physical
- page. It is a ZBUD derivative so the simplicity and determinism are
- still there.
-
-config Z3FOLD
- tristate
- default y if Z3FOLD_DEPRECATED=y
- default m if Z3FOLD_DEPRECATED=m
- depends on Z3FOLD_DEPRECATED
-
config ZSMALLOC
tristate
prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM)
@@ -242,9 +195,13 @@ menu "Slab allocator options"
config SLUB
def_bool y
+config KVFREE_RCU_BATCHED
+ def_bool y
+ depends on !SLUB_TINY && !TINY_RCU
+
config SLUB_TINY
bool "Configure for minimal memory footprint"
- depends on EXPERT
+ depends on EXPERT && !COMPILE_TEST
select SLAB_MERGE_DEFAULT
help
Configures the slab allocator in a way to achieve minimal memory
@@ -489,6 +446,9 @@ config SPARSEMEM_VMEMMAP
SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
pfn_to_page and page_to_pfn operations. This is the most
efficient option when sufficient kernel resources are available.
+
+config SPARSEMEM_VMEMMAP_PREINIT
+ bool
#
# Select this config option from the architecture Kconfig, if it is preferred
# to enable the feature of HugeTLB/dev_dax vmemmap optimization.
@@ -499,6 +459,9 @@ config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
bool
+config ARCH_WANT_HUGETLB_VMEMMAP_PREINIT
+ bool
+
config HAVE_MEMBLOCK_PHYS_MAP
bool
@@ -506,6 +469,10 @@ config HAVE_GUP_FAST
depends on MMU
bool
+# Enable memblock support for scratch memory which is needed for kexec handover
+config MEMBLOCK_KHO_SCRATCH
+ bool
+
# Don't discard allocated memory used to track "memory" and "reserved" memblocks
# after early boot, so it can still be used to test for validity of memory.
# Also, memblocks are updated with memory hot(un)plug.
@@ -550,20 +517,63 @@ menuconfig MEMORY_HOTPLUG
if MEMORY_HOTPLUG
-config MEMORY_HOTPLUG_DEFAULT_ONLINE
- bool "Online the newly added memory blocks by default"
- depends on MEMORY_HOTPLUG
+choice
+ prompt "Memory Hotplug Default Online Type"
+ default MHP_DEFAULT_ONLINE_TYPE_OFFLINE
help
+ Default memory type for hotplugged memory.
+
This option sets the default policy setting for memory hotplug
onlining policy (/sys/devices/system/memory/auto_online_blocks) which
determines what happens to newly added memory regions. Policy setting
can always be changed at runtime.
+
+ The default is 'offline'.
+
+ Select offline to defer onlining to drivers and user policy.
+ Select auto to let the kernel choose what zones to utilize.
+ Select online_kernel to generally allow kernel usage of this memory.
+ Select online_movable to generally disallow kernel usage of this memory.
+
+ Example kernel usage would be page structs and page tables.
+
See Documentation/admin-guide/mm/memory-hotplug.rst for more information.
- Say Y here if you want all hot-plugged memory blocks to appear in
- 'online' state by default.
- Say N here if you want the default policy to keep all hot-plugged
- memory blocks in 'offline' state.
+config MHP_DEFAULT_ONLINE_TYPE_OFFLINE
+ bool "offline"
+ help
+ Hotplugged memory will not be onlined by default.
+ Choose this for systems with drivers and user policy that
+ handle onlining of hotplug memory policy.
+
+config MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO
+ bool "auto"
+ help
+ Select this if you want the kernel to automatically online
+ hotplugged memory into the zone it thinks is reasonable.
+ This memory may be utilized for kernel data.
+
+config MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL
+ bool "kernel"
+ help
+ Select this if you want the kernel to automatically online
+ hotplugged memory into a zone capable of being used for kernel
+ data. This typically means ZONE_NORMAL.
+
+config MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE
+ bool "movable"
+ help
+ Select this if you want the kernel to automatically online
+ hotplug memory into ZONE_MOVABLE. This memory will generally
+ not be utilized for kernel data.
+
+ This should only be used when the admin knows sufficient
+ ZONE_NORMAL memory is available to describe hotplug memory,
+ otherwise hotplug memory may fail to online. For example,
+ sufficient kernel-capable memory (ZONE_NORMAL) must be
+ available to allocate page structs to describe ZONE_MOVABLE.
+
+endchoice
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
@@ -813,11 +823,15 @@ config ARCH_WANT_GENERAL_HUGETLB
config ARCH_WANTS_THP_SWAP
def_bool n
+config MM_ID
+ def_bool n
+
menuconfig TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
select COMPACTION
select XARRAY_MULTI
+ select MM_ID
help
Transparent Hugepages allows the kernel to use huge pages and
huge tlb transparently to the applications whenever possible.
@@ -872,7 +886,7 @@ config THP_SWAP
config READ_ONLY_THP_FOR_FS
bool "Read-only THP for filesystems (EXPERIMENTAL)"
- depends on TRANSPARENT_HUGEPAGE && SHMEM
+ depends on TRANSPARENT_HUGEPAGE
help
Allow khugepaged to put read-only file-backed pages in THP.
@@ -881,8 +895,25 @@ config READ_ONLY_THP_FOR_FS
support of file THPs will be developed in the next few release
cycles.
+config NO_PAGE_MAPCOUNT
+ bool "No per-page mapcount (EXPERIMENTAL)"
+ help
+ Do not maintain per-page mapcounts for pages part of larger
+ allocations, such as transparent huge pages.
+
+ When this config option is enabled, some interfaces that relied on
+ this information will rely on less-precise per-allocation information
+ instead: for example, using the average per-page mapcount in such
+ a large allocation instead of the per-page mapcount.
+
+ EXPERIMENTAL because the impact of some changes is still unclear.
+
endif # TRANSPARENT_HUGEPAGE
+# simple helper to make the code a bit easier to read
+config PAGE_MAPCOUNT
+ def_bool !NO_PAGE_MAPCOUNT
+
#
# The architecture supports pgtable leaves that is larger than PAGE_SIZE
#
@@ -903,6 +934,13 @@ config ARCH_SUPPORTS_PUD_PFNMAP
depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
#
+# Architectures that always use weak definitions for percpu
+# variables in modules should set this.
+#
+config ARCH_MODULE_NEEDS_WEAK_PER_CPU
+ bool
+
+#
# UP and nommu archs use km based percpu allocator
#
config NEED_PER_CPU_KM
@@ -962,6 +1000,41 @@ config CMA_AREAS
If unsure, leave the default value "8" in UMA and "20" in NUMA.
+#
+# Select this config option from the architecture Kconfig, if available, to set
+# the max page order for physically contiguous allocations.
+#
+config ARCH_FORCE_MAX_ORDER
+ int
+
+#
+# When ARCH_FORCE_MAX_ORDER is not defined,
+# the default page block order is MAX_PAGE_ORDER (10) as per
+# include/linux/mmzone.h.
+#
+config PAGE_BLOCK_MAX_ORDER
+ int "Page Block Order Upper Limit"
+ range 1 10 if ARCH_FORCE_MAX_ORDER = 0
+ default 10 if ARCH_FORCE_MAX_ORDER = 0
+ range 1 ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
+ default ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
+ help
+ The page block order refers to the power of two number of pages that
+ are physically contiguous and can have a migrate type associated to
+ them. The maximum size of the page block order is at least limited by
+ ARCH_FORCE_MAX_ORDER/MAX_PAGE_ORDER.
+
+ This config adds a new upper limit of default page block
+ order when the page block order is required to be smaller than
+ ARCH_FORCE_MAX_ORDER/MAX_PAGE_ORDER or other limits
+ (see include/linux/pageblock-flags.h for details).
+
+ Reducing pageblock order can negatively impact THP generation
+ success rate. If your workloads use THP heavily, please use this
+ option with caution.
+
+ Don't change if unsure.
+
config MEM_SOFT_DIRTY
bool "Track memory changes"
depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
@@ -1044,9 +1117,6 @@ config ARCH_HAS_CURRENT_STACK_POINTER
register alias named "current_stack_pointer", this config can be
selected.
-config ARCH_HAS_PTE_DEVMAP
- bool
-
config ARCH_HAS_ZONE_DMA_SET
bool
@@ -1064,7 +1134,6 @@ config ZONE_DEVICE
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
- depends on ARCH_HAS_PTE_DEVMAP
select XARRAY_MULTI
help
@@ -1173,10 +1242,6 @@ config KMAP_LOCAL
config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY
bool
-# struct io_mapping based helper. Selected by drivers that need them
-config IO_MAPPING
- bool
-
config MEMFD_CREATE
bool "Enable memfd_create() system call" if EXPERT
@@ -1290,6 +1355,7 @@ config NUMA_MEMBLKS
config NUMA_EMU
bool "NUMA emulation"
depends on NUMA_MEMBLKS
+ depends on X86 || GENERIC_ARCH_NUMA
help
Enable NUMA emulation. A flat machine will be split
into virtual nodes when booted with "numa=fake=N", where N is the
@@ -1301,6 +1367,21 @@ config ARCH_HAS_USER_SHADOW_STACK
The architecture has hardware support for userspace shadow call
stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss).
+config ARCH_SUPPORTS_PT_RECLAIM
+ def_bool n
+
+config PT_RECLAIM
+ bool "reclaim empty user page table pages"
+ default y
+ depends on ARCH_SUPPORTS_PT_RECLAIM && MMU && SMP
+ select MMU_GATHER_RCU_TABLE_FREE
+ help
+ Try to reclaim empty user page table pages in paths other than munmap
+ and exit_mmap path.
+
+ Note: now only empty user PTE page table pages will be reclaimed.
+
+
source "mm/damon/Kconfig"
endmenu
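
The PAGE_BLOCK_MAX_ORDER help text above is about orders of physically contiguous pages. As a quick worked example (a sketch only, assuming the common 4 KiB base page; the macro names are hypothetical and not part of the patch):

/* Illustrative only: pageblock size in bytes for a given order,
 * assuming a 4 KiB base page (PAGE_SHIFT == 12).
 */
#define EXAMPLE_PAGE_SHIFT		12
#define EXAMPLE_PAGEBLOCK_BYTES(order)	((1UL << EXAMPLE_PAGE_SHIFT) << (order))
/*
 * EXAMPLE_PAGEBLOCK_BYTES(10) == 4 MiB   (the MAX_PAGE_ORDER default of 10)
 * EXAMPLE_PAGEBLOCK_BYTES(5)  == 128 KiB (a reduced pageblock order)
 */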
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 41a58536531d..32b65073d0cc 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -186,8 +186,9 @@ config ARCH_HAS_DEBUG_WX
config DEBUG_WX
bool "Warn on W+X mappings at boot"
depends on ARCH_HAS_DEBUG_WX
+ depends on ARCH_HAS_PTDUMP
depends on MMU
- select PTDUMP_CORE
+ select PTDUMP
help
Generate a warning if any W+X mappings are found at boot.
@@ -212,18 +213,18 @@ config DEBUG_WX
If in doubt, say "Y".
-config GENERIC_PTDUMP
+config ARCH_HAS_PTDUMP
bool
-config PTDUMP_CORE
+config PTDUMP
bool
config PTDUMP_DEBUGFS
bool "Export kernel pagetable layout to userspace via debugfs"
depends on DEBUG_KERNEL
depends on DEBUG_FS
- depends on GENERIC_PTDUMP
- select PTDUMP_CORE
+ depends on ARCH_HAS_PTDUMP
+ select PTDUMP
help
Say Y here if you want to show the kernel pagetable layout in a
debugfs file. This information is only useful for kernel developers
diff --git a/mm/Makefile b/mm/Makefile
index dba52bb0da8a..ef54aa615d9d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -37,7 +37,7 @@ mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
- pgtable-generic.o rmap.o vmalloc.o vma.o
+ pgtable-generic.o rmap.o vmalloc.o vma.o vma_exec.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
@@ -55,7 +55,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o percpu.o slab_common.o \
compaction.o show_mem.o \
interval_tree.o list_lru.o workingset.o \
- debug.o gup.o mmap_lock.o $(mmu-y)
+ debug.o gup.o mmap_lock.o vma_init.o $(mmu-y)
# Give 'page_alloc' its own module-parameter namespace
page-alloc-y := page_alloc.o
@@ -75,10 +75,13 @@ ifdef CONFIG_MMU
obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o
endif
-obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o
+obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
+ifdef CONFIG_CMA
+obj-$(CONFIG_HUGETLBFS) += hugetlb_cma.o
+endif
obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
@@ -113,9 +116,7 @@ obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL) += zpool.o
-obj-$(CONFIG_ZBUD) += zbud.o
obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
-obj-$(CONFIG_Z3FOLD) += z3fold.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_NUMA) += numa.o
@@ -138,11 +139,11 @@ obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
-obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
+obj-$(CONFIG_PTDUMP) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
-obj-$(CONFIG_IO_MAPPING) += io-mapping.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
obj-$(CONFIG_EXECMEM) += execmem.o
obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
+obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e61bbb1bd622..783904d8c5ef 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1151,7 +1151,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
- del_timer_sync(&bdi->laptop_mode_wb_timer);
+ timer_delete_sync(&bdi->laptop_mode_wb_timer);
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 6597ebea8ae2..03c5dbabb156 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -24,6 +24,7 @@ static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info,
balloon_page_insert(b_dev_info, page);
unlock_page(page);
__count_vm_event(BALLOON_INFLATE);
+ inc_node_page_state(page, NR_BALLOON_PAGES);
}
/**
@@ -93,16 +94,12 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
if (!trylock_page(page))
continue;
- if (IS_ENABLED(CONFIG_BALLOON_COMPACTION) &&
- PageIsolated(page)) {
- /* raced with isolation */
- unlock_page(page);
- continue;
- }
- balloon_page_delete(page);
+ list_del(&page->lru);
+ balloon_page_finalize(page);
__count_vm_event(BALLOON_DEFLATE);
list_add(&page->lru, pages);
unlock_page(page);
+ dec_node_page_state(page, NR_BALLOON_PAGES);
n_pages++;
}
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
@@ -209,6 +206,9 @@ static bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
+ if (!b_dev_info)
+ return false;
+
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
list_del(&page->lru);
b_dev_info->isolated_pages++;
@@ -222,6 +222,10 @@ static void balloon_page_putback(struct page *page)
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
+ /* Isolated balloon pages cannot get deflated. */
+ if (WARN_ON_ONCE(!b_dev_info))
+ return;
+
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
list_add(&page->lru, &b_dev_info->pages);
b_dev_info->isolated_pages--;
@@ -237,6 +241,10 @@ static int balloon_page_migrate(struct page *newpage, struct page *page,
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ /* Isolated balloon pages cannot get deflated. */
+ if (WARN_ON_ONCE(!balloon))
+ return -EAGAIN;
+
return balloon->migratepage(balloon, newpage, page, mode);
}
@@ -245,6 +253,11 @@ const struct movable_operations balloon_mops = {
.isolate_page = balloon_page_isolate,
.putback_page = balloon_page_putback,
};
-EXPORT_SYMBOL_GPL(balloon_mops);
+
+static int __init balloon_init(void)
+{
+ return set_movable_ops(&balloon_mops, PGTY_offline);
+}
+core_initcall(balloon_init);
#endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
index 95f288169a38..b0e2a9fa641f 100644
--- a/mm/bootmem_info.c
+++ b/mm/bootmem_info.c
@@ -88,7 +88,9 @@ static void __init register_page_bootmem_info_section(unsigned long start_pfn)
memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
- register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
+ if (!preinited_vmemmap_section(ms))
+ register_page_bootmem_memmap(section_nr, memmap,
+ PAGES_PER_SECTION);
usage = ms->usage;
page = virt_to_page(usage);
diff --git a/mm/cma.c b/mm/cma.c
index de5bc0c81fc2..2ffa4befb99a 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -18,9 +18,11 @@
#include <linux/memblock.h>
#include <linux/err.h>
+#include <linux/list.h>
#include <linux/mm.h>
#include <linux/sizes.h>
#include <linux/slab.h>
+#include <linux/string_choices.h>
#include <linux/log2.h>
#include <linux/cma.h>
#include <linux/highmem.h>
@@ -33,11 +35,11 @@
struct cma cma_areas[MAX_CMA_AREAS];
unsigned int cma_area_count;
-static DEFINE_MUTEX(cma_mutex);
phys_addr_t cma_get_base(const struct cma *cma)
{
- return PFN_PHYS(cma->base_pfn);
+ WARN_ON_ONCE(cma->nranges != 1);
+ return PFN_PHYS(cma->ranges[0].base_pfn);
}
unsigned long cma_get_size(const struct cma *cma)
@@ -63,9 +65,10 @@ static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
* The value returned is represented in order_per_bits.
*/
static unsigned long cma_bitmap_aligned_offset(const struct cma *cma,
+ const struct cma_memrange *cmr,
unsigned int align_order)
{
- return (cma->base_pfn & ((1UL << align_order) - 1))
+ return (cmr->base_pfn & ((1UL << align_order) - 1))
>> cma->order_per_bit;
}
@@ -75,65 +78,123 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit;
}
-static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
- unsigned long count)
+static void cma_clear_bitmap(struct cma *cma, const struct cma_memrange *cmr,
+ unsigned long pfn, unsigned long count)
{
unsigned long bitmap_no, bitmap_count;
unsigned long flags;
- bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit;
+ bitmap_no = (pfn - cmr->base_pfn) >> cma->order_per_bit;
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
spin_lock_irqsave(&cma->lock, flags);
- bitmap_clear(cma->bitmap, bitmap_no, bitmap_count);
+ bitmap_clear(cmr->bitmap, bitmap_no, bitmap_count);
+ cma->available_count += count;
spin_unlock_irqrestore(&cma->lock, flags);
}
-static void __init cma_activate_area(struct cma *cma)
+/*
+ * Check if a CMA area contains no ranges that intersect with
+ * multiple zones. Store the result in the flags in case
+ * this gets called more than once.
+ */
+bool cma_validate_zones(struct cma *cma)
{
- unsigned long base_pfn = cma->base_pfn, pfn;
- struct zone *zone;
-
- cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL);
- if (!cma->bitmap)
- goto out_error;
+ int r;
+ unsigned long base_pfn;
+ struct cma_memrange *cmr;
+ bool valid_bit_set;
/*
- * alloc_contig_range() requires the pfn range specified to be in the
- * same zone. Simplify by forcing the entire CMA resv range to be in the
- * same zone.
+ * If already validated, return result of previous check.
+ * Either the valid or invalid bit will be set if this
+ * check has already been done. If neither is set, the
+ * check has not been performed yet.
*/
- WARN_ON_ONCE(!pfn_valid(base_pfn));
- zone = page_zone(pfn_to_page(base_pfn));
- for (pfn = base_pfn + 1; pfn < base_pfn + cma->count; pfn++) {
- WARN_ON_ONCE(!pfn_valid(pfn));
- if (page_zone(pfn_to_page(pfn)) != zone)
- goto not_in_zone;
+ valid_bit_set = test_bit(CMA_ZONES_VALID, &cma->flags);
+ if (valid_bit_set || test_bit(CMA_ZONES_INVALID, &cma->flags))
+ return valid_bit_set;
+
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ base_pfn = cmr->base_pfn;
+
+ /*
+ * alloc_contig_range() requires the pfn range specified
+ * to be in the same zone. Simplify by forcing the entire
+ * CMA resv range to be in the same zone.
+ */
+ WARN_ON_ONCE(!pfn_valid(base_pfn));
+ if (pfn_range_intersects_zones(cma->nid, base_pfn, cmr->count)) {
+ set_bit(CMA_ZONES_INVALID, &cma->flags);
+ return false;
+ }
+ }
+
+ set_bit(CMA_ZONES_VALID, &cma->flags);
+
+ return true;
+}
+
+static void __init cma_activate_area(struct cma *cma)
+{
+ unsigned long pfn, end_pfn, early_pfn[CMA_MAX_RANGES];
+ int allocrange, r;
+ struct cma_memrange *cmr;
+ unsigned long bitmap_count, count;
+
+ for (allocrange = 0; allocrange < cma->nranges; allocrange++) {
+ cmr = &cma->ranges[allocrange];
+ early_pfn[allocrange] = cmr->early_pfn;
+ cmr->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma, cmr),
+ GFP_KERNEL);
+ if (!cmr->bitmap)
+ goto cleanup;
}
- for (pfn = base_pfn; pfn < base_pfn + cma->count;
- pfn += pageblock_nr_pages)
- init_cma_reserved_pageblock(pfn_to_page(pfn));
+ if (!cma_validate_zones(cma))
+ goto cleanup;
+
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ if (early_pfn[r] != cmr->base_pfn) {
+ count = early_pfn[r] - cmr->base_pfn;
+ bitmap_count = cma_bitmap_pages_to_bits(cma, count);
+ bitmap_set(cmr->bitmap, 0, bitmap_count);
+ }
+
+ for (pfn = early_pfn[r]; pfn < cmr->base_pfn + cmr->count;
+ pfn += pageblock_nr_pages)
+ init_cma_reserved_pageblock(pfn_to_page(pfn));
+ }
spin_lock_init(&cma->lock);
+ mutex_init(&cma->alloc_mutex);
+
#ifdef CONFIG_CMA_DEBUGFS
INIT_HLIST_HEAD(&cma->mem_head);
spin_lock_init(&cma->mem_head_lock);
#endif
+ set_bit(CMA_ACTIVATED, &cma->flags);
return;
-not_in_zone:
- bitmap_free(cma->bitmap);
-out_error:
+cleanup:
+ for (r = 0; r < allocrange; r++)
+ bitmap_free(cma->ranges[r].bitmap);
+
/* Expose all pages to the buddy, they are useless for CMA. */
- if (!cma->reserve_pages_on_error) {
- for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++)
- free_reserved_page(pfn_to_page(pfn));
+ if (!test_bit(CMA_RESERVE_PAGES_ON_ERROR, &cma->flags)) {
+ for (r = 0; r < allocrange; r++) {
+ cmr = &cma->ranges[r];
+ end_pfn = cmr->base_pfn + cmr->count;
+ for (pfn = early_pfn[r]; pfn < end_pfn; pfn++)
+ free_reserved_page(pfn_to_page(pfn));
+ }
}
totalcma_pages -= cma->count;
- cma->count = 0;
+ cma->available_count = cma->count = 0;
pr_err("CMA area %s could not be activated\n", cma->name);
}
@@ -150,7 +211,44 @@ core_initcall(cma_init_reserved_areas);
void __init cma_reserve_pages_on_error(struct cma *cma)
{
- cma->reserve_pages_on_error = true;
+ set_bit(CMA_RESERVE_PAGES_ON_ERROR, &cma->flags);
+}
+
+static int __init cma_new_area(const char *name, phys_addr_t size,
+ unsigned int order_per_bit,
+ struct cma **res_cma)
+{
+ struct cma *cma;
+
+ if (cma_area_count == ARRAY_SIZE(cma_areas)) {
+ pr_err("Not enough slots for CMA reserved regions!\n");
+ return -ENOSPC;
+ }
+
+ /*
+ * Each reserved area must be initialised later, when more kernel
+ * subsystems (like slab allocator) are available.
+ */
+ cma = &cma_areas[cma_area_count];
+ cma_area_count++;
+
+ if (name)
+ snprintf(cma->name, CMA_MAX_NAME, "%s", name);
+ else
+ snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count);
+
+ cma->available_count = cma->count = size >> PAGE_SHIFT;
+ cma->order_per_bit = order_per_bit;
+ *res_cma = cma;
+ totalcma_pages += cma->count;
+
+ return 0;
+}
+
+static void __init cma_drop_area(struct cma *cma)
+{
+ totalcma_pages -= cma->count;
+ cma_area_count--;
}
/**
@@ -171,13 +269,9 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
struct cma **res_cma)
{
struct cma *cma;
+ int ret;
/* Sanity checks */
- if (cma_area_count == ARRAY_SIZE(cma_areas)) {
- pr_err("Not enough slots for CMA reserved regions!\n");
- return -ENOSPC;
- }
-
if (!size || !memblock_is_region_reserved(base, size))
return -EINVAL;
@@ -194,64 +288,154 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES))
return -EINVAL;
- /*
- * Each reserved area must be initialised later, when more kernel
- * subsystems (like slab allocator) are available.
- */
- cma = &cma_areas[cma_area_count];
+ ret = cma_new_area(name, size, order_per_bit, &cma);
+ if (ret != 0)
+ return ret;
- if (name)
- snprintf(cma->name, CMA_MAX_NAME, name);
- else
- snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count);
+ cma->ranges[0].base_pfn = PFN_DOWN(base);
+ cma->ranges[0].early_pfn = PFN_DOWN(base);
+ cma->ranges[0].count = cma->count;
+ cma->nranges = 1;
+ cma->nid = NUMA_NO_NODE;
- cma->base_pfn = PFN_DOWN(base);
- cma->count = size >> PAGE_SHIFT;
- cma->order_per_bit = order_per_bit;
*res_cma = cma;
- cma_area_count++;
- totalcma_pages += cma->count;
return 0;
}
-/**
- * cma_declare_contiguous_nid() - reserve custom contiguous area
- * @base: Base address of the reserved area optional, use 0 for any
- * @size: Size of the reserved area (in bytes),
- * @limit: End address of the reserved memory (optional, 0 for any).
- * @alignment: Alignment for the CMA area, should be power of 2 or zero
- * @order_per_bit: Order of pages represented by one bit on bitmap.
- * @fixed: hint about where to place the reserved area
- * @name: The name of the area. See function cma_init_reserved_mem()
- * @res_cma: Pointer to store the created cma region.
- * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
- *
- * This function reserves memory from early allocator. It should be
- * called by arch specific code once the early allocator (memblock or bootmem)
- * has been activated and all other subsystems have already allocated/reserved
- * memory. This function allows to create custom reserved areas.
- *
- * If @fixed is true, reserve contiguous area at exactly @base. If false,
- * reserve in range from @base to @limit.
+/*
+ * Structure used while walking physical memory ranges and finding out
+ * which one(s) to use for a CMA area.
*/
-int __init cma_declare_contiguous_nid(phys_addr_t base,
+struct cma_init_memrange {
+ phys_addr_t base;
+ phys_addr_t size;
+ struct list_head list;
+};
+
+/*
+ * Work array used during CMA initialization.
+ */
+static struct cma_init_memrange memranges[CMA_MAX_RANGES] __initdata;
+
+static bool __init revsizecmp(struct cma_init_memrange *mlp,
+ struct cma_init_memrange *mrp)
+{
+ return mlp->size > mrp->size;
+}
+
+static bool __init basecmp(struct cma_init_memrange *mlp,
+ struct cma_init_memrange *mrp)
+{
+ return mlp->base < mrp->base;
+}
+
+/*
+ * Helper function to create sorted lists.
+ */
+static void __init list_insert_sorted(
+ struct list_head *ranges,
+ struct cma_init_memrange *mrp,
+ bool (*cmp)(struct cma_init_memrange *lh, struct cma_init_memrange *rh))
+{
+ struct list_head *mp;
+ struct cma_init_memrange *mlp;
+
+ if (list_empty(ranges))
+ list_add(&mrp->list, ranges);
+ else {
+ list_for_each(mp, ranges) {
+ mlp = list_entry(mp, struct cma_init_memrange, list);
+ if (cmp(mlp, mrp))
+ break;
+ }
+ __list_add(&mrp->list, mlp->list.prev, &mlp->list);
+ }
+}
+
+static int __init cma_fixed_reserve(phys_addr_t base, phys_addr_t size)
+{
+ if (IS_ENABLED(CONFIG_HIGHMEM)) {
+ phys_addr_t highmem_start = __pa(high_memory - 1) + 1;
+
+ /*
+ * If allocating at a fixed base the request region must not
+ * cross the low/high memory boundary.
+ */
+ if (base < highmem_start && base + size > highmem_start) {
+ pr_err("Region at %pa defined on low/high memory boundary (%pa)\n",
+ &base, &highmem_start);
+ return -EINVAL;
+ }
+ }
+
+ if (memblock_is_region_reserved(base, size) ||
+ memblock_reserve(base, size) < 0) {
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static phys_addr_t __init cma_alloc_mem(phys_addr_t base, phys_addr_t size,
+ phys_addr_t align, phys_addr_t limit, int nid)
+{
+ phys_addr_t addr = 0;
+
+ /*
+ * If there is enough memory, try a bottom-up allocation first.
+ * It will place the new cma area close to the start of the node
+ * and guarantee that the compaction is moving pages out of the
+ * cma area and not into it.
+ * Avoid using first 4GB to not interfere with constrained zones
+ * like DMA/DMA32.
+ */
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ if (!memblock_bottom_up() && limit >= SZ_4G + size) {
+ memblock_set_bottom_up(true);
+ addr = memblock_alloc_range_nid(size, align, SZ_4G, limit,
+ nid, true);
+ memblock_set_bottom_up(false);
+ }
+#endif
+
+ /*
+ * On systems with HIGHMEM try allocating from there before consuming
+ * memory in lower zones.
+ */
+ if (!addr && IS_ENABLED(CONFIG_HIGHMEM)) {
+ phys_addr_t highmem = __pa(high_memory - 1) + 1;
+
+ /*
+ * All pages in the reserved area must come from the same zone.
+ * If the requested region crosses the low/high memory boundary,
+ * try allocating from high memory first and fall back to low
+ * memory in case of failure.
+ */
+ if (base < highmem && limit > highmem) {
+ addr = memblock_alloc_range_nid(size, align, highmem,
+ limit, nid, true);
+ limit = highmem;
+ }
+ }
+
+ if (!addr)
+ addr = memblock_alloc_range_nid(size, align, base, limit, nid,
+ true);
+
+ return addr;
+}
+
+static int __init __cma_declare_contiguous_nid(phys_addr_t *basep,
phys_addr_t size, phys_addr_t limit,
phys_addr_t alignment, unsigned int order_per_bit,
bool fixed, const char *name, struct cma **res_cma,
int nid)
{
phys_addr_t memblock_end = memblock_end_of_DRAM();
- phys_addr_t highmem_start;
+ phys_addr_t base = *basep;
int ret;
- /*
- * We can't use __pa(high_memory) directly, since high_memory
- * isn't a valid direct map VA, and DEBUG_VIRTUAL will (validly)
- * complain. Find the boundary by adding one to the last valid
- * address.
- */
- highmem_start = __pa(high_memory - 1) + 1;
pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
__func__, &size, &base, &limit, &alignment);
@@ -272,10 +456,9 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
/* Sanitise input arguments. */
alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES);
if (fixed && base & (alignment - 1)) {
- ret = -EINVAL;
pr_err("Region at %pa must be aligned to %pa bytes\n",
&base, &alignment);
- goto err;
+ return -EINVAL;
}
base = ALIGN(base, alignment);
size = ALIGN(size, alignment);
@@ -288,16 +471,6 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
return -EINVAL;
- /*
- * If allocating at a fixed base the request region must not cross the
- * low/high memory boundary.
- */
- if (fixed && base < highmem_start && base + size > highmem_start) {
- ret = -EINVAL;
- pr_err("Region at %pa defined on low/high memory boundary (%pa)\n",
- &base, &highmem_start);
- goto err;
- }
/*
* If the limit is unspecified or above the memblock end, its effective
@@ -308,150 +481,339 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
limit = memblock_end;
if (base + size > limit) {
- ret = -EINVAL;
pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n",
&size, &base, &limit);
- goto err;
+ return -EINVAL;
}
/* Reserve memory */
if (fixed) {
- if (memblock_is_region_reserved(base, size) ||
- memblock_reserve(base, size) < 0) {
- ret = -EBUSY;
- goto err;
- }
+ ret = cma_fixed_reserve(base, size);
+ if (ret)
+ return ret;
} else {
- phys_addr_t addr = 0;
+ base = cma_alloc_mem(base, size, alignment, limit, nid);
+ if (!base)
+ return -ENOMEM;
/*
- * If there is enough memory, try a bottom-up allocation first.
- * It will place the new cma area close to the start of the node
- * and guarantee that the compaction is moving pages out of the
- * cma area and not into it.
- * Avoid using first 4GB to not interfere with constrained zones
- * like DMA/DMA32.
+ * kmemleak scans/reads tracked objects for pointers to other
+ * objects but this address isn't mapped and accessible
*/
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
- if (!memblock_bottom_up() && memblock_end >= SZ_4G + size) {
- memblock_set_bottom_up(true);
- addr = memblock_alloc_range_nid(size, alignment, SZ_4G,
- limit, nid, true);
- memblock_set_bottom_up(false);
- }
-#endif
+ kmemleak_ignore_phys(base);
+ }
+
+ ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
+ if (ret) {
+ memblock_phys_free(base, size);
+ return ret;
+ }
+
+ (*res_cma)->nid = nid;
+ *basep = base;
+
+ return 0;
+}
+
+/*
+ * Create CMA areas with a total size of @total_size. A normal allocation
+ * for one area is tried first. If that fails, the biggest memblock
+ * ranges above 4G are selected, and allocated bottom up.
+ *
+ * The complexity here is not great, but this function will only be
+ * called during boot, and the lists operated on have fewer than
+ * CMA_MAX_RANGES elements (default value: 8).
+ */
+int __init cma_declare_contiguous_multi(phys_addr_t total_size,
+ phys_addr_t align, unsigned int order_per_bit,
+ const char *name, struct cma **res_cma, int nid)
+{
+ phys_addr_t start = 0, end;
+ phys_addr_t size, sizesum, sizeleft;
+ struct cma_init_memrange *mrp, *mlp, *failed;
+ struct cma_memrange *cmrp;
+ LIST_HEAD(ranges);
+ LIST_HEAD(final_ranges);
+ struct list_head *mp, *next;
+ int ret, nr = 1;
+ u64 i;
+ struct cma *cma;
+
+ /*
+ * First, try it the normal way, producing just one range.
+ */
+ ret = __cma_declare_contiguous_nid(&start, total_size, 0, align,
+ order_per_bit, false, name, res_cma, nid);
+ if (ret != -ENOMEM)
+ goto out;
+
+ /*
+ * Couldn't find one range that fits our needs, so try multiple
+ * ranges.
+ *
+ * No need to do the alignment checks here, the call to
+ * cma_declare_contiguous_nid above would have caught
+ * any issues. With the checks, we know that:
+ *
+ * - @align is a power of 2
+ * - @align is >= pageblock alignment
+ * - @size is aligned to @align and to @order_per_bit
+ *
+ * So, as long as we create ranges that have a base
+ * aligned to @align, and a size that is aligned to
+ * both @align and @order_to_bit, things will work out.
+ */
+ nr = 0;
+ sizesum = 0;
+ failed = NULL;
+
+ ret = cma_new_area(name, total_size, order_per_bit, &cma);
+ if (ret != 0)
+ goto out;
+
+ align = max_t(phys_addr_t, align, CMA_MIN_ALIGNMENT_BYTES);
+ /*
+ * Create a list of ranges above 4G, largest range first.
+ */
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) {
+ if (upper_32_bits(start) == 0)
+ continue;
+
+ start = ALIGN(start, align);
+ if (start >= end)
+ continue;
+
+ end = ALIGN_DOWN(end, align);
+ if (end <= start)
+ continue;
+
+ size = end - start;
+ size = ALIGN_DOWN(size, (PAGE_SIZE << order_per_bit));
+ if (!size)
+ continue;
+ sizesum += size;
+
+ pr_debug("consider %016llx - %016llx\n", (u64)start, (u64)end);
/*
- * All pages in the reserved area must come from the same zone.
- * If the requested region crosses the low/high memory boundary,
- * try allocating from high memory first and fall back to low
- * memory in case of failure.
+ * If we don't yet have used the maximum number of
+ * areas, grab a new one.
+ *
+ * If we can't use anymore, see if this range is not
+ * smaller than the smallest one already recorded. If
+ * not, re-use the smallest element.
*/
- if (!addr && base < highmem_start && limit > highmem_start) {
- addr = memblock_alloc_range_nid(size, alignment,
- highmem_start, limit, nid, true);
- limit = highmem_start;
- }
-
- if (!addr) {
- addr = memblock_alloc_range_nid(size, alignment, base,
- limit, nid, true);
- if (!addr) {
- ret = -ENOMEM;
- goto err;
- }
+ if (nr < CMA_MAX_RANGES)
+ mrp = &memranges[nr++];
+ else {
+ mrp = list_last_entry(&ranges,
+ struct cma_init_memrange, list);
+ if (size < mrp->size)
+ continue;
+ list_del(&mrp->list);
+ sizesum -= mrp->size;
+ pr_debug("deleted %016llx - %016llx from the list\n",
+ (u64)mrp->base, (u64)mrp->base + size);
}
+ mrp->base = start;
+ mrp->size = size;
/*
- * kmemleak scans/reads tracked objects for pointers to other
- * objects but this address isn't mapped and accessible
+ * Now do a sorted insert.
*/
- kmemleak_ignore_phys(addr);
- base = addr;
+ list_insert_sorted(&ranges, mrp, revsizecmp);
+ pr_debug("added %016llx - %016llx to the list\n",
+ (u64)mrp->base, (u64)mrp->base + size);
+ pr_debug("total size now %llu\n", (u64)sizesum);
}
- ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
- if (ret)
- goto free_mem;
+ /*
+ * There is not enough room in the CMA_MAX_RANGES largest
+ * ranges, so bail out.
+ */
+ if (sizesum < total_size) {
+ cma_drop_area(cma);
+ ret = -ENOMEM;
+ goto out;
+ }
- pr_info("Reserved %ld MiB at %pa on node %d\n", (unsigned long)size / SZ_1M,
- &base, nid);
- return 0;
+ /*
+ * Found ranges that provide enough combined space.
+ * Now, sorted them by address, smallest first, because we
+ * want to mimic a bottom-up memblock allocation.
+ */
+ sizesum = 0;
+ list_for_each_safe(mp, next, &ranges) {
+ mlp = list_entry(mp, struct cma_init_memrange, list);
+ list_del(mp);
+ list_insert_sorted(&final_ranges, mlp, basecmp);
+ sizesum += mlp->size;
+ if (sizesum >= total_size)
+ break;
+ }
+
+ /*
+ * Walk the final list, and add a CMA range for
+ * each range, possibly not using the last one fully.
+ */
+ nr = 0;
+ sizeleft = total_size;
+ list_for_each(mp, &final_ranges) {
+ mlp = list_entry(mp, struct cma_init_memrange, list);
+ size = min(sizeleft, mlp->size);
+ if (memblock_reserve(mlp->base, size)) {
+ /*
+ * Unexpected error. Could go on to
+ * the next one, but just abort to
+ * be safe.
+ */
+ failed = mlp;
+ break;
+ }
+
+ pr_debug("created region %d: %016llx - %016llx\n",
+ nr, (u64)mlp->base, (u64)mlp->base + size);
+ cmrp = &cma->ranges[nr++];
+ cmrp->base_pfn = PHYS_PFN(mlp->base);
+ cmrp->early_pfn = cmrp->base_pfn;
+ cmrp->count = size >> PAGE_SHIFT;
+
+ sizeleft -= size;
+ if (sizeleft == 0)
+ break;
+ }
+
+ if (failed) {
+ list_for_each(mp, &final_ranges) {
+ mlp = list_entry(mp, struct cma_init_memrange, list);
+ if (mlp == failed)
+ break;
+ memblock_phys_free(mlp->base, mlp->size);
+ }
+ cma_drop_area(cma);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ cma->nranges = nr;
+ cma->nid = nid;
+ *res_cma = cma;
+
+out:
+ if (ret != 0)
+ pr_err("Failed to reserve %lu MiB\n",
+ (unsigned long)total_size / SZ_1M);
+ else
+ pr_info("Reserved %lu MiB in %d range%s\n",
+ (unsigned long)total_size / SZ_1M, nr, str_plural(nr));
+ return ret;
+}
+
+/**
+ * cma_declare_contiguous_nid() - reserve custom contiguous area
+ * @base: Base address of the reserved area optional, use 0 for any
+ * @size: Size of the reserved area (in bytes),
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ * @alignment: Alignment for the CMA area, should be power of 2 or zero
+ * @order_per_bit: Order of pages represented by one bit on bitmap.
+ * @fixed: hint about where to place the reserved area
+ * @name: The name of the area. See function cma_init_reserved_mem()
+ * @res_cma: Pointer to store the created cma region.
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the early allocator (memblock or bootmem)
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory. This function allows to create custom reserved areas.
+ *
+ * If @fixed is true, reserve contiguous area at exactly @base. If false,
+ * reserve in range from @base to @limit.
+ */
+int __init cma_declare_contiguous_nid(phys_addr_t base,
+ phys_addr_t size, phys_addr_t limit,
+ phys_addr_t alignment, unsigned int order_per_bit,
+ bool fixed, const char *name, struct cma **res_cma,
+ int nid)
+{
+ int ret;
+
+ ret = __cma_declare_contiguous_nid(&base, size, limit, alignment,
+ order_per_bit, fixed, name, res_cma, nid);
+ if (ret != 0)
+ pr_err("Failed to reserve %ld MiB\n",
+ (unsigned long)size / SZ_1M);
+ else
+ pr_info("Reserved %ld MiB at %pa\n",
+ (unsigned long)size / SZ_1M, &base);
-free_mem:
- memblock_phys_free(base, size);
-err:
- pr_err("Failed to reserve %ld MiB on node %d\n", (unsigned long)size / SZ_1M,
- nid);
return ret;
}
static void cma_debug_show_areas(struct cma *cma)
{
- unsigned long next_zero_bit, next_set_bit, nr_zero;
- unsigned long start = 0;
- unsigned long nr_part, nr_total = 0;
- unsigned long nbits = cma_bitmap_maxno(cma);
+ unsigned long start, end;
+ unsigned long nr_part;
+ unsigned long nbits;
+ int r;
+ struct cma_memrange *cmr;
spin_lock_irq(&cma->lock);
pr_info("number of available pages: ");
- for (;;) {
- next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start);
- if (next_zero_bit >= nbits)
- break;
- next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit);
- nr_zero = next_set_bit - next_zero_bit;
- nr_part = nr_zero << cma->order_per_bit;
- pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part,
- next_zero_bit);
- nr_total += nr_part;
- start = next_zero_bit + nr_zero;
- }
- pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+
+ nbits = cma_bitmap_maxno(cma, cmr);
+
+ pr_info("range %d: ", r);
+ for_each_clear_bitrange(start, end, cmr->bitmap, nbits) {
+ nr_part = (end - start) << cma->order_per_bit;
+ pr_cont("%s%lu@%lu", start ? "+" : "", nr_part, start);
+ }
+ pr_info("\n");
+ }
+ pr_cont("=> %lu free of %lu total pages\n", cma->available_count,
+ cma->count);
spin_unlock_irq(&cma->lock);
}
-static struct page *__cma_alloc(struct cma *cma, unsigned long count,
- unsigned int align, gfp_t gfp)
+static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
+ unsigned long count, unsigned int align,
+ struct page **pagep, gfp_t gfp)
{
unsigned long mask, offset;
unsigned long pfn = -1;
unsigned long start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
- unsigned long i;
+ int ret = -EBUSY;
struct page *page = NULL;
- int ret = -ENOMEM;
- const char *name = cma ? cma->name : NULL;
-
- trace_cma_alloc_start(name, count, align);
-
- if (!cma || !cma->count || !cma->bitmap)
- return page;
-
- pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__,
- (void *)cma, cma->name, count, align);
-
- if (!count)
- return page;
mask = cma_bitmap_aligned_mask(cma, align);
- offset = cma_bitmap_aligned_offset(cma, align);
- bitmap_maxno = cma_bitmap_maxno(cma);
+ offset = cma_bitmap_aligned_offset(cma, cmr, align);
+ bitmap_maxno = cma_bitmap_maxno(cma, cmr);
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
if (bitmap_count > bitmap_maxno)
- return page;
+ goto out;
for (;;) {
spin_lock_irq(&cma->lock);
- bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
+ /*
+ * If the request is larger than the available number
+ * of pages, stop right away.
+ */
+ if (count > cma->available_count) {
+ spin_unlock_irq(&cma->lock);
+ break;
+ }
+ bitmap_no = bitmap_find_next_zero_area_off(cmr->bitmap,
bitmap_maxno, start, bitmap_count, mask,
offset);
if (bitmap_no >= bitmap_maxno) {
spin_unlock_irq(&cma->lock);
break;
}
- bitmap_set(cma->bitmap, bitmap_no, bitmap_count);
+ bitmap_set(cmr->bitmap, bitmap_no, bitmap_count);
+ cma->available_count -= count;
/*
* It's safe to drop the lock here. We've marked this region for
* our exclusive use. If the migration fails we will take the
@@ -459,16 +821,16 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
*/
spin_unlock_irq(&cma->lock);
- pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
- mutex_lock(&cma_mutex);
- ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, gfp);
- mutex_unlock(&cma_mutex);
+ pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit);
+ mutex_lock(&cma->alloc_mutex);
+ ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp);
+ mutex_unlock(&cma->alloc_mutex);
if (ret == 0) {
page = pfn_to_page(pfn);
break;
}
- cma_clear_bitmap(cma, pfn, count);
+ cma_clear_bitmap(cma, cmr, pfn, count);
if (ret != -EBUSY)
break;
@@ -480,6 +842,38 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
/* try again with a bit different memory target */
start = bitmap_no + mask + 1;
}
+out:
+ *pagep = page;
+ return ret;
+}
+
+static struct page *__cma_alloc(struct cma *cma, unsigned long count,
+ unsigned int align, gfp_t gfp)
+{
+ struct page *page = NULL;
+ int ret = -ENOMEM, r;
+ unsigned long i;
+ const char *name = cma ? cma->name : NULL;
+
+ if (!cma || !cma->count)
+ return page;
+
+ pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__,
+ (void *)cma, cma->name, count, align);
+
+ if (!count)
+ return page;
+
+ trace_cma_alloc_start(name, count, align);
+
+ for (r = 0; r < cma->nranges; r++) {
+ page = NULL;
+
+ ret = cma_range_alloc(cma, &cma->ranges[r], count, align,
+ &page, gfp);
+ if (ret != -EBUSY || page)
+ break;
+ }
/*
* CMA can allocate multiple page blocks, which results in different
@@ -498,7 +892,8 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
}
pr_debug("%s(): returned %p\n", __func__, page);
- trace_cma_alloc_finish(name, pfn, page, count, align, ret);
+ trace_cma_alloc_finish(name, page ? page_to_pfn(page) : 0,
+ page, count, align, ret);
if (page) {
count_vm_event(CMA_ALLOC_SUCCESS);
cma_sysfs_account_success_pages(cma, count);
@@ -541,20 +936,31 @@ struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp)
bool cma_pages_valid(struct cma *cma, const struct page *pages,
unsigned long count)
{
- unsigned long pfn;
+ unsigned long pfn, end;
+ int r;
+ struct cma_memrange *cmr;
+ bool ret;
- if (!cma || !pages)
+ if (!cma || !pages || count > cma->count)
return false;
pfn = page_to_pfn(pages);
+ ret = false;
- if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) {
- pr_debug("%s(page %p, count %lu)\n", __func__,
- (void *)pages, count);
- return false;
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ end = cmr->base_pfn + cmr->count;
+ if (pfn >= cmr->base_pfn && pfn < end) {
+ ret = pfn + count <= end;
+ break;
+ }
}
- return true;
+ if (!ret)
+ pr_debug("%s(page %p, count %lu)\n",
+ __func__, (void *)pages, count);
+
+ return ret;
}
/**
@@ -570,19 +976,32 @@ bool cma_pages_valid(struct cma *cma, const struct page *pages,
bool cma_release(struct cma *cma, const struct page *pages,
unsigned long count)
{
- unsigned long pfn;
+ struct cma_memrange *cmr;
+ unsigned long pfn, end_pfn;
+ int r;
+
+ pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
if (!cma_pages_valid(cma, pages, count))
return false;
- pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
-
pfn = page_to_pfn(pages);
+ end_pfn = pfn + count;
+
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ if (pfn >= cmr->base_pfn &&
+ pfn < (cmr->base_pfn + cmr->count)) {
+ VM_BUG_ON(end_pfn > cmr->base_pfn + cmr->count);
+ break;
+ }
+ }
- VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
+ if (r == cma->nranges)
+ return false;
free_contig_range(pfn, count);
- cma_clear_bitmap(cma, pfn, count);
+ cma_clear_bitmap(cma, cmr, pfn, count);
cma_sysfs_account_release_pages(cma, count);
trace_cma_release(cma->name, pfn, pages, count);
@@ -610,3 +1029,86 @@ int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data)
return 0;
}
+
+bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end)
+{
+ int r;
+ struct cma_memrange *cmr;
+ unsigned long rstart, rend;
+
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+
+ rstart = PFN_PHYS(cmr->base_pfn);
+ rend = PFN_PHYS(cmr->base_pfn + cmr->count);
+ if (end < rstart)
+ continue;
+ if (start >= rend)
+ continue;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Very basic function to reserve memory from a CMA area that has not
+ * yet been activated. This is expected to be called early, when the
+ * system is single-threaded, so there is no locking. The alignment
+ * checking is restrictive - only pageblock-aligned areas
+ * (CMA_MIN_ALIGNMENT_BYTES) may be reserved through this function.
+ * This keeps things simple, and is enough for the current use case.
+ *
+ * The CMA bitmaps have not yet been allocated, so just start
+ * reserving from the bottom up, using a PFN to keep track
+ * of what has been reserved. Unreserving is not possible.
+ *
+ * The caller is responsible for initializing the page structures
+ * in the area properly, since this just points to memblock-allocated
+ * memory. The caller should subsequently use init_cma_pageblock to
+ * set the migrate type and CMA stats the pageblocks that were reserved.
+ *
+ * If the CMA area fails to activate later, memory obtained through
+ * this interface is not handed to the page allocator, this is
+ * the responsibility of the caller (e.g. like normal memblock-allocated
+ * memory).
+ */
+void __init *cma_reserve_early(struct cma *cma, unsigned long size)
+{
+ int r;
+ struct cma_memrange *cmr;
+ unsigned long available;
+ void *ret = NULL;
+
+ if (!cma || !cma->count)
+ return NULL;
+ /*
+ * Can only be called early in init.
+ */
+ if (test_bit(CMA_ACTIVATED, &cma->flags))
+ return NULL;
+
+ if (!IS_ALIGNED(size, CMA_MIN_ALIGNMENT_BYTES))
+ return NULL;
+
+ if (!IS_ALIGNED(size, (PAGE_SIZE << cma->order_per_bit)))
+ return NULL;
+
+ size >>= PAGE_SHIFT;
+
+ if (size > cma->available_count)
+ return NULL;
+
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ available = cmr->count - (cmr->early_pfn - cmr->base_pfn);
+ if (size <= available) {
+ ret = phys_to_virt(PFN_PHYS(cmr->early_pfn));
+ cmr->early_pfn += size;
+ cma->available_count -= size;
+ return ret;
+ }
+ }
+
+ return ret;
+}
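
The mm/cma.c changes above introduce multi-range areas and two new entry points, cma_declare_contiguous_multi() and cma_reserve_early(). Below is a minimal sketch of how an early-boot caller might combine them; the caller name, area name, size and header assumptions are hypothetical and not part of the patch.

/* Illustrative sketch only: an early-boot user of the multi-range API. */
#include <linux/cma.h>		/* declarations assumed visible to the caller */
#include <linux/init.h>
#include <linux/printk.h>
#include <linux/sizes.h>

static struct cma *example_cma;

static void __init example_cma_setup(int nid)
{
	void *buf;

	/* Ask for 4 GiB in total; the area may end up spanning several ranges. */
	if (cma_declare_contiguous_multi(SZ_4G, CMA_MIN_ALIGNMENT_BYTES, 0,
					 "example", &example_cma, nid))
		return;

	/*
	 * Memory taken before the area is activated comes straight from
	 * memblock; as the comment above cma_reserve_early() notes, the
	 * caller must initialise the struct pages for it.
	 */
	buf = cma_reserve_early(example_cma, CMA_MIN_ALIGNMENT_BYTES);
	if (!buf)
		pr_warn("example: no early CMA reservation made\n");
}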
diff --git a/mm/cma.h b/mm/cma.h
index ad61cc6dd439..c70180c36559 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -10,18 +10,45 @@ struct cma_kobject {
struct cma *cma;
};
+/*
+ * Multi-range support. This can be useful if the size of the allocation
+ * is not expected to be larger than the alignment (like with hugetlb_cma),
+ * and the total amount of memory requested, while smaller than the total
+ * amount of memory available, is large enough that it doesn't fit in a
+ * single physical memory range because of memory holes.
+ *
+ * Fields:
+ * @base_pfn: physical address of range
+ * @early_pfn: first PFN not reserved through cma_reserve_early
+ * @count: size of range
+ * @bitmap: bitmap of allocated (1 << order_per_bit)-sized chunks.
+ */
+struct cma_memrange {
+ unsigned long base_pfn;
+ unsigned long count;
+ union {
+ unsigned long early_pfn;
+ unsigned long *bitmap;
+ };
+#ifdef CONFIG_CMA_DEBUGFS
+ struct debugfs_u32_array dfs_bitmap;
+#endif
+};
+#define CMA_MAX_RANGES 8
+
struct cma {
- unsigned long base_pfn;
unsigned long count;
- unsigned long *bitmap;
+ unsigned long available_count;
unsigned int order_per_bit; /* Order of pages represented by one bit */
spinlock_t lock;
+ struct mutex alloc_mutex;
#ifdef CONFIG_CMA_DEBUGFS
struct hlist_head mem_head;
spinlock_t mem_head_lock;
- struct debugfs_u32_array dfs_bitmap;
#endif
char name[CMA_MAX_NAME];
+ int nranges;
+ struct cma_memrange ranges[CMA_MAX_RANGES];
#ifdef CONFIG_CMA_SYSFS
/* the number of CMA page successful allocations */
atomic64_t nr_pages_succeeded;
@@ -32,15 +59,25 @@ struct cma {
/* kobject requires dynamic object */
struct cma_kobject *cma_kobj;
#endif
- bool reserve_pages_on_error;
+ unsigned long flags;
+ /* NUMA node (NUMA_NO_NODE if unspecified) */
+ int nid;
+};
+
+enum cma_flags {
+ CMA_RESERVE_PAGES_ON_ERROR,
+ CMA_ZONES_VALID,
+ CMA_ZONES_INVALID,
+ CMA_ACTIVATED,
};
extern struct cma cma_areas[MAX_CMA_AREAS];
-extern unsigned cma_area_count;
+extern unsigned int cma_area_count;
-static inline unsigned long cma_bitmap_maxno(struct cma *cma)
+static inline unsigned long cma_bitmap_maxno(struct cma *cma,
+ struct cma_memrange *cmr)
{
- return cma->count >> cma->order_per_bit;
+ return cmr->count >> cma->order_per_bit;
}
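
As a quick sanity check of the per-range bitmap sizing above, one bit now covers (1 << order_per_bit) pages of a single range; a rough worked example with made-up numbers:

/*
 * Illustrative arithmetic only: a range of 64 MiB (16384 pages of 4 KiB)
 * with order_per_bit = 4 needs 16384 >> 4 = 1024 bits in its bitmap.
 */
static unsigned long example_bitmap_bits(void)
{
	unsigned long range_pages = 16384;	/* cmr->count, hypothetical */
	unsigned int order_per_bit = 4;		/* cma->order_per_bit, hypothetical */

	return range_pages >> order_per_bit;	/* 1024 */
}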
#ifdef CONFIG_CMA_SYSFS
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 602fff89b15f..8c7d7f8e8fbd 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -34,13 +34,10 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
static int cma_used_get(void *data, u64 *val)
{
struct cma *cma = data;
- unsigned long used;
spin_lock_irq(&cma->lock);
- /* pages counter is smaller than sizeof(int) */
- used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma));
+ *val = cma->count - cma->available_count;
spin_unlock_irq(&cma->lock);
- *val = (u64)used << cma->order_per_bit;
return 0;
}
@@ -49,17 +46,18 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");
static int cma_maxchunk_get(void *data, u64 *val)
{
struct cma *cma = data;
+ struct cma_memrange *cmr;
unsigned long maxchunk = 0;
- unsigned long start, end = 0;
- unsigned long bitmap_maxno = cma_bitmap_maxno(cma);
+ unsigned long start, end;
+ unsigned long bitmap_maxno;
+ int r;
spin_lock_irq(&cma->lock);
- for (;;) {
- start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
- if (start >= bitmap_maxno)
- break;
- end = find_next_bit(cma->bitmap, bitmap_maxno, start);
- maxchunk = max(end - start, maxchunk);
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ bitmap_maxno = cma_bitmap_maxno(cma, cmr);
+ for_each_clear_bitrange(start, end, cmr->bitmap, bitmap_maxno)
+ maxchunk = max(end - start, maxchunk);
}
spin_unlock_irq(&cma->lock);
*val = (u64)maxchunk << cma->order_per_bit;
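
The rewritten loop takes the largest clear-bit run across all ranges, with for_each_clear_bitrange() yielding [start, end) spans of zero bits. A standalone, userspace-only illustration of the same idea (no kernel helpers, hypothetical bitmap contents):

#include <stdio.h>

/* Find the longest run of zero bits in a small example bitmap. */
int main(void)
{
	/* 0 = free chunk, 1 = allocated chunk; made-up contents */
	const int bits[16] = { 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1 };
	int run = 0, maxchunk = 0;

	for (int i = 0; i < 16; i++) {
		run = bits[i] ? 0 : run + 1;
		if (run > maxchunk)
			maxchunk = run;
	}
	/* maxchunk is in order_per_bit units; << order_per_bit gives pages */
	printf("largest free run: %d chunks\n", maxchunk);	/* prints 5 */
	return 0;
}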
@@ -162,24 +160,41 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
{
- struct dentry *tmp;
+ struct dentry *tmp, *dir, *rangedir;
+ int r;
+ char rdirname[12];
+ struct cma_memrange *cmr;
tmp = debugfs_create_dir(cma->name, root_dentry);
debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops);
debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops);
- debugfs_create_file("base_pfn", 0444, tmp,
- &cma->base_pfn, &cma_debugfs_fops);
debugfs_create_file("count", 0444, tmp, &cma->count, &cma_debugfs_fops);
debugfs_create_file("order_per_bit", 0444, tmp,
&cma->order_per_bit, &cma_debugfs_fops);
debugfs_create_file("used", 0444, tmp, cma, &cma_used_fops);
debugfs_create_file("maxchunk", 0444, tmp, cma, &cma_maxchunk_fops);
- cma->dfs_bitmap.array = (u32 *)cma->bitmap;
- cma->dfs_bitmap.n_elements = DIV_ROUND_UP(cma_bitmap_maxno(cma),
- BITS_PER_BYTE * sizeof(u32));
- debugfs_create_u32_array("bitmap", 0444, tmp, &cma->dfs_bitmap);
+ rangedir = debugfs_create_dir("ranges", tmp);
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ snprintf(rdirname, sizeof(rdirname), "%d", r);
+ dir = debugfs_create_dir(rdirname, rangedir);
+ debugfs_create_file("base_pfn", 0444, dir,
+ &cmr->base_pfn, &cma_debugfs_fops);
+ cmr->dfs_bitmap.array = (u32 *)cmr->bitmap;
+ cmr->dfs_bitmap.n_elements =
+ DIV_ROUND_UP(cma_bitmap_maxno(cma, cmr),
+ BITS_PER_BYTE * sizeof(u32));
+ debugfs_create_u32_array("bitmap", 0444, dir,
+ &cmr->dfs_bitmap);
+ }
+
+ /*
+ * Backward compatible symlinks to range 0 for base_pfn and bitmap.
+ */
+ debugfs_create_symlink("base_pfn", tmp, "ranges/0/base_pfn");
+ debugfs_create_symlink("bitmap", tmp, "ranges/0/bitmap");
}
static int __init cma_debugfs_init(void)
diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c
index f50db3973171..97acd3e5a6a5 100644
--- a/mm/cma_sysfs.c
+++ b/mm/cma_sysfs.c
@@ -62,6 +62,24 @@ static ssize_t release_pages_success_show(struct kobject *kobj,
}
CMA_ATTR_RO(release_pages_success);
+static ssize_t total_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+
+ return sysfs_emit(buf, "%lu\n", cma->count);
+}
+CMA_ATTR_RO(total_pages);
+
+static ssize_t available_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+
+ return sysfs_emit(buf, "%lu\n", cma->available_count);
+}
+CMA_ATTR_RO(available_pages);
+
static void cma_kobj_release(struct kobject *kobj)
{
struct cma *cma = cma_from_kobj(kobj);
@@ -75,6 +93,8 @@ static struct attribute *cma_attrs[] = {
&alloc_pages_success_attr.attr,
&alloc_pages_fail_attr.attr,
&release_pages_success_attr.attr,
+ &total_pages_attr.attr,
+ &available_pages_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(cma);
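
The two new read-only attributes expose the total and currently available pages of each CMA area. A small userspace sketch for reading them; the per-area directory is assumed to follow the existing /sys/kernel/mm/cma/<name>/ layout, and the area name below is made up:

#include <stdio.h>

/* Print total vs. available pages of one (hypothetical) CMA area. */
int main(void)
{
	const char *base = "/sys/kernel/mm/cma/hugetlb1";	/* example name */
	const char *files[] = { "total_pages", "available_pages" };
	char path[256];
	unsigned long val;

	for (int i = 0; i < 2; i++) {
		snprintf(path, sizeof(path), "%s/%s", base, files[i]);
		FILE *f = fopen(path, "r");
		if (!f)
			return 1;
		if (fscanf(f, "%lu", &val) == 1)
			printf("%s: %lu\n", files[i], val);
		fclose(f);
	}
	return 0;
}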
diff --git a/mm/compaction.c b/mm/compaction.c
index 77dbb9022b47..bf021b31c7ec 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -83,6 +83,7 @@ static inline bool is_via_compact_memory(int order) { return false; }
static struct page *mark_allocated_noprof(struct page *page, unsigned int order, gfp_t gfp_flags)
{
post_alloc_hook(page, order, __GFP_MOVABLE);
+ set_page_refcounted(page);
return page;
}
#define mark_allocated(...) alloc_hooks(mark_allocated_noprof(__VA_ARGS__))
@@ -113,39 +114,6 @@ static unsigned long release_free_list(struct list_head *freepages)
}
#ifdef CONFIG_COMPACTION
-bool PageMovable(struct page *page)
-{
- const struct movable_operations *mops;
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (!__PageMovable(page))
- return false;
-
- mops = page_movable_ops(page);
- if (mops)
- return true;
-
- return false;
-}
-
-void __SetPageMovable(struct page *page, const struct movable_operations *mops)
-{
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page);
- page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE);
-}
-EXPORT_SYMBOL(__SetPageMovable);
-
-void __ClearPageMovable(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageMovable(page), page);
- /*
- * This page still has the type of a movable page, but it's
- * actually not movable any more.
- */
- page->mapping = (void *)PAGE_MAPPING_MOVABLE;
-}
-EXPORT_SYMBOL(__ClearPageMovable);
/* Do not skip compaction more than 64 times */
#define COMPACT_MAX_DEFER_SHIFT 6
@@ -980,13 +948,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
if (PageHuge(page)) {
+ const unsigned int order = compound_order(page);
/*
* skip hugetlbfs if we are not compacting for pages
* bigger than its order. THPs and other compound pages
* are handled below.
*/
if (!cc->alloc_contig) {
- const unsigned int order = compound_order(page);
if (order <= MAX_PAGE_ORDER) {
low_pfn += (1UL << order) - 1;
@@ -1000,27 +968,27 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
locked = NULL;
}
- ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
+ folio = page_folio(page);
+ ret = isolate_or_dissolve_huge_folio(folio, &cc->migratepages);
/*
- * Fail isolation in case isolate_or_dissolve_huge_page()
+ * Fail isolation in case isolate_or_dissolve_huge_folio()
* reports an error. In case of -ENOMEM, abort right away.
*/
if (ret < 0) {
/* Do not report -EBUSY down the chain */
if (ret == -EBUSY)
ret = 0;
- low_pfn += compound_nr(page) - 1;
- nr_scanned += compound_nr(page) - 1;
+ low_pfn += (1UL << order) - 1;
+ nr_scanned += (1UL << order) - 1;
goto isolate_fail;
}
- if (PageHuge(page)) {
+ if (folio_test_hugetlb(folio)) {
/*
* Hugepage was successfully isolated and placed
* on the cc->migratepages list.
*/
- folio = page_folio(page);
low_pfn += folio_nr_pages(folio) - 1;
goto isolate_success_no_list;
}
@@ -1081,18 +1049,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* Skip any other type of page
*/
if (!PageLRU(page)) {
- /*
- * __PageMovable can return false positive so we need
- * to verify it under page_lock.
- */
- if (unlikely(__PageMovable(page)) &&
- !PageIsolated(page)) {
+ /* Isolation code will deal with any races. */
+ if (unlikely(page_has_movable_ops(page)) &&
+ !PageMovableOpsIsolated(page)) {
if (locked) {
unlock_page_lruvec_irqrestore(locked, flags);
locked = NULL;
}
- if (isolate_movable_page(page, mode)) {
+ if (isolate_movable_ops_page(page, mode)) {
folio = page_folio(page);
goto isolate_success;
}
@@ -1869,6 +1834,7 @@ again:
dst = (struct folio *)freepage;
post_alloc_hook(&dst->page, order, __GFP_MOVABLE);
+ set_page_refcounted(&dst->page);
if (order)
prep_compound_page(&dst->page, order);
cc->nr_freepages -= 1 << order;
@@ -2247,15 +2213,11 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat)
static unsigned int fragmentation_score_wmark(bool low)
{
- unsigned int wmark_low;
+ unsigned int wmark_low, leeway;
- /*
- * Cap the low watermark to avoid excessive compaction
- * activity in case a user sets the proactiveness tunable
- * close to 100 (maximum).
- */
- wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
- return low ? wmark_low : min(wmark_low + 10, 100U);
+ wmark_low = 100U - sysctl_compaction_proactiveness;
+ leeway = min(10U, wmark_low / 2);
+ return low ? wmark_low : min(wmark_low + leeway, 100U);
}
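
Worked numbers for the new leeway logic, mirrored in standalone C: with proactiveness 95 the low score watermark is 5 and the leeway shrinks to 2, so the high watermark becomes 7 instead of 15 under the previous flat low + 10 rule:

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }

/* Mirror of the watermark calculation above, for a few proactiveness values. */
int main(void)
{
	unsigned int proactiveness[] = { 20, 50, 95 };

	for (int i = 0; i < 3; i++) {
		unsigned int wmark_low = 100u - proactiveness[i];
		unsigned int leeway = min_u(10u, wmark_low / 2);
		unsigned int wmark_high = min_u(wmark_low + leeway, 100u);

		printf("proactiveness=%u -> low=%u high=%u\n",
		       proactiveness[i], wmark_low, wmark_high);
	}
	return 0;	/* 20 -> 80/90, 50 -> 50/60, 95 -> 5/7 */
}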
static bool should_proactive_compact_node(pg_data_t *pgdat)
@@ -2326,11 +2288,26 @@ static enum compact_result __compact_finished(struct compact_control *cc)
if (!pageblock_aligned(cc->migrate_pfn))
return COMPACT_CONTINUE;
+ /*
+ * When defrag_mode is enabled, make kcompactd target
+ * watermarks in whole pageblocks. Because they can be stolen
+ * without polluting, no further fallback checks are needed.
+ */
+ if (defrag_mode && !cc->direct_compaction) {
+ if (__zone_watermark_ok(cc->zone, cc->order,
+ high_wmark_pages(cc->zone),
+ cc->highest_zoneidx, cc->alloc_flags,
+ zone_page_state(cc->zone,
+ NR_FREE_PAGES_BLOCKS)))
+ return COMPACT_SUCCESS;
+
+ return COMPACT_CONTINUE;
+ }
+
/* Direct compactor: Is a suitable page free? */
ret = COMPACT_NO_SUITABLE_PAGE;
for (order = cc->order; order < NR_PAGE_ORDERS; order++) {
struct free_area *area = &cc->zone->free_area[order];
- bool can_steal;
/* Job done if page is free of the right migratetype */
if (!free_area_empty(area, migratetype))
@@ -2346,8 +2323,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
* Job done if allocation would steal freepages from
* other migratetype buddy lists.
*/
- if (find_suitable_fallback(area, order, migratetype,
- true, &can_steal) != -1)
+ if (find_suitable_fallback(area, order, migratetype, true) >= 0)
/*
* Movable pages are OK in any pageblock. If we are
* stealing for a non-movable allocation, make sure
@@ -2379,40 +2355,42 @@ static enum compact_result compact_finished(struct compact_control *cc)
}
static bool __compaction_suitable(struct zone *zone, int order,
- int highest_zoneidx,
- unsigned long wmark_target)
+ unsigned long watermark, int highest_zoneidx,
+ unsigned long free_pages)
{
- unsigned long watermark;
/*
* Watermarks for order-0 must be met for compaction to be able to
* isolate free pages for migration targets. This means that the
- * watermark and alloc_flags have to match, or be more pessimistic than
- * the check in __isolate_free_page(). We don't use the direct
- * compactor's alloc_flags, as they are not relevant for freepage
- * isolation. We however do use the direct compactor's highest_zoneidx
- * to skip over zones where lowmem reserves would prevent allocation
- * even if compaction succeeds.
- * For costly orders, we require low watermark instead of min for
- * compaction to proceed to increase its chances.
+ * watermark has to match, or be more pessimistic than the check in
+ * __isolate_free_page().
+ *
+ * For costly orders, we require a higher watermark for compaction to
+ * proceed to increase its chances.
+ *
+ * We use the direct compactor's highest_zoneidx to skip over zones
+ * where lowmem reserves would prevent allocation even if compaction
+ * succeeds.
+ *
* ALLOC_CMA is used, as pages in CMA pageblocks are considered
- * suitable migration targets
+ * suitable migration targets.
*/
- watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
- low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ watermark += low_wmark_pages(zone) - min_wmark_pages(zone);
return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
- ALLOC_CMA, wmark_target);
+ ALLOC_CMA, free_pages);
}
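
To make the reworked watermark plumbing concrete: the caller now supplies the base watermark, and this helper only adds the compaction gap, plus the low-minus-min delta for costly orders. A rough arithmetic sketch with made-up zone numbers, treating the gap as an opaque input:

#include <stdio.h>

/* Hypothetical zone numbers, in pages, just to show the adjustment. */
int main(void)
{
	unsigned long watermark = 2048;		/* passed in by the caller */
	unsigned long gap = 512;		/* compact_gap(order), taken as given */
	unsigned long low_wmark = 3072, min_wmark = 2048;
	int costly = 1;				/* order > PAGE_ALLOC_COSTLY_ORDER */

	watermark += gap;
	if (costly)
		watermark += low_wmark - min_wmark;

	/* This is the order-0 target compared against free_pages. */
	printf("effective watermark: %lu pages\n", watermark);	/* 3584 */
	return 0;
}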
/*
* compaction_suitable: Is this suitable to run compaction on this zone now?
*/
-bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
+bool compaction_suitable(struct zone *zone, int order, unsigned long watermark,
+ int highest_zoneidx)
{
enum compact_result compact_result;
bool suitable;
- suitable = __compaction_suitable(zone, order, highest_zoneidx,
+ suitable = __compaction_suitable(zone, order, watermark, highest_zoneidx,
zone_page_state(zone, NR_FREE_PAGES));
/*
* fragmentation index determines if allocation failures are due to
@@ -2450,6 +2428,7 @@ bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
return suitable;
}
+/* Used by direct reclaimers */
bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
int alloc_flags)
{
@@ -2472,8 +2451,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
*/
available = zone_reclaimable_pages(zone) / order;
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
- if (__compaction_suitable(zone, order, ac->highest_zoneidx,
- available))
+ if (__compaction_suitable(zone, order, min_wmark_pages(zone),
+ ac->highest_zoneidx, available))
return true;
}
@@ -2489,16 +2468,40 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
*/
static enum compact_result
compaction_suit_allocation_order(struct zone *zone, unsigned int order,
- int highest_zoneidx, unsigned int alloc_flags)
+ int highest_zoneidx, unsigned int alloc_flags,
+ bool async, bool kcompactd)
{
+ unsigned long free_pages;
unsigned long watermark;
+ if (kcompactd && defrag_mode)
+ free_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS);
+ else
+ free_pages = zone_page_state(zone, NR_FREE_PAGES);
+
watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
- if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
- alloc_flags))
+ if (__zone_watermark_ok(zone, order, watermark, highest_zoneidx,
+ alloc_flags, free_pages))
return COMPACT_SUCCESS;
- if (!compaction_suitable(zone, order, highest_zoneidx))
+ /*
+ * For unmovable allocations (without ALLOC_CMA), check if there is enough
+ * free memory in the non-CMA pageblocks. Otherwise compaction could form
+ * the high-order page in CMA pageblocks, which would not help the
+ * allocation to succeed. However, limit the check to costly order async
+ * compaction (such as opportunistic THP attempts) because there is the
+ * possibility that compaction would migrate pages from non-CMA to CMA
+ * pageblock.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER && async &&
+ !(alloc_flags & ALLOC_CMA)) {
+ if (!__zone_watermark_ok(zone, 0, watermark + compact_gap(order),
+ highest_zoneidx, 0,
+ zone_page_state(zone, NR_FREE_PAGES)))
+ return COMPACT_SKIPPED;
+ }
+
+ if (!compaction_suitable(zone, order, watermark, highest_zoneidx))
return COMPACT_SKIPPED;
return COMPACT_CONTINUE;
@@ -2533,7 +2536,9 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
if (!is_via_compact_memory(cc->order)) {
ret = compaction_suit_allocation_order(cc->zone, cc->order,
cc->highest_zoneidx,
- cc->alloc_flags);
+ cc->alloc_flags,
+ cc->mode == MIGRATE_ASYNC,
+ !cc->direct_compaction);
if (ret != COMPACT_CONTINUE)
return ret;
}
@@ -3027,6 +3032,8 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
struct zone *zone;
enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx;
enum compact_result ret;
+ unsigned int alloc_flags = defrag_mode ?
+ ALLOC_WMARK_HIGH : ALLOC_WMARK_MIN;
for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) {
zone = &pgdat->node_zones[zoneid];
@@ -3036,7 +3043,8 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
ret = compaction_suit_allocation_order(zone,
pgdat->kcompactd_max_order,
- highest_zoneidx, ALLOC_WMARK_MIN);
+ highest_zoneidx, alloc_flags,
+ false, true);
if (ret == COMPACT_CONTINUE)
return true;
}
@@ -3059,6 +3067,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = false,
.gfp_mask = GFP_KERNEL,
+ .alloc_flags = defrag_mode ? ALLOC_WMARK_HIGH : ALLOC_WMARK_MIN,
};
enum compact_result ret;
@@ -3077,7 +3086,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
continue;
ret = compaction_suit_allocation_order(zone,
- cc.order, zoneid, ALLOC_WMARK_MIN);
+ cc.order, zoneid, cc.alloc_flags,
+ false, true);
if (ret != COMPACT_CONTINUE)
continue;
@@ -3155,15 +3165,9 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
static int kcompactd(void *p)
{
pg_data_t *pgdat = (pg_data_t *)p;
- struct task_struct *tsk = current;
long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC);
long timeout = default_timeout;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
-
- if (!cpumask_empty(cpumask))
- set_cpus_allowed_ptr(tsk, cpumask);
-
current->flags |= PF_KCOMPACTD;
set_freezable();
@@ -3237,10 +3241,12 @@ void __meminit kcompactd_run(int nid)
if (pgdat->kcompactd)
return;
- pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
+ pgdat->kcompactd = kthread_create_on_node(kcompactd, pgdat, nid, "kcompactd%d", nid);
if (IS_ERR(pgdat->kcompactd)) {
pr_err("Failed to start kcompactd on node %d\n", nid);
pgdat->kcompactd = NULL;
+ } else {
+ wake_up_process(pgdat->kcompactd);
}
}
@@ -3258,30 +3264,6 @@ void __meminit kcompactd_stop(int nid)
}
}
-/*
- * It's optimal to keep kcompactd on the same CPUs as their memory, but
- * not required for correctness. So if the last cpu in a node goes
- * away, we get changed to run anywhere: as the first one comes back,
- * restore their cpu bindings.
- */
-static int kcompactd_cpu_online(unsigned int cpu)
-{
- int nid;
-
- for_each_node_state(nid, N_MEMORY) {
- pg_data_t *pgdat = NODE_DATA(nid);
- const struct cpumask *mask;
-
- mask = cpumask_of_node(pgdat->node_id);
-
- if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
- /* One of our CPUs online: restore mask */
- if (pgdat->kcompactd)
- set_cpus_allowed_ptr(pgdat->kcompactd, mask);
- }
- return 0;
-}
-
static int proc_dointvec_minmax_warn_RT_change(const struct ctl_table *table,
int write, void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -3301,7 +3283,7 @@ static int proc_dointvec_minmax_warn_RT_change(const struct ctl_table *table,
return ret;
}
-static struct ctl_table vm_compaction[] = {
+static const struct ctl_table vm_compaction[] = {
{
.procname = "compact_memory",
.data = &sysctl_compact_memory,
@@ -3341,15 +3323,6 @@ static struct ctl_table vm_compaction[] = {
static int __init kcompactd_init(void)
{
int nid;
- int ret;
-
- ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
- "mm/compaction:online",
- kcompactd_cpu_online, NULL);
- if (ret < 0) {
- pr_err("kcompactd: failed to register hotplug callbacks.\n");
- return ret;
- }
for_each_node_state(nid, N_MEMORY)
kcompactd_run(nid);
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index d0357f3e9372..b3171f9406c1 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -28,6 +28,7 @@ config DAMON_VADDR
bool "Data access monitoring operations for virtual address spaces"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
+ default DAMON
help
This builds the default data access monitoring operations for DAMON
that work for virtual address spaces.
@@ -36,6 +37,7 @@ config DAMON_PADDR
bool "Data access monitoring operations for the physical address space"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
+ default DAMON
help
This builds the default data access monitoring operations for DAMON
that works for the physical address space.
@@ -55,6 +57,7 @@ config DAMON_VADDR_KUNIT_TEST
config DAMON_SYSFS
bool "DAMON sysfs interface"
depends on DAMON && SYSFS
+ default DAMON
help
This builds the sysfs interface for DAMON. The user space can use
the interface for arbitrary data access monitoring.
@@ -71,36 +74,6 @@ config DAMON_SYSFS_KUNIT_TEST
If unsure, say N.
-config DAMON_DBGFS_DEPRECATED
- bool "DAMON debugfs interface (DEPRECATED!)"
- depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
- help
- This builds the debugfs interface for DAMON. The user space admins
- can use the interface for arbitrary data access monitoring.
-
- If unsure, say N.
-
- This is deprecated, so users should move to the sysfs interface
- (DAMON_SYSFS). If you depend on this and cannot move, please report
- your usecase to damon@lists.linux.dev and linux-mm@kvack.org.
-
-config DAMON_DBGFS
- bool
- default y
- depends on DAMON_DBGFS_DEPRECATED
-
-config DAMON_DBGFS_KUNIT_TEST
- bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS
- depends on DAMON_DBGFS && KUNIT=y
- default KUNIT_ALL_TESTS
- help
- This builds the DAMON debugfs interface Kunit test suite.
-
- For more information on KUnit and unit tests in general, please refer
- to the KUnit documentation.
-
- If unsure, say N.
-
config DAMON_RECLAIM
bool "Build DAMON-based reclaim (DAMON_RECLAIM)"
depends on DAMON_PADDR
@@ -121,4 +94,20 @@ config DAMON_LRU_SORT
protect frequently accessed (hot) pages while rarely accessed (cold)
pages reclaimed first under memory pressure.
+config DAMON_STAT
+ bool "Build data access monitoring stat (DAMON_STAT)"
+ depends on DAMON_PADDR
+ help
+ This builds the DAMON-based access monitoring statistics subsystem.
+ It runs DAMON and exposes access monitoring results in simple stat
+ metrics.
+
+config DAMON_STAT_ENABLED_DEFAULT
+ bool "Enable DAMON_STAT by default"
+ depends on DAMON_PADDR
+ default DAMON_STAT
+ help
+ Whether to enable DAMON_STAT by default. Users can disable it at
+ boot or runtime using its 'enabled' parameter.
+
endmenu
diff --git a/mm/damon/Makefile b/mm/damon/Makefile
index f7add3f4aa79..d8d6bf5f8bff 100644
--- a/mm/damon/Makefile
+++ b/mm/damon/Makefile
@@ -4,6 +4,6 @@ obj-y := core.o
obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o
obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o
obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o
-obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o
obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o
obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o
+obj-$(CONFIG_DAMON_STAT) += modules-common.o stat.o
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 0776452a1abb..533c1c2d72f2 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -14,6 +14,7 @@
#include <linux/psi.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#define CREATE_TRACE_POINTS
#include <trace/events/damon.h>
@@ -75,14 +76,13 @@ int damon_register_ops(struct damon_operations *ops)
if (ops->id >= NR_DAMON_OPS)
return -EINVAL;
+
mutex_lock(&damon_ops_lock);
/* Fail for already registered ops */
- if (__damon_is_registered_ops(ops->id)) {
+ if (__damon_is_registered_ops(ops->id))
err = -EINVAL;
- goto out;
- }
- damon_registered_ops[ops->id] = *ops;
-out:
+ else
+ damon_registered_ops[ops->id] = *ops;
mutex_unlock(&damon_ops_lock);
return err;
}
@@ -266,7 +266,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
}
struct damos_filter *damos_new_filter(enum damos_filter_type type,
- bool matching)
+ bool matching, bool allow)
{
struct damos_filter *filter;
@@ -275,13 +275,36 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type,
return NULL;
filter->type = type;
filter->matching = matching;
+ filter->allow = allow;
INIT_LIST_HEAD(&filter->list);
return filter;
}
+/**
+ * damos_filter_for_ops() - Return whether the filter is an ops-handled one.
+ * @type: type of the filter.
+ *
+ * Return: true if the filter of @type needs to be handled by ops layer, false
+ * otherwise.
+ */
+bool damos_filter_for_ops(enum damos_filter_type type)
+{
+ switch (type) {
+ case DAMOS_FILTER_TYPE_ADDR:
+ case DAMOS_FILTER_TYPE_TARGET:
+ return false;
+ default:
+ break;
+ }
+ return true;
+}
+
void damos_add_filter(struct damos *s, struct damos_filter *f)
{
- list_add_tail(&f->list, &s->filters);
+ if (damos_filter_for_ops(f->type))
+ list_add_tail(&f->list, &s->ops_filters);
+ else
+ list_add_tail(&f->list, &s->filters);
}
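
A hedged sketch of the updated filter API: damos_new_filter() now also takes the allow/reject decision, and damos_add_filter() routes the filter to the core-handled or ops-handled list based on its type. The helper name is hypothetical, the scheme is assumed to come from damon_new_scheme(), and the addr_range field is assumed from the existing DAMOS filter definition:

/*
 * Illustrative only: build one core-handled filter (address range,
 * reject on match) and attach it to an existing scheme "s".
 */
static int example_add_addr_filter(struct damos *s,
		unsigned long start, unsigned long end)
{
	struct damos_filter *f;

	/* type, matching, allow: reject regions inside [start, end) */
	f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true, false);
	if (!f)
		return -ENOMEM;
	f->addr_range.start = start;
	f->addr_range.end = end;

	/* DAMOS_FILTER_TYPE_ADDR is core-handled, so this lands on s->filters */
	damos_add_filter(s, f);
	return 0;
}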
static void damos_del_filter(struct damos_filter *f)
@@ -371,7 +394,9 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
* or damon_attrs are updated.
*/
scheme->next_apply_sis = 0;
+ scheme->walk_completed = false;
INIT_LIST_HEAD(&scheme->filters);
+ INIT_LIST_HEAD(&scheme->ops_filters);
scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
@@ -382,6 +407,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
scheme->wmarks = *wmarks;
scheme->wmarks.activated = true;
+ scheme->migrate_dests = (struct damos_migrate_dests){};
scheme->target_nid = target_nid;
return scheme;
@@ -424,6 +450,12 @@ void damon_destroy_scheme(struct damos *s)
damos_for_each_filter_safe(f, next, s)
damos_destroy_filter(f);
+
+ damos_for_each_ops_filter_safe(f, next, s)
+ damos_destroy_filter(f);
+
+ kfree(s->migrate_dests.node_id_arr);
+ kfree(s->migrate_dests.weight_arr);
damon_del_scheme(s);
damon_free_scheme(s);
}
@@ -473,8 +505,12 @@ void damon_free_target(struct damon_target *t)
kfree(t);
}
-void damon_destroy_target(struct damon_target *t)
+void damon_destroy_target(struct damon_target *t, struct damon_ctx *ctx)
{
+
+ if (ctx && ctx->ops.cleanup_target)
+ ctx->ops.cleanup_target(t);
+
damon_del_target(t);
damon_free_target(t);
}
@@ -499,11 +535,14 @@ struct damon_ctx *damon_new_ctx(void)
ctx->attrs.ops_update_interval = 60 * 1000 * 1000;
ctx->passed_sample_intervals = 0;
- /* These will be set from kdamond_init_intervals_sis() */
+ /* These will be set from kdamond_init_ctx() */
ctx->next_aggregation_sis = 0;
ctx->next_ops_update_sis = 0;
mutex_init(&ctx->kdamond_lock);
+ INIT_LIST_HEAD(&ctx->call_controls);
+ mutex_init(&ctx->call_controls_lock);
+ mutex_init(&ctx->walk_control_lock);
ctx->attrs.min_nr_regions = 10;
ctx->attrs.max_nr_regions = 1000;
@@ -518,13 +557,8 @@ static void damon_destroy_targets(struct damon_ctx *ctx)
{
struct damon_target *t, *next_t;
- if (ctx->ops.cleanup) {
- ctx->ops.cleanup(ctx);
- return;
- }
-
damon_for_each_target_safe(t, next_t, ctx)
- damon_destroy_target(t);
+ damon_destroy_target(t, ctx);
}
void damon_destroy_ctx(struct damon_ctx *ctx)
@@ -575,11 +609,25 @@ static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses,
}
static void damon_update_monitoring_result(struct damon_region *r,
- struct damon_attrs *old_attrs, struct damon_attrs *new_attrs)
-{
- r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses,
- old_attrs, new_attrs);
- r->nr_accesses_bp = r->nr_accesses * 10000;
+ struct damon_attrs *old_attrs, struct damon_attrs *new_attrs,
+ bool aggregating)
+{
+ if (!aggregating) {
+ r->nr_accesses = damon_nr_accesses_for_new_attrs(
+ r->nr_accesses, old_attrs, new_attrs);
+ r->nr_accesses_bp = r->nr_accesses * 10000;
+ } else {
+ /*
+ * If this is called in the middle of an aggregation, reset
+ * the aggregation made so far for this aggregation
+ * interval. In other words, make the status as if
+ * kdamond_reset_aggregated() had just been called.
+ */
+ r->last_nr_accesses = damon_nr_accesses_for_new_attrs(
+ r->last_nr_accesses, old_attrs, new_attrs);
+ r->nr_accesses_bp = r->last_nr_accesses * 10000;
+ r->nr_accesses = 0;
+ }
r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs);
}
@@ -592,7 +640,7 @@ static void damon_update_monitoring_result(struct damon_region *r,
* ->nr_accesses and ->age of given damon_ctx's regions for new damon_attrs.
*/
static void damon_update_monitoring_results(struct damon_ctx *ctx,
- struct damon_attrs *new_attrs)
+ struct damon_attrs *new_attrs, bool aggregating)
{
struct damon_attrs *old_attrs = &ctx->attrs;
struct damon_target *t;
@@ -607,7 +655,26 @@ static void damon_update_monitoring_results(struct damon_ctx *ctx,
damon_for_each_target(t, ctx)
damon_for_each_region(r, t)
damon_update_monitoring_result(
- r, old_attrs, new_attrs);
+ r, old_attrs, new_attrs, aggregating);
+}
+
+/*
+ * damon_valid_intervals_goal() - return if the intervals goal of @attrs is
+ * valid.
+ */
+static bool damon_valid_intervals_goal(struct damon_attrs *attrs)
+{
+ struct damon_intervals_goal *goal = &attrs->intervals_goal;
+
+ /* tuning is disabled */
+ if (!goal->aggrs)
+ return true;
+ if (goal->min_sample_us > goal->max_sample_us)
+ return false;
+ if (attrs->sample_interval < goal->min_sample_us ||
+ goal->max_sample_us < attrs->sample_interval)
+ return false;
+ return true;
}
/**
@@ -615,10 +682,8 @@ static void damon_update_monitoring_results(struct damon_ctx *ctx,
* @ctx: monitoring context
* @attrs: monitoring attributes
*
- * This function should be called while the kdamond is not running, or an
- * access check results aggregation is not ongoing (e.g., from
- * &struct damon_callback->after_aggregation or
- * &struct damon_callback->after_wmarks_check callbacks).
+ * This function should be called while the kdamond is not running, or an
+ * access check results aggregation is not ongoing (e.g., from damon_call()).
*
* Every time interval is in micro-seconds.
*
@@ -629,6 +694,11 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
unsigned long sample_interval = attrs->sample_interval ?
attrs->sample_interval : 1;
struct damos *s;
+ bool aggregating = ctx->passed_sample_intervals <
+ ctx->next_aggregation_sis;
+
+ if (!damon_valid_intervals_goal(attrs))
+ return -EINVAL;
if (attrs->min_nr_regions < 3)
return -EINVAL;
@@ -637,12 +707,16 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
if (attrs->sample_interval > attrs->aggr_interval)
return -EINVAL;
+ /* calls from outside the core don't set this. */
+ if (!attrs->aggr_samples)
+ attrs->aggr_samples = attrs->aggr_interval / sample_interval;
+
ctx->next_aggregation_sis = ctx->passed_sample_intervals +
attrs->aggr_interval / sample_interval;
ctx->next_ops_update_sis = ctx->passed_sample_intervals +
attrs->ops_update_interval / sample_interval;
- damon_update_monitoring_results(ctx, attrs);
+ damon_update_monitoring_results(ctx, attrs, aggregating);
ctx->attrs = *attrs;
damon_for_each_scheme(s, ctx)
@@ -685,6 +759,19 @@ static struct damos_quota_goal *damos_nth_quota_goal(
return NULL;
}
+static void damos_commit_quota_goal_union(
+ struct damos_quota_goal *dst, struct damos_quota_goal *src)
+{
+ switch (dst->metric) {
+ case DAMOS_QUOTA_NODE_MEM_USED_BP:
+ case DAMOS_QUOTA_NODE_MEM_FREE_BP:
+ dst->nid = src->nid;
+ break;
+ default:
+ break;
+ }
+}
+
static void damos_commit_quota_goal(
struct damos_quota_goal *dst, struct damos_quota_goal *src)
{
@@ -693,6 +780,7 @@ static void damos_commit_quota_goal(
if (dst->metric == DAMOS_QUOTA_USER_INPUT)
dst->current_value = src->current_value;
/* keep last_psi_total as is, since it will be updated in next cycle */
+ damos_commit_quota_goal_union(dst, src);
}
/**
@@ -705,7 +793,7 @@ static void damos_commit_quota_goal(
* DAMON contexts, instead of manual in-place updates.
*
* This function should be called from parameters-update safe context, like
- * DAMON callbacks.
+ * damon_call().
*/
int damos_commit_quota_goals(struct damos_quota *dst, struct damos_quota *src)
{
@@ -726,6 +814,7 @@ int damos_commit_quota_goals(struct damos_quota *dst, struct damos_quota *src)
src_goal->metric, src_goal->target_value);
if (!new_goal)
return -ENOMEM;
+ damos_commit_quota_goal(new_goal, src_goal);
damos_add_quota_goal(dst, new_goal);
}
return 0;
@@ -759,6 +848,18 @@ static struct damos_filter *damos_nth_filter(int n, struct damos *s)
return NULL;
}
+static struct damos_filter *damos_nth_ops_filter(int n, struct damos *s)
+{
+ struct damos_filter *filter;
+ int i = 0;
+
+ damos_for_each_ops_filter(filter, s) {
+ if (i++ == n)
+ return filter;
+ }
+ return NULL;
+}
+
static void damos_commit_filter_arg(
struct damos_filter *dst, struct damos_filter *src)
{
@@ -772,6 +873,9 @@ static void damos_commit_filter_arg(
case DAMOS_FILTER_TYPE_TARGET:
dst->target_idx = src->target_idx;
break;
+ case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE:
+ dst->sz_range = src->sz_range;
+ break;
default:
break;
}
@@ -782,10 +886,11 @@ static void damos_commit_filter(
{
dst->type = src->type;
dst->matching = src->matching;
+ dst->allow = src->allow;
damos_commit_filter_arg(dst, src);
}
-static int damos_commit_filters(struct damos *dst, struct damos *src)
+static int damos_commit_core_filters(struct damos *dst, struct damos *src)
{
struct damos_filter *dst_filter, *next, *src_filter, *new_filter;
int i = 0, j = 0;
@@ -803,7 +908,36 @@ static int damos_commit_filters(struct damos *dst, struct damos *src)
continue;
new_filter = damos_new_filter(
- src_filter->type, src_filter->matching);
+ src_filter->type, src_filter->matching,
+ src_filter->allow);
+ if (!new_filter)
+ return -ENOMEM;
+ damos_commit_filter_arg(new_filter, src_filter);
+ damos_add_filter(dst, new_filter);
+ }
+ return 0;
+}
+
+static int damos_commit_ops_filters(struct damos *dst, struct damos *src)
+{
+ struct damos_filter *dst_filter, *next, *src_filter, *new_filter;
+ int i = 0, j = 0;
+
+ damos_for_each_ops_filter_safe(dst_filter, next, dst) {
+ src_filter = damos_nth_ops_filter(i++, src);
+ if (src_filter)
+ damos_commit_filter(dst_filter, src_filter);
+ else
+ damos_destroy_filter(dst_filter);
+ }
+
+ damos_for_each_ops_filter_safe(src_filter, next, src) {
+ if (j++ < i)
+ continue;
+
+ new_filter = damos_new_filter(
+ src_filter->type, src_filter->matching,
+ src_filter->allow);
if (!new_filter)
return -ENOMEM;
damos_commit_filter_arg(new_filter, src_filter);
@@ -812,6 +946,81 @@ static int damos_commit_filters(struct damos *dst, struct damos *src)
return 0;
}
+/**
+ * damos_filters_default_reject() - decide whether to reject memory that did
+ * not match any of the given filters.
+ * @filters: Given DAMOS filters of a group.
+ */
+static bool damos_filters_default_reject(struct list_head *filters)
+{
+ struct damos_filter *last_filter;
+
+ if (list_empty(filters))
+ return false;
+ last_filter = list_last_entry(filters, struct damos_filter, list);
+ return last_filter->allow;
+}
+
+static void damos_set_filters_default_reject(struct damos *s)
+{
+ if (!list_empty(&s->ops_filters))
+ s->core_filters_default_reject = false;
+ else
+ s->core_filters_default_reject =
+ damos_filters_default_reject(&s->filters);
+ s->ops_filters_default_reject =
+ damos_filters_default_reject(&s->ops_filters);
+}
+
+static int damos_commit_dests(struct damos *dst, struct damos *src)
+{
+ struct damos_migrate_dests *dst_dests, *src_dests;
+
+ dst_dests = &dst->migrate_dests;
+ src_dests = &src->migrate_dests;
+
+ if (dst_dests->nr_dests != src_dests->nr_dests) {
+ kfree(dst_dests->node_id_arr);
+ kfree(dst_dests->weight_arr);
+
+ dst_dests->node_id_arr = kmalloc_array(src_dests->nr_dests,
+ sizeof(*dst_dests->node_id_arr), GFP_KERNEL);
+ if (!dst_dests->node_id_arr) {
+ dst_dests->weight_arr = NULL;
+ return -ENOMEM;
+ }
+
+ dst_dests->weight_arr = kmalloc_array(src_dests->nr_dests,
+ sizeof(*dst_dests->weight_arr), GFP_KERNEL);
+ if (!dst_dests->weight_arr) {
+ /* ->node_id_arr will be freed by scheme destruction */
+ return -ENOMEM;
+ }
+ }
+
+ dst_dests->nr_dests = src_dests->nr_dests;
+ for (int i = 0; i < src_dests->nr_dests; i++) {
+ dst_dests->node_id_arr[i] = src_dests->node_id_arr[i];
+ dst_dests->weight_arr[i] = src_dests->weight_arr[i];
+ }
+
+ return 0;
+}
+
+static int damos_commit_filters(struct damos *dst, struct damos *src)
+{
+ int err;
+
+ err = damos_commit_core_filters(dst, src);
+ if (err)
+ return err;
+ err = damos_commit_ops_filters(dst, src);
+ if (err)
+ return err;
+ damos_set_filters_default_reject(dst);
+ return 0;
+}
+
static struct damos *damon_nth_scheme(int n, struct damon_ctx *ctx)
{
struct damos *s;
@@ -837,6 +1046,11 @@ static int damos_commit(struct damos *dst, struct damos *src)
return err;
dst->wmarks = src->wmarks;
+ dst->target_nid = src->target_nid;
+
+ err = damos_commit_dests(dst, src);
+ if (err)
+ return err;
err = damos_commit_filters(dst, src);
return err;
@@ -952,9 +1166,15 @@ static int damon_commit_targets(
if (err)
return err;
} else {
- if (damon_target_has_pid(dst))
- put_pid(dst_target->pid);
- damon_destroy_target(dst_target);
+ struct damos *s;
+
+ damon_destroy_target(dst_target, dst);
+ damon_for_each_scheme(s, dst) {
+ if (s->quota.charge_target_from == dst_target) {
+ s->quota.charge_target_from = NULL;
+ s->quota.charge_addr_from = 0;
+ }
+ }
}
}
@@ -967,7 +1187,7 @@ static int damon_commit_targets(
err = damon_commit_target(new_target, false,
src_target, damon_target_has_pid(src));
if (err) {
- damon_destroy_target(new_target);
+ damon_destroy_target(new_target, NULL);
return err;
}
damon_add_target(dst, new_target);
@@ -986,7 +1206,7 @@ static int damon_commit_targets(
* in-place updates.
*
* This function should be called from parameters-update safe context, like
- * DAMON callbacks.
+ * damon_call().
*/
int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
{
@@ -1162,6 +1382,114 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs)
return err;
}
+/**
+ * damon_is_running() - Returns if a given DAMON context is running.
+ * @ctx: The DAMON context to see if running.
+ *
+ * Return: true if @ctx is running, false otherwise.
+ */
+bool damon_is_running(struct damon_ctx *ctx)
+{
+ bool running;
+
+ mutex_lock(&ctx->kdamond_lock);
+ running = ctx->kdamond != NULL;
+ mutex_unlock(&ctx->kdamond_lock);
+ return running;
+}
+
+/**
+ * damon_call() - Invoke a given function on DAMON worker thread (kdamond).
+ * @ctx: DAMON context to call the function for.
+ * @control: Control variable of the call request.
+ *
+ * Ask DAMON worker thread (kdamond) of @ctx to call a function with an
+ * argument data that respectively passed via &damon_call_control->fn and
+ * &damon_call_control->data of @control. If &damon_call_control->repeat of
+ * @control is set, return as soon as the request is made, and the kdamond
+ * handles it repeatedly. Otherwise, wait until the kdamond finishes handling
+ * of the request.
+ *
+ * The kdamond executes the function with the argument in the main loop, just
+ * after a sampling of the iteration is finished. The function can hence
+ * safely access the internal data of the &struct damon_ctx without additional
+ * synchronization. The return value of the function will be saved in
+ * &damon_call_control->return_code.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damon_call(struct damon_ctx *ctx, struct damon_call_control *control)
+{
+ if (!control->repeat)
+ init_completion(&control->completion);
+ control->canceled = false;
+ INIT_LIST_HEAD(&control->list);
+
+ mutex_lock(&ctx->call_controls_lock);
+ list_add_tail(&control->list, &ctx->call_controls);
+ mutex_unlock(&ctx->call_controls_lock);
+ if (!damon_is_running(ctx))
+ return -EINVAL;
+ if (control->repeat)
+ return 0;
+ wait_for_completion(&control->completion);
+ if (control->canceled)
+ return -ECANCELED;
+ return 0;
+}
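
A minimal usage sketch for damon_call(), assuming an already running context; the callback, its data struct, and the wrapper name are hypothetical. The non-repeat form shown blocks until the kdamond has run the function, or returns -ECANCELED if the kdamond exits first:

/* Illustrative only: read something out of a running DAMON context. */
struct example_call_data {
	struct damon_ctx *ctx;
	unsigned int nr_schemes;
};

static int count_schemes_fn(void *data)
{
	struct example_call_data *d = data;
	struct damos *s;

	/* Runs on the kdamond, so ctx internals are safe to read here. */
	d->nr_schemes = 0;
	damon_for_each_scheme(s, d->ctx)
		d->nr_schemes++;
	return 0;
}

static int example_damon_call(struct damon_ctx *ctx)
{
	struct example_call_data d = { .ctx = ctx };
	struct damon_call_control control = {
		.fn = count_schemes_fn,
		.data = &d,
		/* .repeat left false: block until the call is handled */
	};

	return damon_call(ctx, &control);
}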
+
+/**
+ * damos_walk() - Invoke a given function while DAMOS walks regions.
+ * @ctx: DAMON context to call the functions for.
+ * @control: Control variable of the walk request.
+ *
+ * Ask DAMON worker thread (kdamond) of @ctx to call a function for each region
+ * that the kdamond will apply a DAMOS action to, and wait until the kdamond
+ * finishes handling of the request.
+ *
+ * The kdamond executes the given function in the main loop, for each region
+ * just after it applied any DAMOS actions of @ctx to it. The invocation is
+ * made only within one &damos->apply_interval_us since damos_walk()
+ * invocation, for each scheme. The given callback function can hence safely
+ * access the internal data of &struct damon_ctx and &struct damon_region that
+ * each of the schemes will apply the action to in the next interval, without
+ * additional synchronizations against the kdamond. If every scheme of @ctx
+ * passed at least one &damos->apply_interval_us, kdamond marks the request as
+ * completed so that damos_walk() can wakeup and return.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control)
+{
+ init_completion(&control->completion);
+ control->canceled = false;
+ mutex_lock(&ctx->walk_control_lock);
+ if (ctx->walk_control) {
+ mutex_unlock(&ctx->walk_control_lock);
+ return -EBUSY;
+ }
+ ctx->walk_control = control;
+ mutex_unlock(&ctx->walk_control_lock);
+ if (!damon_is_running(ctx))
+ return -EINVAL;
+ wait_for_completion(&control->completion);
+ if (control->canceled)
+ return -ECANCELED;
+ return 0;
+}
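
A matching sketch for damos_walk(); the callback and wrapper names are hypothetical, and the callback signature follows the walk_fn invocation visible in damos_walk_call_walk() further down (data, ctx, target, region, scheme, bytes that passed the ops filters):

/* Illustrative only: sum up how many bytes the applied schemes covered. */
static void accumulate_walk_fn(void *data, struct damon_ctx *ctx,
		struct damon_target *t, struct damon_region *r,
		struct damos *s, unsigned long sz_filter_passed)
{
	unsigned long *total = data;

	*total += sz_filter_passed;
}

static int example_damos_walk(struct damon_ctx *ctx)
{
	unsigned long total = 0;
	struct damos_walk_control control = {
		.walk_fn = accumulate_walk_fn,
		.data = &total,
	};
	int err;

	err = damos_walk(ctx, &control);	/* blocks; -EBUSY if a walk is running */
	if (err)
		return err;
	pr_info("bytes passing ops filters: %lu\n", total);
	return 0;
}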
+
+/*
+ * Warn and fix corrupted ->nr_accesses[_bp] for investigations and preventing
+ * the problem being propagated.
+ */
+static void damon_warn_fix_nr_accesses_corruption(struct damon_region *r)
+{
+ if (r->nr_accesses_bp == r->nr_accesses * 10000)
+ return;
+ WARN_ONCE(true, "invalid nr_accesses_bp at reset: %u %u\n",
+ r->nr_accesses_bp, r->nr_accesses);
+ r->nr_accesses_bp = r->nr_accesses * 10000;
+}
+
/*
* Reset the aggregated monitoring results ('nr_accesses' of each region).
*/
@@ -1175,6 +1503,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
damon_for_each_region(r, t) {
trace_damon_aggregated(ti, r, damon_nr_regions(t));
+ damon_warn_fix_nr_accesses_corruption(r);
r->last_nr_accesses = r->nr_accesses;
r->nr_accesses = 0;
}
@@ -1182,6 +1511,67 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
}
}
+static unsigned long damon_get_intervals_score(struct damon_ctx *c)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ unsigned long sz_region, max_access_events = 0, access_events = 0;
+ unsigned long target_access_events;
+ unsigned long goal_bp = c->attrs.intervals_goal.access_bp;
+
+ damon_for_each_target(t, c) {
+ damon_for_each_region(r, t) {
+ sz_region = damon_sz_region(r);
+ max_access_events += sz_region * c->attrs.aggr_samples;
+ access_events += sz_region * r->nr_accesses;
+ }
+ }
+ target_access_events = max_access_events * goal_bp / 10000;
+ target_access_events = target_access_events ? : 1;
+ return access_events * 10000 / target_access_events;
+}
+
+static unsigned long damon_feed_loop_next_input(unsigned long last_input,
+ unsigned long score);
+
+static unsigned long damon_get_intervals_adaptation_bp(struct damon_ctx *c)
+{
+ unsigned long score_bp, adaptation_bp;
+
+ score_bp = damon_get_intervals_score(c);
+ adaptation_bp = damon_feed_loop_next_input(100000000, score_bp) /
+ 10000;
+ /*
+ * adaptation_bp ranges from 1 to 20,000. Avoid too rapid reduction of
+ * the intervals by rescaling [1, 10,000] to [5,000, 10,000].
+ */
+ if (adaptation_bp <= 10000)
+ adaptation_bp = 5000 + adaptation_bp / 2;
+ return adaptation_bp;
+}
+
+static void kdamond_tune_intervals(struct damon_ctx *c)
+{
+ unsigned long adaptation_bp;
+ struct damon_attrs new_attrs;
+ struct damon_intervals_goal *goal;
+
+ adaptation_bp = damon_get_intervals_adaptation_bp(c);
+ if (adaptation_bp == 10000)
+ return;
+
+ new_attrs = c->attrs;
+ goal = &c->attrs.intervals_goal;
+ new_attrs.sample_interval = min(goal->max_sample_us,
+ c->attrs.sample_interval * adaptation_bp / 10000);
+ new_attrs.sample_interval = max(goal->min_sample_us,
+ new_attrs.sample_interval);
+ new_attrs.aggr_interval = new_attrs.sample_interval *
+ c->attrs.aggr_samples;
+ trace_damon_monitor_intervals_tune(new_attrs.sample_interval);
+ damon_set_attrs(c, &new_attrs);
+}
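
Worked numbers for the rescaling step, mirrored in standalone C: a raw adaptation factor of 2,000 bp ("shrink to 20%") is softened to 6,000 bp, so a 100 ms sampling interval becomes 60 ms rather than 20 ms, before the min/max goal clamps in kdamond_tune_intervals() are applied:

#include <stdio.h>

/* Mirror of the rescaling in damon_get_intervals_adaptation_bp(). */
int main(void)
{
	unsigned long sample_us = 100000;	/* current interval: 100 ms */
	unsigned long adaptation_bp = 2000;	/* raw feedback: shrink to 20% */

	/* Rescale [1, 10,000] to [5,000, 10,000] to avoid abrupt shrinking. */
	if (adaptation_bp <= 10000)
		adaptation_bp = 5000 + adaptation_bp / 2;

	printf("new sample interval: %lu us\n",
	       sample_us * adaptation_bp / 10000);	/* 60000 us */
	return 0;
}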
+
static void damon_split_region_at(struct damon_target *t,
struct damon_region *r, unsigned long sz_r);
@@ -1272,16 +1662,18 @@ static bool damos_skip_charged_region(struct damon_target *t,
}
static void damos_update_stat(struct damos *s,
- unsigned long sz_tried, unsigned long sz_applied)
+ unsigned long sz_tried, unsigned long sz_applied,
+ unsigned long sz_ops_filter_passed)
{
s->stat.nr_tried++;
s->stat.sz_tried += sz_tried;
if (sz_applied)
s->stat.nr_applied++;
s->stat.sz_applied += sz_applied;
+ s->stat.sz_ops_filter_passed += sz_ops_filter_passed;
}
-static bool __damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
+static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t,
struct damon_region *r, struct damos_filter *filter)
{
bool matched = false;
@@ -1334,11 +1726,102 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
{
struct damos_filter *filter;
+ s->core_filters_allowed = false;
damos_for_each_filter(filter, s) {
- if (__damos_filter_out(ctx, t, r, filter))
- return true;
+ if (damos_filter_match(ctx, t, r, filter)) {
+ if (filter->allow)
+ s->core_filters_allowed = true;
+ return !filter->allow;
+ }
}
- return false;
+ return s->core_filters_default_reject;
+}
+
+/*
+ * damos_walk_call_walk() - Call &damos_walk_control->walk_fn.
+ * @ctx: The context of &damon_ctx->walk_control.
+ * @t: The monitoring target of @r that @s will be applied.
+ * @r: The region of @t that @s will be applied.
+ * @s: The scheme of @ctx that will be applied to @r.
+ *
+ * This function is called from kdamond whenever it asked the operation set to
+ * apply a DAMOS scheme action to a region. If a DAMOS walk request is
+ * installed by damos_walk() and not yet uninstalled, invoke it.
+ */
+static void damos_walk_call_walk(struct damon_ctx *ctx, struct damon_target *t,
+ struct damon_region *r, struct damos *s,
+ unsigned long sz_filter_passed)
+{
+ struct damos_walk_control *control;
+
+ if (s->walk_completed)
+ return;
+
+ control = ctx->walk_control;
+ if (!control)
+ return;
+
+ control->walk_fn(control->data, ctx, t, r, s, sz_filter_passed);
+}
+
+/*
+ * damos_walk_complete() - Complete DAMOS walk request if all walks are done.
+ * @ctx: The context of &damon_ctx->walk_control.
+ * @s: A scheme of @ctx that all walks are now done.
+ *
+ * This function is called when kdamond has finished applying the action of a
+ * DAMOS scheme to all regions that are eligible for the given
+ * &damos->apply_interval_us. If every scheme of @ctx, including @s, has now
+ * finished walking for at least one &damos->apply_interval_us, this function
+ * marks the handling of the given DAMOS walk request as done, so that
+ * damos_walk() can wake up and return.
+ */
+static void damos_walk_complete(struct damon_ctx *ctx, struct damos *s)
+{
+ struct damos *siter;
+ struct damos_walk_control *control;
+
+ control = ctx->walk_control;
+ if (!control)
+ return;
+
+ s->walk_completed = true;
+ /* if all schemes completed, signal completion to walker */
+ damon_for_each_scheme(siter, ctx) {
+ if (!siter->walk_completed)
+ return;
+ }
+ damon_for_each_scheme(siter, ctx)
+ siter->walk_completed = false;
+
+ complete(&control->completion);
+ ctx->walk_control = NULL;
+}
+
+/*
+ * damos_walk_cancel() - Cancel the current DAMOS walk request.
+ * @ctx: The context of &damon_ctx->walk_control.
+ *
+ * This function is called when @ctx is deactivated by DAMOS watermarks, DAMOS
+ * walk is requested but there is no DAMOS scheme to walk for, or the kdamond
+ * is already out of the main loop and therefore going to be terminated, and
+ * hence cannot continue the walks. This function therefore marks the walk
+ * request as canceled, so that damos_walk() can wake up and return.
+ */
+static void damos_walk_cancel(struct damon_ctx *ctx)
+{
+ struct damos_walk_control *control;
+
+ mutex_lock(&ctx->walk_control_lock);
+ control = ctx->walk_control;
+ mutex_unlock(&ctx->walk_control_lock);
+
+ if (!control)
+ return;
+ control->canceled = true;
+ complete(&control->completion);
+ mutex_lock(&ctx->walk_control_lock);
+ ctx->walk_control = NULL;
+ mutex_unlock(&ctx->walk_control_lock);
}
static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
@@ -1348,7 +1831,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
unsigned long sz = damon_sz_region(r);
struct timespec64 begin, end;
unsigned long sz_applied = 0;
- int err = 0;
+ unsigned long sz_ops_filter_passed = 0;
/*
* We plan to support multiple context per kdamond, as DAMON sysfs
* implies with 'nr_contexts' file. Nevertheless, only single context
@@ -1388,13 +1871,11 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
if (damos_filter_out(c, t, r, s))
return;
ktime_get_coarse_ts64(&begin);
- if (c->callback.before_damos_apply)
- err = c->callback.before_damos_apply(c, t, r, s);
- if (!err) {
- trace_damos_before_apply(cidx, sidx, tidx, r,
- damon_nr_regions(t), do_trace);
- sz_applied = c->ops.apply_scheme(c, t, r, s);
- }
+ trace_damos_before_apply(cidx, sidx, tidx, r,
+ damon_nr_regions(t), do_trace);
+ sz_applied = c->ops.apply_scheme(c, t, r, s,
+ &sz_ops_filter_passed);
+ damos_walk_call_walk(c, t, r, s, sz_ops_filter_passed);
ktime_get_coarse_ts64(&end);
quota->total_charged_ns += timespec64_to_ns(&end) -
timespec64_to_ns(&begin);
@@ -1408,7 +1889,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
r->age = 0;
update_stat:
- damos_update_stat(s, sz, sz_applied);
+ damos_update_stat(s, sz, sz_applied, sz_ops_filter_passed);
}
static void damon_do_apply_schemes(struct damon_ctx *c,
@@ -1510,6 +1991,29 @@ static inline u64 damos_get_some_mem_psi_total(void)
#endif /* CONFIG_PSI */
+#ifdef CONFIG_NUMA
+static __kernel_ulong_t damos_get_node_mem_bp(
+ struct damos_quota_goal *goal)
+{
+ struct sysinfo i;
+ __kernel_ulong_t numerator;
+
+ si_meminfo_node(&i, goal->nid);
+ if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP)
+ numerator = i.totalram - i.freeram;
+ else /* DAMOS_QUOTA_NODE_MEM_FREE_BP */
+ numerator = i.freeram;
+ return numerator * 10000 / i.totalram;
+}
+#else
+static __kernel_ulong_t damos_get_node_mem_bp(
+ struct damos_quota_goal *goal)
+{
+ return 0;
+}
+#endif
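
Worked basis-point numbers for the new node-memory quota goal metrics, mirrored in standalone C with made-up meminfo values:

#include <stdio.h>

/* Same basis-point math as damos_get_node_mem_bp(), with fake numbers. */
int main(void)
{
	unsigned long totalram = 1048576;	/* pages: 4 GiB of 4 KiB pages */
	unsigned long freeram = 262144;		/* 25% free */

	unsigned long used_bp = (totalram - freeram) * 10000 / totalram;
	unsigned long free_bp = freeram * 10000 / totalram;

	printf("DAMOS_QUOTA_NODE_MEM_USED_BP: %lu\n", used_bp);	/* 7500 */
	printf("DAMOS_QUOTA_NODE_MEM_FREE_BP: %lu\n", free_bp);	/* 2500 */
	return 0;
}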
+
+
static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
{
u64 now_psi_total;
@@ -1523,6 +2027,10 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
goal->current_value = now_psi_total - goal->last_psi_total;
goal->last_psi_total = now_psi_total;
break;
+ case DAMOS_QUOTA_NODE_MEM_USED_BP:
+ case DAMOS_QUOTA_NODE_MEM_FREE_BP:
+ goal->current_value = damos_get_node_mem_bp(goal);
+ break;
default:
break;
}
@@ -1550,7 +2058,7 @@ static unsigned long damos_quota_score(struct damos_quota *quota)
static void damos_set_effective_quota(struct damos_quota *quota)
{
unsigned long throughput;
- unsigned long esz;
+ unsigned long esz = ULONG_MAX;
if (!quota->ms && list_empty(&quota->goals)) {
quota->esz = quota->sz;
@@ -1568,14 +2076,11 @@ static void damos_set_effective_quota(struct damos_quota *quota)
if (quota->ms) {
if (quota->total_charged_ns)
- throughput = quota->total_charged_sz * 1000000 /
- quota->total_charged_ns;
+ throughput = mult_frac(quota->total_charged_sz, 1000000,
+ quota->total_charged_ns);
else
throughput = PAGE_SIZE * 1024;
- if (!list_empty(&quota->goals))
- esz = min(throughput * quota->ms, esz);
- else
- esz = throughput * quota->ms;
+ esz = min(throughput * quota->ms, esz);
}
if (quota->sz && quota->sz < esz)
@@ -1584,17 +2089,35 @@ static void damos_set_effective_quota(struct damos_quota *quota)
quota->esz = esz;
}
+static void damos_trace_esz(struct damon_ctx *c, struct damos *s,
+ struct damos_quota *quota)
+{
+ unsigned int cidx = 0, sidx = 0;
+ struct damos *siter;
+
+ damon_for_each_scheme(siter, c) {
+ if (siter == s)
+ break;
+ sidx++;
+ }
+ trace_damos_esz(cidx, sidx, quota->esz);
+}
+
static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
{
struct damos_quota *quota = &s->quota;
struct damon_target *t;
struct damon_region *r;
- unsigned long cumulated_sz;
+ unsigned long cumulated_sz, cached_esz;
unsigned int score, max_score = 0;
if (!quota->ms && !quota->sz && list_empty(&quota->goals))
return;
+ /* First charge window */
+ if (!quota->total_charged_sz && !quota->charged_from)
+ quota->charged_from = jiffies;
+
/* New charge window starts */
if (time_after_eq(jiffies, quota->charged_from +
msecs_to_jiffies(quota->reset_interval))) {
@@ -1603,7 +2126,11 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
quota->total_charged_sz += quota->charged_sz;
quota->charged_from = jiffies;
quota->charged_sz = 0;
+ if (trace_damos_esz_enabled())
+ cached_esz = quota->esz;
damos_set_effective_quota(quota);
+ if (trace_damos_esz_enabled() && quota->esz != cached_esz)
+ damos_trace_esz(c, s, quota);
}
if (!c->ops.get_scheme_score)
@@ -1658,6 +2185,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
if (!has_schemes_to_apply)
return;
+ mutex_lock(&c->walk_control_lock);
damon_for_each_target(t, c) {
damon_for_each_region_safe(r, next_r, t)
damon_do_apply_schemes(c, t, r);
@@ -1666,10 +2194,13 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
damon_for_each_scheme(s, c) {
if (c->passed_sample_intervals < s->next_apply_sis)
continue;
+ damos_walk_complete(c, s);
s->next_apply_sis = c->passed_sample_intervals +
(s->apply_interval_us ? s->apply_interval_us :
c->attrs.aggr_interval) / sample_interval;
+ s->last_applied = NULL;
}
+ mutex_unlock(&c->walk_control_lock);
}
/*
@@ -1894,9 +2425,8 @@ static unsigned long damos_wmark_wait_us(struct damos *scheme)
if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) {
if (scheme->wmarks.activated)
pr_debug("deactivate a scheme (%d) for %s wmark\n",
- scheme->action,
- metric > scheme->wmarks.high ?
- "high" : "low");
+ scheme->action,
+ str_high_low(metric > scheme->wmarks.high));
scheme->wmarks.activated = false;
return scheme->wmarks.interval;
}
@@ -1920,6 +2450,56 @@ static void kdamond_usleep(unsigned long usecs)
usleep_range_idle(usecs, usecs + 1);
}
+/*
+ * kdamond_call() - handle damon_call_control objects.
+ * @ctx: The &struct damon_ctx of the kdamond.
+ * @cancel: Whether to cancel the invocation of the function.
+ *
+ * If there are &struct damon_call_control requests that registered via
+ * &damon_call() on @ctx, do or cancel the invocation of the function depending
+ * on @cancel. @cancel is set when the kdamond is already out of the main loop
+ * and therefore will be terminated.
+ */
+static void kdamond_call(struct damon_ctx *ctx, bool cancel)
+{
+ struct damon_call_control *control;
+ LIST_HEAD(repeat_controls);
+ int ret = 0;
+
+ while (true) {
+ mutex_lock(&ctx->call_controls_lock);
+ control = list_first_entry_or_null(&ctx->call_controls,
+ struct damon_call_control, list);
+ mutex_unlock(&ctx->call_controls_lock);
+ if (!control)
+ break;
+ if (cancel) {
+ control->canceled = true;
+ } else {
+ ret = control->fn(control->data);
+ control->return_code = ret;
+ }
+ mutex_lock(&ctx->call_controls_lock);
+ list_del(&control->list);
+ mutex_unlock(&ctx->call_controls_lock);
+ if (!control->repeat) {
+ complete(&control->completion);
+ } else if (control->canceled && control->dealloc_on_cancel) {
+ kfree(control);
+ continue;
+ } else {
+ list_add(&control->list, &repeat_controls);
+ }
+ }
+ control = list_first_entry_or_null(&repeat_controls,
+ struct damon_call_control, list);
+ if (!control || cancel)
+ return;
+ mutex_lock(&ctx->call_controls_lock);
+ list_add_tail(&control->list, &ctx->call_controls);
+ mutex_unlock(&ctx->call_controls_lock);
+}
+
/* Returns negative error code if it's not activated but should return */
static int kdamond_wait_activation(struct damon_ctx *ctx)
{
@@ -1941,14 +2521,13 @@ static int kdamond_wait_activation(struct damon_ctx *ctx)
kdamond_usleep(min_wait_time);
- if (ctx->callback.after_wmarks_check &&
- ctx->callback.after_wmarks_check(ctx))
- break;
+ kdamond_call(ctx, false);
+ damos_walk_cancel(ctx);
}
return -EBUSY;
}
-static void kdamond_init_intervals_sis(struct damon_ctx *ctx)
+static void kdamond_init_ctx(struct damon_ctx *ctx)
{
unsigned long sample_interval = ctx->attrs.sample_interval ?
ctx->attrs.sample_interval : 1;
@@ -1959,11 +2538,14 @@ static void kdamond_init_intervals_sis(struct damon_ctx *ctx)
ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval;
ctx->next_ops_update_sis = ctx->attrs.ops_update_interval /
sample_interval;
+ ctx->next_intervals_tune_sis = ctx->next_aggregation_sis *
+ ctx->attrs.intervals_goal.aggrs;
damon_for_each_scheme(scheme, ctx) {
apply_interval = scheme->apply_interval_us ?
scheme->apply_interval_us : ctx->attrs.aggr_interval;
scheme->next_apply_sis = apply_interval / sample_interval;
+ damos_set_filters_default_reject(scheme);
}
}
@@ -1981,12 +2563,10 @@ static int kdamond_fn(void *data)
pr_debug("kdamond (%d) starts\n", current->pid);
complete(&ctx->kdamond_started);
- kdamond_init_intervals_sis(ctx);
+ kdamond_init_ctx(ctx);
if (ctx->ops.init)
ctx->ops.init(ctx);
- if (ctx->callback.before_start && ctx->callback.before_start(ctx))
- goto done;
ctx->regions_score_histogram = kmalloc_array(DAMOS_MAX_SCORE + 1,
sizeof(*ctx->regions_score_histogram), GFP_KERNEL);
if (!ctx->regions_score_histogram)
@@ -1997,10 +2577,9 @@ static int kdamond_fn(void *data)
while (!kdamond_need_stop(ctx)) {
/*
* ctx->attrs and ctx->next_{aggregation,ops_update}_sis could
- * be changed from after_wmarks_check() or after_aggregation()
- * callbacks. Read the values here, and use those for this
- * iteration. That is, damon_set_attrs() updated new values
- * are respected from next iteration.
+ * be changed from kdamond_call(). Read the values here, and
+ * use those for this iteration. That is, damon_set_attrs()
+ * updated new values are respected from next iteration.
*/
unsigned long next_aggregation_sis = ctx->next_aggregation_sis;
unsigned long next_ops_update_sis = ctx->next_ops_update_sis;
@@ -2011,9 +2590,6 @@ static int kdamond_fn(void *data)
if (ctx->ops.prepare_access_checks)
ctx->ops.prepare_access_checks(ctx);
- if (ctx->callback.after_sampling &&
- ctx->callback.after_sampling(ctx))
- break;
kdamond_usleep(sample_interval);
ctx->passed_sample_intervals++;
@@ -2021,32 +2597,58 @@ static int kdamond_fn(void *data)
if (ctx->ops.check_accesses)
max_nr_accesses = ctx->ops.check_accesses(ctx);
- if (ctx->passed_sample_intervals >= next_aggregation_sis) {
+ if (ctx->passed_sample_intervals >= next_aggregation_sis)
kdamond_merge_regions(ctx,
max_nr_accesses / 10,
sz_limit);
- if (ctx->callback.after_aggregation &&
- ctx->callback.after_aggregation(ctx))
- break;
- }
/*
- * do kdamond_apply_schemes() after kdamond_merge_regions() if
- * possible, to reduce overhead
+ * do kdamond_call() and kdamond_apply_schemes() after
+ * kdamond_merge_regions() if possible, to reduce overhead
*/
+ kdamond_call(ctx, false);
if (!list_empty(&ctx->schemes))
kdamond_apply_schemes(ctx);
+ else
+ damos_walk_cancel(ctx);
sample_interval = ctx->attrs.sample_interval ?
ctx->attrs.sample_interval : 1;
if (ctx->passed_sample_intervals >= next_aggregation_sis) {
+ if (ctx->attrs.intervals_goal.aggrs &&
+ ctx->passed_sample_intervals >=
+ ctx->next_intervals_tune_sis) {
+ /*
+ * ctx->next_aggregation_sis might be updated
+ * from kdamond_call(). In that case,
+ * damon_set_attrs(), which will be called from
+ * kdamond_tune_intervals(), may wrongly think
+ * this is in the middle of the current
+ * aggregation and reset the aggregation
+ * information for all regions. Then, the
+ * following kdamond_reset_aggregated() call
+ * will make the region information invalid,
+ * particularly for ->nr_accesses_bp.
+ *
+ * Reset ->next_aggregation_sis to avoid that.
+ * It will anyway be correctly updated after
+ * this if clause.
+ */
+ ctx->next_aggregation_sis =
+ next_aggregation_sis;
+ ctx->next_intervals_tune_sis +=
+ ctx->attrs.aggr_samples *
+ ctx->attrs.intervals_goal.aggrs;
+ kdamond_tune_intervals(ctx);
+ sample_interval = ctx->attrs.sample_interval ?
+ ctx->attrs.sample_interval : 1;
+
+ }
ctx->next_aggregation_sis = next_aggregation_sis +
ctx->attrs.aggr_interval / sample_interval;
kdamond_reset_aggregated(ctx);
kdamond_split_regions(ctx);
- if (ctx->ops.reset_aggregated)
- ctx->ops.reset_aggregated(ctx);
}
if (ctx->passed_sample_intervals >= next_ops_update_sis) {
@@ -2064,8 +2666,6 @@ done:
damon_destroy_region(r, t);
}
- if (ctx->callback.before_terminate)
- ctx->callback.before_terminate(ctx);
if (ctx->ops.cleanup)
ctx->ops.cleanup(ctx);
kfree(ctx->regions_score_histogram);
@@ -2075,12 +2675,16 @@ done:
ctx->kdamond = NULL;
mutex_unlock(&ctx->kdamond_lock);
+ kdamond_call(ctx, true);
+ damos_walk_cancel(ctx);
+
mutex_lock(&damon_lock);
nr_running_ctxs--;
if (!nr_running_ctxs && running_exclusive_ctxs)
running_exclusive_ctxs = false;
mutex_unlock(&damon_lock);
+ damon_destroy_targets(ctx);
return 0;
}
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
deleted file mode 100644
index b4213bc47e44..000000000000
--- a/mm/damon/dbgfs.c
+++ /dev/null
@@ -1,1148 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * DAMON Debugfs Interface
- *
- * Author: SeongJae Park <sj@kernel.org>
- */
-
-#define pr_fmt(fmt) "damon-dbgfs: " fmt
-
-#include <linux/damon.h>
-#include <linux/debugfs.h>
-#include <linux/file.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/page_idle.h>
-#include <linux/slab.h>
-
-#define DAMON_DBGFS_DEPRECATION_NOTICE \
- "DAMON debugfs interface is deprecated, so users should move " \
- "to DAMON_SYSFS. If you cannot, please report your usecase to " \
- "damon@lists.linux.dev and linux-mm@kvack.org.\n"
-
-static struct damon_ctx **dbgfs_ctxs;
-static int dbgfs_nr_ctxs;
-static struct dentry **dbgfs_dirs;
-static DEFINE_MUTEX(damon_dbgfs_lock);
-
-static void damon_dbgfs_warn_deprecation(void)
-{
- pr_warn_once(DAMON_DBGFS_DEPRECATION_NOTICE);
-}
-
-/*
- * Returns non-empty string on success, negative error code otherwise.
- */
-static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos)
-{
- char *kbuf;
- ssize_t ret;
-
- /* We do not accept continuous write */
- if (*ppos)
- return ERR_PTR(-EINVAL);
-
- kbuf = kmalloc(count + 1, GFP_KERNEL | __GFP_NOWARN);
- if (!kbuf)
- return ERR_PTR(-ENOMEM);
-
- ret = simple_write_to_buffer(kbuf, count + 1, ppos, buf, count);
- if (ret != count) {
- kfree(kbuf);
- return ERR_PTR(-EIO);
- }
- kbuf[ret] = '\0';
-
- return kbuf;
-}
-
-static ssize_t dbgfs_attrs_read(struct file *file,
- char __user *buf, size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char kbuf[128];
- int ret;
-
- mutex_lock(&ctx->kdamond_lock);
- ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n",
- ctx->attrs.sample_interval, ctx->attrs.aggr_interval,
- ctx->attrs.ops_update_interval,
- ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions);
- mutex_unlock(&ctx->kdamond_lock);
-
- return simple_read_from_buffer(buf, count, ppos, kbuf, ret);
-}
-
-static ssize_t dbgfs_attrs_write(struct file *file,
- const char __user *buf, size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- struct damon_attrs attrs;
- char *kbuf;
- ssize_t ret;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- if (sscanf(kbuf, "%lu %lu %lu %lu %lu",
- &attrs.sample_interval, &attrs.aggr_interval,
- &attrs.ops_update_interval,
- &attrs.min_nr_regions,
- &attrs.max_nr_regions) != 5) {
- ret = -EINVAL;
- goto out;
- }
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond) {
- ret = -EBUSY;
- goto unlock_out;
- }
-
- ret = damon_set_attrs(ctx, &attrs);
- if (!ret)
- ret = count;
-unlock_out:
- mutex_unlock(&ctx->kdamond_lock);
-out:
- kfree(kbuf);
- return ret;
-}
-
-/*
- * Return corresponding dbgfs' scheme action value (int) for the given
- * damos_action if the given damos_action value is valid and supported by
- * dbgfs, negative error code otherwise.
- */
-static int damos_action_to_dbgfs_scheme_action(enum damos_action action)
-{
- switch (action) {
- case DAMOS_WILLNEED:
- return 0;
- case DAMOS_COLD:
- return 1;
- case DAMOS_PAGEOUT:
- return 2;
- case DAMOS_HUGEPAGE:
- return 3;
- case DAMOS_NOHUGEPAGE:
- return 4;
- case DAMOS_STAT:
- return 5;
- default:
- return -EINVAL;
- }
-}
-
-static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
-{
- struct damos *s;
- int written = 0;
- int rc;
-
- damon_for_each_scheme(s, c) {
- rc = scnprintf(&buf[written], len - written,
- "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
- s->pattern.min_sz_region,
- s->pattern.max_sz_region,
- s->pattern.min_nr_accesses,
- s->pattern.max_nr_accesses,
- s->pattern.min_age_region,
- s->pattern.max_age_region,
- damos_action_to_dbgfs_scheme_action(s->action),
- s->quota.ms, s->quota.sz,
- s->quota.reset_interval,
- s->quota.weight_sz,
- s->quota.weight_nr_accesses,
- s->quota.weight_age,
- s->wmarks.metric, s->wmarks.interval,
- s->wmarks.high, s->wmarks.mid, s->wmarks.low,
- s->stat.nr_tried, s->stat.sz_tried,
- s->stat.nr_applied, s->stat.sz_applied,
- s->stat.qt_exceeds);
- if (!rc)
- return -ENOMEM;
-
- written += rc;
- }
- return written;
-}
-
-static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char *kbuf;
- ssize_t len;
-
- kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN);
- if (!kbuf)
- return -ENOMEM;
-
- mutex_lock(&ctx->kdamond_lock);
- len = sprint_schemes(ctx, kbuf, count);
- mutex_unlock(&ctx->kdamond_lock);
- if (len < 0)
- goto out;
- len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
-
-out:
- kfree(kbuf);
- return len;
-}
-
-static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes)
-{
- ssize_t i;
-
- for (i = 0; i < nr_schemes; i++)
- kfree(schemes[i]);
- kfree(schemes);
-}
-
-/*
- * Return corresponding damos_action for the given dbgfs input for a scheme
- * action if the input is valid, negative error code otherwise.
- */
-static enum damos_action dbgfs_scheme_action_to_damos_action(int dbgfs_action)
-{
- switch (dbgfs_action) {
- case 0:
- return DAMOS_WILLNEED;
- case 1:
- return DAMOS_COLD;
- case 2:
- return DAMOS_PAGEOUT;
- case 3:
- return DAMOS_HUGEPAGE;
- case 4:
- return DAMOS_NOHUGEPAGE;
- case 5:
- return DAMOS_STAT;
- default:
- return -EINVAL;
- }
-}
-
-/*
- * Converts a string into an array of struct damos pointers
- *
- * Returns an array of struct damos pointers that converted if the conversion
- * success, or NULL otherwise.
- */
-static struct damos **str_to_schemes(const char *str, ssize_t len,
- ssize_t *nr_schemes)
-{
- struct damos *scheme, **schemes;
- const int max_nr_schemes = 256;
- int pos = 0, parsed, ret;
- unsigned int action_input;
- enum damos_action action;
-
- schemes = kmalloc_array(max_nr_schemes, sizeof(scheme),
- GFP_KERNEL);
- if (!schemes)
- return NULL;
-
- *nr_schemes = 0;
- while (pos < len && *nr_schemes < max_nr_schemes) {
- struct damos_access_pattern pattern = {};
- struct damos_quota quota = {};
- struct damos_watermarks wmarks;
-
- ret = sscanf(&str[pos],
- "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n",
- &pattern.min_sz_region, &pattern.max_sz_region,
- &pattern.min_nr_accesses,
- &pattern.max_nr_accesses,
- &pattern.min_age_region,
- &pattern.max_age_region,
- &action_input, &quota.ms,
- &quota.sz, &quota.reset_interval,
- &quota.weight_sz, &quota.weight_nr_accesses,
- &quota.weight_age, &wmarks.metric,
- &wmarks.interval, &wmarks.high, &wmarks.mid,
- &wmarks.low, &parsed);
- if (ret != 18)
- break;
- action = dbgfs_scheme_action_to_damos_action(action_input);
- if ((int)action < 0)
- goto fail;
-
- if (pattern.min_sz_region > pattern.max_sz_region ||
- pattern.min_nr_accesses > pattern.max_nr_accesses ||
- pattern.min_age_region > pattern.max_age_region)
- goto fail;
-
- if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low ||
- wmarks.mid < wmarks.low)
- goto fail;
-
- pos += parsed;
- scheme = damon_new_scheme(&pattern, action, 0, &quota,
- &wmarks, NUMA_NO_NODE);
- if (!scheme)
- goto fail;
-
- schemes[*nr_schemes] = scheme;
- *nr_schemes += 1;
- }
- return schemes;
-fail:
- free_schemes_arr(schemes, *nr_schemes);
- return NULL;
-}
-
-static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char *kbuf;
- struct damos **schemes;
- ssize_t nr_schemes = 0, ret;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- schemes = str_to_schemes(kbuf, count, &nr_schemes);
- if (!schemes) {
- ret = -EINVAL;
- goto out;
- }
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond) {
- ret = -EBUSY;
- goto unlock_out;
- }
-
- damon_set_schemes(ctx, schemes, nr_schemes);
- ret = count;
- nr_schemes = 0;
-
-unlock_out:
- mutex_unlock(&ctx->kdamond_lock);
- free_schemes_arr(schemes, nr_schemes);
-out:
- kfree(kbuf);
- return ret;
-}
-
-static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len)
-{
- struct damon_target *t;
- int id;
- int written = 0;
- int rc;
-
- damon_for_each_target(t, ctx) {
- if (damon_target_has_pid(ctx))
- /* Show pid numbers to debugfs users */
- id = pid_vnr(t->pid);
- else
- /* Show 42 for physical address space, just for fun */
- id = 42;
-
- rc = scnprintf(&buf[written], len - written, "%d ", id);
- if (!rc)
- return -ENOMEM;
- written += rc;
- }
- if (written)
- written -= 1;
- written += scnprintf(&buf[written], len - written, "\n");
- return written;
-}
-
-static ssize_t dbgfs_target_ids_read(struct file *file,
- char __user *buf, size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- ssize_t len;
- char ids_buf[320];
-
- mutex_lock(&ctx->kdamond_lock);
- len = sprint_target_ids(ctx, ids_buf, 320);
- mutex_unlock(&ctx->kdamond_lock);
- if (len < 0)
- return len;
-
- return simple_read_from_buffer(buf, count, ppos, ids_buf, len);
-}
-
-/*
- * Converts a string into an integers array
- *
- * Returns an array of integers array if the conversion success, or NULL
- * otherwise.
- */
-static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints)
-{
- int *array;
- const int max_nr_ints = 32;
- int nr;
- int pos = 0, parsed, ret;
-
- *nr_ints = 0;
- array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL);
- if (!array)
- return NULL;
- while (*nr_ints < max_nr_ints && pos < len) {
- ret = sscanf(&str[pos], "%d%n", &nr, &parsed);
- pos += parsed;
- if (ret != 1)
- break;
- array[*nr_ints] = nr;
- *nr_ints += 1;
- }
-
- return array;
-}
-
-static void dbgfs_put_pids(struct pid **pids, int nr_pids)
-{
- int i;
-
- for (i = 0; i < nr_pids; i++)
- put_pid(pids[i]);
-}
-
-/*
- * Converts a string into an struct pid pointers array
- *
- * Returns an array of struct pid pointers if the conversion success, or NULL
- * otherwise.
- */
-static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids)
-{
- int *ints;
- ssize_t nr_ints;
- struct pid **pids;
-
- *nr_pids = 0;
-
- ints = str_to_ints(str, len, &nr_ints);
- if (!ints)
- return NULL;
-
- pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL);
- if (!pids)
- goto out;
-
- for (; *nr_pids < nr_ints; (*nr_pids)++) {
- pids[*nr_pids] = find_get_pid(ints[*nr_pids]);
- if (!pids[*nr_pids]) {
- dbgfs_put_pids(pids, *nr_pids);
- kfree(ints);
- kfree(pids);
- return NULL;
- }
- }
-
-out:
- kfree(ints);
- return pids;
-}
-
-/*
- * dbgfs_set_targets() - Set monitoring targets.
- * @ctx: monitoring context
- * @nr_targets: number of targets
- * @pids: array of target pids (size is same to @nr_targets)
- *
- * This function should not be called while the kdamond is running. @pids is
- * ignored if the context is not configured to have pid in each target. On
- * failure, reference counts of all pids in @pids are decremented.
- *
- * Return: 0 on success, negative error code otherwise.
- */
-static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets,
- struct pid **pids)
-{
- ssize_t i;
- struct damon_target *t, *next;
-
- damon_for_each_target_safe(t, next, ctx) {
- if (damon_target_has_pid(ctx))
- put_pid(t->pid);
- damon_destroy_target(t);
- }
-
- for (i = 0; i < nr_targets; i++) {
- t = damon_new_target();
- if (!t) {
- damon_for_each_target_safe(t, next, ctx)
- damon_destroy_target(t);
- if (damon_target_has_pid(ctx))
- dbgfs_put_pids(pids, nr_targets);
- return -ENOMEM;
- }
- if (damon_target_has_pid(ctx))
- t->pid = pids[i];
- damon_add_target(ctx, t);
- }
-
- return 0;
-}
-
-static ssize_t dbgfs_target_ids_write(struct file *file,
- const char __user *buf, size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- bool id_is_pid = true;
- char *kbuf;
- struct pid **target_pids = NULL;
- ssize_t nr_targets;
- ssize_t ret;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- if (!strncmp(kbuf, "paddr\n", count)) {
- id_is_pid = false;
- nr_targets = 1;
- }
-
- if (id_is_pid) {
- target_pids = str_to_pids(kbuf, count, &nr_targets);
- if (!target_pids) {
- ret = -ENOMEM;
- goto out;
- }
- }
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond) {
- if (id_is_pid)
- dbgfs_put_pids(target_pids, nr_targets);
- ret = -EBUSY;
- goto unlock_out;
- }
-
- /* remove previously set targets */
- dbgfs_set_targets(ctx, 0, NULL);
- if (!nr_targets) {
- ret = count;
- goto unlock_out;
- }
-
- /* Configure the context for the address space type */
- if (id_is_pid)
- ret = damon_select_ops(ctx, DAMON_OPS_VADDR);
- else
- ret = damon_select_ops(ctx, DAMON_OPS_PADDR);
- if (ret)
- goto unlock_out;
-
- ret = dbgfs_set_targets(ctx, nr_targets, target_pids);
- if (!ret)
- ret = count;
-
-unlock_out:
- mutex_unlock(&ctx->kdamond_lock);
- kfree(target_pids);
-out:
- kfree(kbuf);
- return ret;
-}
-
-static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len)
-{
- struct damon_target *t;
- struct damon_region *r;
- int target_idx = 0;
- int written = 0;
- int rc;
-
- damon_for_each_target(t, c) {
- damon_for_each_region(r, t) {
- rc = scnprintf(&buf[written], len - written,
- "%d %lu %lu\n",
- target_idx, r->ar.start, r->ar.end);
- if (!rc)
- return -ENOMEM;
- written += rc;
- }
- target_idx++;
- }
- return written;
-}
-
-static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char *kbuf;
- ssize_t len;
-
- kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN);
- if (!kbuf)
- return -ENOMEM;
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond) {
- mutex_unlock(&ctx->kdamond_lock);
- len = -EBUSY;
- goto out;
- }
-
- len = sprint_init_regions(ctx, kbuf, count);
- mutex_unlock(&ctx->kdamond_lock);
- if (len < 0)
- goto out;
- len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
-
-out:
- kfree(kbuf);
- return len;
-}
-
-static int add_init_region(struct damon_ctx *c, int target_idx,
- struct damon_addr_range *ar)
-{
- struct damon_target *t;
- struct damon_region *r, *prev;
- unsigned long idx = 0;
- int rc = -EINVAL;
-
- if (ar->start >= ar->end)
- return -EINVAL;
-
- damon_for_each_target(t, c) {
- if (idx++ == target_idx) {
- r = damon_new_region(ar->start, ar->end);
- if (!r)
- return -ENOMEM;
- damon_add_region(r, t);
- if (damon_nr_regions(t) > 1) {
- prev = damon_prev_region(r);
- if (prev->ar.end > r->ar.start) {
- damon_destroy_region(r, t);
- return -EINVAL;
- }
- }
- rc = 0;
- }
- }
- return rc;
-}
-
-static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len)
-{
- struct damon_target *t;
- struct damon_region *r, *next;
- int pos = 0, parsed, ret;
- int target_idx;
- struct damon_addr_range ar;
- int err;
-
- damon_for_each_target(t, c) {
- damon_for_each_region_safe(r, next, t)
- damon_destroy_region(r, t);
- }
-
- while (pos < len) {
- ret = sscanf(&str[pos], "%d %lu %lu%n",
- &target_idx, &ar.start, &ar.end, &parsed);
- if (ret != 3)
- break;
- err = add_init_region(c, target_idx, &ar);
- if (err)
- goto fail;
- pos += parsed;
- }
-
- return 0;
-
-fail:
- damon_for_each_target(t, c) {
- damon_for_each_region_safe(r, next, t)
- damon_destroy_region(r, t);
- }
- return err;
-}
-
-static ssize_t dbgfs_init_regions_write(struct file *file,
- const char __user *buf, size_t count,
- loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char *kbuf;
- ssize_t ret = count;
- int err;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond) {
- ret = -EBUSY;
- goto unlock_out;
- }
-
- err = set_init_regions(ctx, kbuf, ret);
- if (err)
- ret = err;
-
-unlock_out:
- mutex_unlock(&ctx->kdamond_lock);
- kfree(kbuf);
- return ret;
-}
-
-static ssize_t dbgfs_kdamond_pid_read(struct file *file,
- char __user *buf, size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char *kbuf;
- ssize_t len;
-
- kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN);
- if (!kbuf)
- return -ENOMEM;
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond)
- len = scnprintf(kbuf, count, "%d\n", ctx->kdamond->pid);
- else
- len = scnprintf(kbuf, count, "none\n");
- mutex_unlock(&ctx->kdamond_lock);
- if (!len)
- goto out;
- len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
-
-out:
- kfree(kbuf);
- return len;
-}
-
-static int damon_dbgfs_open(struct inode *inode, struct file *file)
-{
- damon_dbgfs_warn_deprecation();
-
- file->private_data = inode->i_private;
-
- return nonseekable_open(inode, file);
-}
-
-static const struct file_operations attrs_fops = {
- .open = damon_dbgfs_open,
- .read = dbgfs_attrs_read,
- .write = dbgfs_attrs_write,
-};
-
-static const struct file_operations schemes_fops = {
- .open = damon_dbgfs_open,
- .read = dbgfs_schemes_read,
- .write = dbgfs_schemes_write,
-};
-
-static const struct file_operations target_ids_fops = {
- .open = damon_dbgfs_open,
- .read = dbgfs_target_ids_read,
- .write = dbgfs_target_ids_write,
-};
-
-static const struct file_operations init_regions_fops = {
- .open = damon_dbgfs_open,
- .read = dbgfs_init_regions_read,
- .write = dbgfs_init_regions_write,
-};
-
-static const struct file_operations kdamond_pid_fops = {
- .open = damon_dbgfs_open,
- .read = dbgfs_kdamond_pid_read,
-};
-
-static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx)
-{
- const char * const file_names[] = {"attrs", "schemes", "target_ids",
- "init_regions", "kdamond_pid"};
- const struct file_operations *fops[] = {&attrs_fops, &schemes_fops,
- &target_ids_fops, &init_regions_fops, &kdamond_pid_fops};
- int i;
-
- for (i = 0; i < ARRAY_SIZE(file_names); i++)
- debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]);
-}
-
-static void dbgfs_before_terminate(struct damon_ctx *ctx)
-{
- struct damon_target *t, *next;
-
- if (!damon_target_has_pid(ctx))
- return;
-
- mutex_lock(&ctx->kdamond_lock);
- damon_for_each_target_safe(t, next, ctx) {
- put_pid(t->pid);
- damon_destroy_target(t);
- }
- mutex_unlock(&ctx->kdamond_lock);
-}
-
-static struct damon_ctx *dbgfs_new_ctx(void)
-{
- struct damon_ctx *ctx;
-
- ctx = damon_new_ctx();
- if (!ctx)
- return NULL;
-
- if (damon_select_ops(ctx, DAMON_OPS_VADDR) &&
- damon_select_ops(ctx, DAMON_OPS_PADDR)) {
- damon_destroy_ctx(ctx);
- return NULL;
- }
- ctx->callback.before_terminate = dbgfs_before_terminate;
- return ctx;
-}
-
-static void dbgfs_destroy_ctx(struct damon_ctx *ctx)
-{
- damon_destroy_ctx(ctx);
-}
-
-static ssize_t damon_dbgfs_deprecated_read(struct file *file,
- char __user *buf, size_t count, loff_t *ppos)
-{
- static const char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE;
-
- return simple_read_from_buffer(buf, count, ppos, kbuf, strlen(kbuf));
-}
-
-/*
- * Make a context of @name and create a debugfs directory for it.
- *
- * This function should be called while holding damon_dbgfs_lock.
- *
- * Returns 0 on success, negative error code otherwise.
- */
-static int dbgfs_mk_context(char *name)
-{
- struct dentry *root, **new_dirs, *new_dir;
- struct damon_ctx **new_ctxs, *new_ctx;
-
- if (damon_nr_running_ctxs())
- return -EBUSY;
-
- new_ctxs = krealloc(dbgfs_ctxs, sizeof(*dbgfs_ctxs) *
- (dbgfs_nr_ctxs + 1), GFP_KERNEL);
- if (!new_ctxs)
- return -ENOMEM;
- dbgfs_ctxs = new_ctxs;
-
- new_dirs = krealloc(dbgfs_dirs, sizeof(*dbgfs_dirs) *
- (dbgfs_nr_ctxs + 1), GFP_KERNEL);
- if (!new_dirs)
- return -ENOMEM;
- dbgfs_dirs = new_dirs;
-
- root = dbgfs_dirs[0];
- if (!root)
- return -ENOENT;
-
- new_dir = debugfs_create_dir(name, root);
- /* Below check is required for a potential duplicated name case */
- if (IS_ERR(new_dir))
- return PTR_ERR(new_dir);
- dbgfs_dirs[dbgfs_nr_ctxs] = new_dir;
-
- new_ctx = dbgfs_new_ctx();
- if (!new_ctx) {
- debugfs_remove(new_dir);
- dbgfs_dirs[dbgfs_nr_ctxs] = NULL;
- return -ENOMEM;
- }
-
- dbgfs_ctxs[dbgfs_nr_ctxs] = new_ctx;
- dbgfs_fill_ctx_dir(dbgfs_dirs[dbgfs_nr_ctxs],
- dbgfs_ctxs[dbgfs_nr_ctxs]);
- dbgfs_nr_ctxs++;
-
- return 0;
-}
-
-static ssize_t dbgfs_mk_context_write(struct file *file,
- const char __user *buf, size_t count, loff_t *ppos)
-{
- char *kbuf;
- char *ctx_name;
- ssize_t ret;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- ctx_name = kmalloc(count + 1, GFP_KERNEL);
- if (!ctx_name) {
- kfree(kbuf);
- return -ENOMEM;
- }
-
- /* Trim white space */
- if (sscanf(kbuf, "%s", ctx_name) != 1) {
- ret = -EINVAL;
- goto out;
- }
-
- mutex_lock(&damon_dbgfs_lock);
- ret = dbgfs_mk_context(ctx_name);
- if (!ret)
- ret = count;
- mutex_unlock(&damon_dbgfs_lock);
-
-out:
- kfree(kbuf);
- kfree(ctx_name);
- return ret;
-}
-
-/*
- * Remove a context of @name and its debugfs directory.
- *
- * This function should be called while holding damon_dbgfs_lock.
- *
- * Return 0 on success, negative error code otherwise.
- */
-static int dbgfs_rm_context(char *name)
-{
- struct dentry *root, *dir, **new_dirs;
- struct inode *inode;
- struct damon_ctx **new_ctxs;
- int i, j;
- int ret = 0;
-
- if (damon_nr_running_ctxs())
- return -EBUSY;
-
- root = dbgfs_dirs[0];
- if (!root)
- return -ENOENT;
-
- dir = debugfs_lookup(name, root);
- if (!dir)
- return -ENOENT;
-
- inode = d_inode(dir);
- if (!S_ISDIR(inode->i_mode)) {
- ret = -EINVAL;
- goto out_dput;
- }
-
- new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs),
- GFP_KERNEL);
- if (!new_dirs) {
- ret = -ENOMEM;
- goto out_dput;
- }
-
- new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs),
- GFP_KERNEL);
- if (!new_ctxs) {
- ret = -ENOMEM;
- goto out_new_dirs;
- }
-
- for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) {
- if (dbgfs_dirs[i] == dir) {
- debugfs_remove(dbgfs_dirs[i]);
- dbgfs_destroy_ctx(dbgfs_ctxs[i]);
- continue;
- }
- new_dirs[j] = dbgfs_dirs[i];
- new_ctxs[j++] = dbgfs_ctxs[i];
- }
-
- kfree(dbgfs_dirs);
- kfree(dbgfs_ctxs);
-
- dbgfs_dirs = new_dirs;
- dbgfs_ctxs = new_ctxs;
- dbgfs_nr_ctxs--;
-
- goto out_dput;
-
-out_new_dirs:
- kfree(new_dirs);
-out_dput:
- dput(dir);
- return ret;
-}
-
-static ssize_t dbgfs_rm_context_write(struct file *file,
- const char __user *buf, size_t count, loff_t *ppos)
-{
- char *kbuf;
- ssize_t ret;
- char *ctx_name;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- ctx_name = kmalloc(count + 1, GFP_KERNEL);
- if (!ctx_name) {
- kfree(kbuf);
- return -ENOMEM;
- }
-
- /* Trim white space */
- if (sscanf(kbuf, "%s", ctx_name) != 1) {
- ret = -EINVAL;
- goto out;
- }
-
- mutex_lock(&damon_dbgfs_lock);
- ret = dbgfs_rm_context(ctx_name);
- if (!ret)
- ret = count;
- mutex_unlock(&damon_dbgfs_lock);
-
-out:
- kfree(kbuf);
- kfree(ctx_name);
- return ret;
-}
-
-static ssize_t dbgfs_monitor_on_read(struct file *file,
- char __user *buf, size_t count, loff_t *ppos)
-{
- char monitor_on_buf[5];
- bool monitor_on = damon_nr_running_ctxs() != 0;
- int len;
-
- len = scnprintf(monitor_on_buf, 5, monitor_on ? "on\n" : "off\n");
-
- return simple_read_from_buffer(buf, count, ppos, monitor_on_buf, len);
-}
-
-static ssize_t dbgfs_monitor_on_write(struct file *file,
- const char __user *buf, size_t count, loff_t *ppos)
-{
- ssize_t ret;
- char *kbuf;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- /* Remove white space */
- if (sscanf(kbuf, "%s", kbuf) != 1) {
- kfree(kbuf);
- return -EINVAL;
- }
-
- mutex_lock(&damon_dbgfs_lock);
- if (!strncmp(kbuf, "on", count)) {
- int i;
-
- for (i = 0; i < dbgfs_nr_ctxs; i++) {
- if (damon_targets_empty(dbgfs_ctxs[i])) {
- kfree(kbuf);
- mutex_unlock(&damon_dbgfs_lock);
- return -EINVAL;
- }
- }
- ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs, true);
- } else if (!strncmp(kbuf, "off", count)) {
- ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs);
- } else {
- ret = -EINVAL;
- }
- mutex_unlock(&damon_dbgfs_lock);
-
- if (!ret)
- ret = count;
- kfree(kbuf);
- return ret;
-}
-
-static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file)
-{
- damon_dbgfs_warn_deprecation();
- return nonseekable_open(inode, file);
-}
-
-static const struct file_operations deprecated_fops = {
- .read = damon_dbgfs_deprecated_read,
-};
-
-static const struct file_operations mk_contexts_fops = {
- .open = damon_dbgfs_static_file_open,
- .write = dbgfs_mk_context_write,
-};
-
-static const struct file_operations rm_contexts_fops = {
- .open = damon_dbgfs_static_file_open,
- .write = dbgfs_rm_context_write,
-};
-
-static const struct file_operations monitor_on_fops = {
- .open = damon_dbgfs_static_file_open,
- .read = dbgfs_monitor_on_read,
- .write = dbgfs_monitor_on_write,
-};
-
-static int __init __damon_dbgfs_init(void)
-{
- struct dentry *dbgfs_root;
- const char * const file_names[] = {"mk_contexts", "rm_contexts",
- "monitor_on_DEPRECATED", "DEPRECATED"};
- const struct file_operations *fops[] = {&mk_contexts_fops,
- &rm_contexts_fops, &monitor_on_fops, &deprecated_fops};
- int i;
-
- dbgfs_root = debugfs_create_dir("damon", NULL);
-
- for (i = 0; i < ARRAY_SIZE(file_names); i++)
- debugfs_create_file(file_names[i], 0600, dbgfs_root, NULL,
- fops[i]);
- dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]);
-
- dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL);
- if (!dbgfs_dirs) {
- debugfs_remove(dbgfs_root);
- return -ENOMEM;
- }
- dbgfs_dirs[0] = dbgfs_root;
-
- return 0;
-}
-
-/*
- * Functions for the initialization
- */
-
-static int __init damon_dbgfs_init(void)
-{
- int rc = -ENOMEM;
-
- mutex_lock(&damon_dbgfs_lock);
- dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL);
- if (!dbgfs_ctxs)
- goto out;
- dbgfs_ctxs[0] = dbgfs_new_ctx();
- if (!dbgfs_ctxs[0]) {
- kfree(dbgfs_ctxs);
- goto out;
- }
- dbgfs_nr_ctxs = 1;
-
- rc = __damon_dbgfs_init();
- if (rc) {
- kfree(dbgfs_ctxs[0]);
- kfree(dbgfs_ctxs);
- pr_err("%s: dbgfs init failed\n", __func__);
- }
-
-out:
- mutex_unlock(&damon_dbgfs_lock);
- return rc;
-}
-
-module_init(damon_dbgfs_init);
-
-#include "tests/dbgfs-kunit.h"
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 4af8fd4a390b..26d6a2adb9cd 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -198,7 +198,12 @@ static int damon_lru_sort_apply_parameters(void)
if (err)
return err;
- err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs);
+ if (!damon_lru_sort_mon_attrs.sample_interval) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = damon_set_attrs(param_ctx, &damon_lru_sort_mon_attrs);
if (err)
goto out;
@@ -230,6 +235,39 @@ out:
return err;
}
+static int damon_lru_sort_handle_commit_inputs(void)
+{
+ int err;
+
+ if (!commit_inputs)
+ return 0;
+
+ err = damon_lru_sort_apply_parameters();
+ commit_inputs = false;
+ return err;
+}
+
+static int damon_lru_sort_damon_call_fn(void *arg)
+{
+ struct damon_ctx *c = arg;
+ struct damos *s;
+
+ /* update the stats parameter */
+ damon_for_each_scheme(s, c) {
+ if (s->action == DAMOS_LRU_PRIO)
+ damon_lru_sort_hot_stat = s->stat;
+ else if (s->action == DAMOS_LRU_DEPRIO)
+ damon_lru_sort_cold_stat = s->stat;
+ }
+
+ return damon_lru_sort_handle_commit_inputs();
+}
+
+static struct damon_call_control call_control = {
+ .fn = damon_lru_sort_damon_call_fn,
+ .repeat = true,
+};
+
static int damon_lru_sort_turn(bool on)
{
int err;
@@ -249,7 +287,7 @@ static int damon_lru_sort_turn(bool on)
if (err)
return err;
kdamond_pid = ctx->kdamond->pid;
- return 0;
+ return damon_call(ctx, &call_control);
}
static int damon_lru_sort_enabled_store(const char *val,
@@ -288,52 +326,22 @@ module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
MODULE_PARM_DESC(enabled,
"Enable or disable DAMON_LRU_SORT (default: disabled)");
-static int damon_lru_sort_handle_commit_inputs(void)
-{
- int err;
-
- if (!commit_inputs)
- return 0;
-
- err = damon_lru_sort_apply_parameters();
- commit_inputs = false;
- return err;
-}
-
-static int damon_lru_sort_after_aggregation(struct damon_ctx *c)
-{
- struct damos *s;
-
- /* update the stats parameter */
- damon_for_each_scheme(s, c) {
- if (s->action == DAMOS_LRU_PRIO)
- damon_lru_sort_hot_stat = s->stat;
- else if (s->action == DAMOS_LRU_DEPRIO)
- damon_lru_sort_cold_stat = s->stat;
- }
-
- return damon_lru_sort_handle_commit_inputs();
-}
-
-static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c)
-{
- return damon_lru_sort_handle_commit_inputs();
-}
-
static int __init damon_lru_sort_init(void)
{
int err = damon_modules_new_paddr_ctx_target(&ctx, &target);
if (err)
- return err;
+ goto out;
- ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check;
- ctx->callback.after_aggregation = damon_lru_sort_after_aggregation;
+ call_control.data = ctx;
/* 'enabled' has set before this function, probably via command line */
if (enabled)
err = damon_lru_sort_turn(true);
+out:
+ if (err && enabled)
+ enabled = false;
return err;
}
diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c
index 7cf96574cde7..86d58f8c4f63 100644
--- a/mm/damon/modules-common.c
+++ b/mm/damon/modules-common.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Common Primitives for DAMON Modules
+ * Common Code for DAMON Modules
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h
index f49cdb417005..f103ad556368 100644
--- a/mm/damon/modules-common.h
+++ b/mm/damon/modules-common.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Common Primitives for DAMON Modules
+ * Common Code for DAMON Modules
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index d25d99cb5f2b..99321ff5cb92 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -1,15 +1,19 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Common Primitives for Data Access Monitoring
+ * Common Code for Data Access Monitoring
*
* Author: SeongJae Park <sj@kernel.org>
*/
+#include <linux/migrate.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include "../internal.h"
#include "ops-common.h"
/*
@@ -24,7 +28,7 @@ struct folio *damon_get_folio(unsigned long pfn)
struct page *page = pfn_to_online_page(pfn);
struct folio *folio;
- if (!page || PageTail(page))
+ if (!page)
return NULL;
folio = page_folio(page);
@@ -39,12 +43,29 @@ struct folio *damon_get_folio(unsigned long pfn)
void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr)
{
- struct folio *folio = damon_get_folio(pte_pfn(ptep_get(pte)));
+ pte_t pteval = ptep_get(pte);
+ struct folio *folio;
+ bool young = false;
+ unsigned long pfn;
+
+ if (likely(pte_present(pteval)))
+ pfn = pte_pfn(pteval);
+ else
+ pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+ folio = damon_get_folio(pfn);
if (!folio)
return;
- if (ptep_clear_young_notify(vma, addr, pte))
+ /*
+ * PFN swap PTEs, such as device-exclusive ones, that actually map pages
+ * are "old" from a CPU perspective. The MMU notifier takes care of any
+ * device aspects.
+ */
+ if (likely(pte_present(pteval)))
+ young |= ptep_test_and_clear_young(vma, addr, pte);
+ young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE);
+ if (young)
folio_set_young(folio);
folio_set_idle(folio);
@@ -119,3 +140,275 @@ int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
/* Return coldness of the region */
return DAMOS_MAX_SCORE - hotness;
}
+
+static bool damon_folio_mkold_one(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr, void *arg)
+{
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
+
+ while (page_vma_mapped_walk(&pvmw)) {
+ addr = pvmw.address;
+ if (pvmw.pte)
+ damon_ptep_mkold(pvmw.pte, vma, addr);
+ else
+ damon_pmdp_mkold(pvmw.pmd, vma, addr);
+ }
+ return true;
+}
+
+void damon_folio_mkold(struct folio *folio)
+{
+ struct rmap_walk_control rwc = {
+ .rmap_one = damon_folio_mkold_one,
+ .anon_lock = folio_lock_anon_vma_read,
+ };
+ bool need_lock;
+
+ if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
+ folio_set_idle(folio);
+ return;
+ }
+
+ need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
+ if (need_lock && !folio_trylock(folio))
+ return;
+
+ rmap_walk(folio, &rwc);
+
+ if (need_lock)
+ folio_unlock(folio);
+
+}
+
+static bool damon_folio_young_one(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr, void *arg)
+{
+ bool *accessed = arg;
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
+ pte_t pte;
+
+ *accessed = false;
+ while (page_vma_mapped_walk(&pvmw)) {
+ addr = pvmw.address;
+ if (pvmw.pte) {
+ pte = ptep_get(pvmw.pte);
+
+ /*
+ * PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages are "old" from a CPU perspective.
+ * The MMU notifier takes care of any device aspects.
+ */
+ *accessed = (pte_present(pte) && pte_young(pte)) ||
+ !folio_test_idle(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr);
+ } else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ *accessed = pmd_young(pmdp_get(pvmw.pmd)) ||
+ !folio_test_idle(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr);
+#else
+ WARN_ON_ONCE(1);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ }
+ if (*accessed) {
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ }
+
+ /* If accessed, stop walking */
+ return *accessed == false;
+}
+
+bool damon_folio_young(struct folio *folio)
+{
+ bool accessed = false;
+ struct rmap_walk_control rwc = {
+ .arg = &accessed,
+ .rmap_one = damon_folio_young_one,
+ .anon_lock = folio_lock_anon_vma_read,
+ };
+ bool need_lock;
+
+ if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
+ if (folio_test_idle(folio))
+ return false;
+ else
+ return true;
+ }
+
+ need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
+ if (need_lock && !folio_trylock(folio))
+ return false;
+
+ rmap_walk(folio, &rwc);
+
+ if (need_lock)
+ folio_unlock(folio);
+
+ return accessed;
+}
+
+bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio)
+{
+ bool matched = false;
+ struct mem_cgroup *memcg;
+ size_t folio_sz;
+
+ switch (filter->type) {
+ case DAMOS_FILTER_TYPE_ANON:
+ matched = folio_test_anon(folio);
+ break;
+ case DAMOS_FILTER_TYPE_ACTIVE:
+ matched = folio_test_active(folio);
+ break;
+ case DAMOS_FILTER_TYPE_MEMCG:
+ rcu_read_lock();
+ memcg = folio_memcg_check(folio);
+ if (!memcg)
+ matched = false;
+ else
+ matched = filter->memcg_id == mem_cgroup_id(memcg);
+ rcu_read_unlock();
+ break;
+ case DAMOS_FILTER_TYPE_YOUNG:
+ matched = damon_folio_young(folio);
+ if (matched)
+ damon_folio_mkold(folio);
+ break;
+ case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE:
+ folio_sz = folio_size(folio);
+ matched = filter->sz_range.min <= folio_sz &&
+ folio_sz <= filter->sz_range.max;
+ break;
+ case DAMOS_FILTER_TYPE_UNMAPPED:
+ matched = !folio_mapped(folio) || !folio_raw_mapping(folio);
+ break;
+ default:
+ break;
+ }
+
+ return matched == filter->matching;
+}
+
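An illustrative sketch (not part of the patch) of how the return value of damos_folio_filter_match() combines with each filter's allow flag, paraphrasing the anon case of damos_pa_filter_out() further down in this diff; the helper name is hypothetical, while damos_for_each_ops_filter(), the filter fields, and ops_filters_default_reject are taken from the patch.

#include <linux/damon.h>

/* Hypothetical helper: would this scheme skip anonymous folios? */
static bool example_rejects_anon(struct damos *scheme)
{
	struct damos_filter *f;

	damos_for_each_ops_filter(f, scheme) {
		/* e.g. damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false) */
		if (f->type == DAMOS_FILTER_TYPE_ANON && f->matching)
			return !f->allow;	/* matched: allow decides the fate */
	}
	/* no filter matched: fall back to the scheme-wide default */
	return scheme->ops_filters_default_reject;
}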
+static unsigned int __damon_migrate_folio_list(
+ struct list_head *migrate_folios, struct pglist_data *pgdat,
+ int target_nid)
+{
+ unsigned int nr_succeeded = 0;
+ struct migration_target_control mtc = {
+ /*
+ * Allocate from 'node', or fail quickly and quietly.
+ * When this happens, 'page' will likely just be discarded
+ * instead of migrated.
+ */
+ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
+ __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT,
+ .nid = target_nid,
+ };
+
+ if (pgdat->node_id == target_nid || target_nid == NUMA_NO_NODE)
+ return 0;
+
+ if (list_empty(migrate_folios))
+ return 0;
+
+ /* Migration ignores all cpuset and mempolicy settings */
+ migrate_pages(migrate_folios, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_ASYNC, MR_DAMON,
+ &nr_succeeded);
+
+ return nr_succeeded;
+}
+
+static unsigned int damon_migrate_folio_list(struct list_head *folio_list,
+ struct pglist_data *pgdat,
+ int target_nid)
+{
+ unsigned int nr_migrated = 0;
+ struct folio *folio;
+ LIST_HEAD(ret_folios);
+ LIST_HEAD(migrate_folios);
+
+ while (!list_empty(folio_list)) {
+ struct folio *folio;
+
+ cond_resched();
+
+ folio = lru_to_folio(folio_list);
+ list_del(&folio->lru);
+
+ if (!folio_trylock(folio))
+ goto keep;
+
+ /* Relocate its contents to another node. */
+ list_add(&folio->lru, &migrate_folios);
+ folio_unlock(folio);
+ continue;
+keep:
+ list_add(&folio->lru, &ret_folios);
+ }
+ /* 'folio_list' is always empty here */
+
+ /* Migrate folios selected for migration */
+ nr_migrated += __damon_migrate_folio_list(
+ &migrate_folios, pgdat, target_nid);
+ /*
+ * Folios that could not be migrated are still in @migrate_folios. Add
+ * those back on @folio_list
+ */
+ if (!list_empty(&migrate_folios))
+ list_splice_init(&migrate_folios, folio_list);
+
+ try_to_unmap_flush();
+
+ list_splice(&ret_folios, folio_list);
+
+ while (!list_empty(folio_list)) {
+ folio = lru_to_folio(folio_list);
+ list_del(&folio->lru);
+ folio_putback_lru(folio);
+ }
+
+ return nr_migrated;
+}
+
+unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid)
+{
+ int nid;
+ unsigned long nr_migrated = 0;
+ LIST_HEAD(node_folio_list);
+ unsigned int noreclaim_flag;
+
+ if (list_empty(folio_list))
+ return nr_migrated;
+
+ if (target_nid < 0 || target_nid >= MAX_NUMNODES ||
+ !node_state(target_nid, N_MEMORY))
+ return nr_migrated;
+
+ noreclaim_flag = memalloc_noreclaim_save();
+
+ nid = folio_nid(lru_to_folio(folio_list));
+ do {
+ struct folio *folio = lru_to_folio(folio_list);
+
+ if (nid == folio_nid(folio)) {
+ list_move(&folio->lru, &node_folio_list);
+ continue;
+ }
+
+ nr_migrated += damon_migrate_folio_list(&node_folio_list,
+ NODE_DATA(nid),
+ target_nid);
+ nid = folio_nid(lru_to_folio(folio_list));
+ } while (!list_empty(folio_list));
+
+ nr_migrated += damon_migrate_folio_list(&node_folio_list,
+ NODE_DATA(nid),
+ target_nid);
+
+ memalloc_noreclaim_restore(noreclaim_flag);
+
+ return nr_migrated;
+}
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index 18d837d11bce..61ad54aaf256 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Common Primitives for Data Access Monitoring
+ * Common Code for Data Access Monitoring
*
* Author: SeongJae Park <sj@kernel.org>
*/
@@ -11,8 +11,13 @@ struct folio *damon_get_folio(unsigned long pfn);
void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr);
void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr);
+void damon_folio_mkold(struct folio *folio);
+bool damon_folio_young(struct folio *folio);
int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s);
int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s);
+
+bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio);
+unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index a9ff35341d65..53a55c5114fb 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * DAMON Primitives for The Physical Address Space
+ * DAMON Code for The Physical Address Space
*
* Author: SeongJae Park <sj@kernel.org>
*/
@@ -13,51 +13,11 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/memory-tiers.h>
-#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include "../internal.h"
#include "ops-common.h"
-static bool damon_folio_mkold_one(struct folio *folio,
- struct vm_area_struct *vma, unsigned long addr, void *arg)
-{
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
-
- while (page_vma_mapped_walk(&pvmw)) {
- addr = pvmw.address;
- if (pvmw.pte)
- damon_ptep_mkold(pvmw.pte, vma, addr);
- else
- damon_pmdp_mkold(pvmw.pmd, vma, addr);
- }
- return true;
-}
-
-static void damon_folio_mkold(struct folio *folio)
-{
- struct rmap_walk_control rwc = {
- .rmap_one = damon_folio_mkold_one,
- .anon_lock = folio_lock_anon_vma_read,
- };
- bool need_lock;
-
- if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
- folio_set_idle(folio);
- return;
- }
-
- need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
- if (need_lock && !folio_trylock(folio))
- return;
-
- rmap_walk(folio, &rwc);
-
- if (need_lock)
- folio_unlock(folio);
-
-}
-
static void damon_pa_mkold(unsigned long paddr)
{
struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
@@ -87,67 +47,6 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
}
}
-static bool damon_folio_young_one(struct folio *folio,
- struct vm_area_struct *vma, unsigned long addr, void *arg)
-{
- bool *accessed = arg;
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
-
- *accessed = false;
- while (page_vma_mapped_walk(&pvmw)) {
- addr = pvmw.address;
- if (pvmw.pte) {
- *accessed = pte_young(ptep_get(pvmw.pte)) ||
- !folio_test_idle(folio) ||
- mmu_notifier_test_young(vma->vm_mm, addr);
- } else {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- *accessed = pmd_young(pmdp_get(pvmw.pmd)) ||
- !folio_test_idle(folio) ||
- mmu_notifier_test_young(vma->vm_mm, addr);
-#else
- WARN_ON_ONCE(1);
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
- }
- if (*accessed) {
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
- }
-
- /* If accessed, stop walking */
- return *accessed == false;
-}
-
-static bool damon_folio_young(struct folio *folio)
-{
- bool accessed = false;
- struct rmap_walk_control rwc = {
- .arg = &accessed,
- .rmap_one = damon_folio_young_one,
- .anon_lock = folio_lock_anon_vma_read,
- };
- bool need_lock;
-
- if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
- if (folio_test_idle(folio))
- return false;
- else
- return true;
- }
-
- need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
- if (need_lock && !folio_trylock(folio))
- return false;
-
- rmap_walk(folio, &rwc);
-
- if (need_lock)
- folio_unlock(folio);
-
- return accessed;
-}
-
static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
{
struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
@@ -198,37 +97,6 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
return max_nr_accesses;
}
-static bool __damos_pa_filter_out(struct damos_filter *filter,
- struct folio *folio)
-{
- bool matched = false;
- struct mem_cgroup *memcg;
-
- switch (filter->type) {
- case DAMOS_FILTER_TYPE_ANON:
- matched = folio_test_anon(folio);
- break;
- case DAMOS_FILTER_TYPE_MEMCG:
- rcu_read_lock();
- memcg = folio_memcg_check(folio);
- if (!memcg)
- matched = false;
- else
- matched = filter->memcg_id == mem_cgroup_id(memcg);
- rcu_read_unlock();
- break;
- case DAMOS_FILTER_TYPE_YOUNG:
- matched = damon_folio_young(folio);
- if (matched)
- damon_folio_mkold(folio);
- break;
- default:
- break;
- }
-
- return matched == filter->matching;
-}
-
/*
* damos_pa_filter_out - Return true if the page should be filtered out.
*/
@@ -236,42 +104,63 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio)
{
struct damos_filter *filter;
- damos_for_each_filter(filter, scheme) {
- if (__damos_pa_filter_out(filter, folio))
- return true;
+ if (scheme->core_filters_allowed)
+ return false;
+
+ damos_for_each_ops_filter(filter, scheme) {
+ if (damos_folio_filter_match(filter, folio))
+ return !filter->allow;
+ }
+ return scheme->ops_filters_default_reject;
+}
+
+static bool damon_pa_invalid_damos_folio(struct folio *folio, struct damos *s)
+{
+ if (!folio)
+ return true;
+ if (folio == s->last_applied) {
+ folio_put(folio);
+ return true;
}
return false;
}
-static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
+static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
+ unsigned long *sz_filter_passed)
{
unsigned long addr, applied;
LIST_HEAD(folio_list);
bool install_young_filter = true;
struct damos_filter *filter;
+ struct folio *folio;
/* check access in page level again by default */
- damos_for_each_filter(filter, s) {
+ damos_for_each_ops_filter(filter, s) {
if (filter->type == DAMOS_FILTER_TYPE_YOUNG) {
install_young_filter = false;
break;
}
}
if (install_young_filter) {
- filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, true);
+ filter = damos_new_filter(
+ DAMOS_FILTER_TYPE_YOUNG, true, false);
if (!filter)
return 0;
damos_add_filter(s, filter);
}
- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct folio *folio = damon_get_folio(PHYS_PFN(addr));
-
- if (!folio)
+ addr = r->ar.start;
+ while (addr < r->ar.end) {
+ folio = damon_get_folio(PHYS_PFN(addr));
+ if (damon_pa_invalid_damos_folio(folio, s)) {
+ addr += PAGE_SIZE;
continue;
+ }
if (damos_pa_filter_out(s, folio))
goto put_folio;
+ else
+ *sz_filter_passed += folio_size(folio);
folio_clear_referenced(folio);
folio_test_clear_young(folio);
@@ -282,28 +171,36 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
else
list_add(&folio->lru, &folio_list);
put_folio:
+ addr += folio_size(folio);
folio_put(folio);
}
if (install_young_filter)
damos_destroy_filter(filter);
applied = reclaim_pages(&folio_list);
cond_resched();
+ s->last_applied = folio;
return applied * PAGE_SIZE;
}
static inline unsigned long damon_pa_mark_accessed_or_deactivate(
- struct damon_region *r, struct damos *s, bool mark_accessed)
+ struct damon_region *r, struct damos *s, bool mark_accessed,
+ unsigned long *sz_filter_passed)
{
unsigned long addr, applied = 0;
+ struct folio *folio;
- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct folio *folio = damon_get_folio(PHYS_PFN(addr));
-
- if (!folio)
+ addr = r->ar.start;
+ while (addr < r->ar.end) {
+ folio = damon_get_folio(PHYS_PFN(addr));
+ if (damon_pa_invalid_damos_folio(folio, s)) {
+ addr += PAGE_SIZE;
continue;
+ }
if (damos_pa_filter_out(s, folio))
goto put_folio;
+ else
+ *sz_filter_passed += folio_size(folio);
if (mark_accessed)
folio_mark_accessed(folio);
@@ -311,186 +208,111 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate(
folio_deactivate(folio);
applied += folio_nr_pages(folio);
put_folio:
+ addr += folio_size(folio);
folio_put(folio);
}
+ s->last_applied = folio;
return applied * PAGE_SIZE;
}
static unsigned long damon_pa_mark_accessed(struct damon_region *r,
- struct damos *s)
+ struct damos *s, unsigned long *sz_filter_passed)
{
- return damon_pa_mark_accessed_or_deactivate(r, s, true);
+ return damon_pa_mark_accessed_or_deactivate(r, s, true,
+ sz_filter_passed);
}
static unsigned long damon_pa_deactivate_pages(struct damon_region *r,
- struct damos *s)
-{
- return damon_pa_mark_accessed_or_deactivate(r, s, false);
-}
-
-static unsigned int __damon_pa_migrate_folio_list(
- struct list_head *migrate_folios, struct pglist_data *pgdat,
- int target_nid)
+ struct damos *s, unsigned long *sz_filter_passed)
{
- unsigned int nr_succeeded = 0;
- nodemask_t allowed_mask = NODE_MASK_NONE;
- struct migration_target_control mtc = {
- /*
- * Allocate from 'node', or fail quickly and quietly.
- * When this happens, 'page' will likely just be discarded
- * instead of migrated.
- */
- .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
- __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT,
- .nid = target_nid,
- .nmask = &allowed_mask
- };
-
- if (pgdat->node_id == target_nid || target_nid == NUMA_NO_NODE)
- return 0;
-
- if (list_empty(migrate_folios))
- return 0;
-
- /* Migration ignores all cpuset and mempolicy settings */
- migrate_pages(migrate_folios, alloc_migrate_folio, NULL,
- (unsigned long)&mtc, MIGRATE_ASYNC, MR_DAMON,
- &nr_succeeded);
-
- return nr_succeeded;
+ return damon_pa_mark_accessed_or_deactivate(r, s, false,
+ sz_filter_passed);
}
-static unsigned int damon_pa_migrate_folio_list(struct list_head *folio_list,
- struct pglist_data *pgdat,
- int target_nid)
+static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
+ unsigned long *sz_filter_passed)
{
- unsigned int nr_migrated = 0;
+ unsigned long addr, applied;
+ LIST_HEAD(folio_list);
struct folio *folio;
- LIST_HEAD(ret_folios);
- LIST_HEAD(migrate_folios);
-
- while (!list_empty(folio_list)) {
- struct folio *folio;
- cond_resched();
-
- folio = lru_to_folio(folio_list);
- list_del(&folio->lru);
+ addr = r->ar.start;
+ while (addr < r->ar.end) {
+ folio = damon_get_folio(PHYS_PFN(addr));
+ if (damon_pa_invalid_damos_folio(folio, s)) {
+ addr += PAGE_SIZE;
+ continue;
+ }
- if (!folio_trylock(folio))
- goto keep;
+ if (damos_pa_filter_out(s, folio))
+ goto put_folio;
+ else
+ *sz_filter_passed += folio_size(folio);
- /* Relocate its contents to another node. */
- list_add(&folio->lru, &migrate_folios);
- folio_unlock(folio);
- continue;
-keep:
- list_add(&folio->lru, &ret_folios);
- }
- /* 'folio_list' is always empty here */
-
- /* Migrate folios selected for migration */
- nr_migrated += __damon_pa_migrate_folio_list(
- &migrate_folios, pgdat, target_nid);
- /*
- * Folios that could not be migrated are still in @migrate_folios. Add
- * those back on @folio_list
- */
- if (!list_empty(&migrate_folios))
- list_splice_init(&migrate_folios, folio_list);
-
- try_to_unmap_flush();
-
- list_splice(&ret_folios, folio_list);
-
- while (!list_empty(folio_list)) {
- folio = lru_to_folio(folio_list);
- list_del(&folio->lru);
- folio_putback_lru(folio);
+ if (!folio_isolate_lru(folio))
+ goto put_folio;
+ list_add(&folio->lru, &folio_list);
+put_folio:
+ addr += folio_size(folio);
+ folio_put(folio);
}
-
- return nr_migrated;
+ applied = damon_migrate_pages(&folio_list, s->target_nid);
+ cond_resched();
+ s->last_applied = folio;
+ return applied * PAGE_SIZE;
}
-static unsigned long damon_pa_migrate_pages(struct list_head *folio_list,
- int target_nid)
+static bool damon_pa_scheme_has_filter(struct damos *s)
{
- int nid;
- unsigned long nr_migrated = 0;
- LIST_HEAD(node_folio_list);
- unsigned int noreclaim_flag;
-
- if (list_empty(folio_list))
- return nr_migrated;
-
- noreclaim_flag = memalloc_noreclaim_save();
-
- nid = folio_nid(lru_to_folio(folio_list));
- do {
- struct folio *folio = lru_to_folio(folio_list);
-
- if (nid == folio_nid(folio)) {
- list_move(&folio->lru, &node_folio_list);
- continue;
- }
+ struct damos_filter *f;
- nr_migrated += damon_pa_migrate_folio_list(&node_folio_list,
- NODE_DATA(nid),
- target_nid);
- nid = folio_nid(lru_to_folio(folio_list));
- } while (!list_empty(folio_list));
-
- nr_migrated += damon_pa_migrate_folio_list(&node_folio_list,
- NODE_DATA(nid),
- target_nid);
-
- memalloc_noreclaim_restore(noreclaim_flag);
-
- return nr_migrated;
+ damos_for_each_ops_filter(f, s)
+ return true;
+ return false;
}
-static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s)
+static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s,
+ unsigned long *sz_filter_passed)
{
- unsigned long addr, applied;
- LIST_HEAD(folio_list);
+ unsigned long addr;
+ struct folio *folio;
- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct folio *folio = damon_get_folio(PHYS_PFN(addr));
+ if (!damon_pa_scheme_has_filter(s))
+ return 0;
- if (!folio)
+ addr = r->ar.start;
+ while (addr < r->ar.end) {
+ folio = damon_get_folio(PHYS_PFN(addr));
+ if (damon_pa_invalid_damos_folio(folio, s)) {
+ addr += PAGE_SIZE;
continue;
+ }
- if (damos_pa_filter_out(s, folio))
- goto put_folio;
-
- if (!folio_isolate_lru(folio))
- goto put_folio;
- list_add(&folio->lru, &folio_list);
-put_folio:
+ if (!damos_pa_filter_out(s, folio))
+ *sz_filter_passed += folio_size(folio);
+ addr += folio_size(folio);
folio_put(folio);
}
- applied = damon_pa_migrate_pages(&folio_list, s->target_nid);
- cond_resched();
- return applied * PAGE_SIZE;
+ s->last_applied = folio;
+ return 0;
}
-
static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
- struct damos *scheme)
+ struct damos *scheme, unsigned long *sz_filter_passed)
{
switch (scheme->action) {
case DAMOS_PAGEOUT:
- return damon_pa_pageout(r, scheme);
+ return damon_pa_pageout(r, scheme, sz_filter_passed);
case DAMOS_LRU_PRIO:
- return damon_pa_mark_accessed(r, scheme);
+ return damon_pa_mark_accessed(r, scheme, sz_filter_passed);
case DAMOS_LRU_DEPRIO:
- return damon_pa_deactivate_pages(r, scheme);
+ return damon_pa_deactivate_pages(r, scheme, sz_filter_passed);
case DAMOS_MIGRATE_HOT:
case DAMOS_MIGRATE_COLD:
- return damon_pa_migrate(r, scheme);
+ return damon_pa_migrate(r, scheme, sz_filter_passed);
case DAMOS_STAT:
- break;
+ return damon_pa_stat(r, scheme, sz_filter_passed);
default:
/* DAMOS actions that not yet supported by 'paddr'. */
break;
@@ -528,7 +350,6 @@ static int __init damon_pa_initcall(void)
.update = NULL,
.prepare_access_checks = damon_pa_prepare_access_checks,
.check_accesses = damon_pa_check_accesses,
- .reset_aggregated = NULL,
.target_valid = NULL,
.cleanup = NULL,
.apply_scheme = damon_pa_apply_scheme,
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 9e0077a9404e..fb7c982a0018 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -194,7 +194,12 @@ static int damon_reclaim_apply_parameters(void)
if (err)
return err;
- err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs);
+ if (!damon_reclaim_mon_attrs.aggr_interval) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs);
if (err)
goto out;
@@ -202,7 +207,7 @@ static int damon_reclaim_apply_parameters(void)
scheme = damon_reclaim_new_scheme();
if (!scheme)
goto out;
- damon_set_schemes(ctx, &scheme, 1);
+ damon_set_schemes(param_ctx, &scheme, 1);
if (quota_mem_pressure_us) {
goal = damos_new_quota_goal(DAMOS_QUOTA_SOME_MEM_PSI_US,
@@ -221,7 +226,7 @@ static int damon_reclaim_apply_parameters(void)
}
if (skip_anon) {
- filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
+ filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false);
if (!filter)
goto out;
damos_add_filter(scheme, filter);
@@ -238,6 +243,35 @@ out:
return err;
}
+static int damon_reclaim_handle_commit_inputs(void)
+{
+ int err;
+
+ if (!commit_inputs)
+ return 0;
+
+ err = damon_reclaim_apply_parameters();
+ commit_inputs = false;
+ return err;
+}
+
+static int damon_reclaim_damon_call_fn(void *arg)
+{
+ struct damon_ctx *c = arg;
+ struct damos *s;
+
+ /* update the stats parameter */
+ damon_for_each_scheme(s, c)
+ damon_reclaim_stat = s->stat;
+
+ return damon_reclaim_handle_commit_inputs();
+}
+
+static struct damon_call_control call_control = {
+ .fn = damon_reclaim_damon_call_fn,
+ .repeat = true,
+};
+
static int damon_reclaim_turn(bool on)
{
int err;
@@ -257,7 +291,7 @@ static int damon_reclaim_turn(bool on)
if (err)
return err;
kdamond_pid = ctx->kdamond->pid;
- return 0;
+ return damon_call(ctx, &call_control);
}
static int damon_reclaim_enabled_store(const char *val,
@@ -296,48 +330,22 @@ module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
MODULE_PARM_DESC(enabled,
"Enable or disable DAMON_RECLAIM (default: disabled)");
-static int damon_reclaim_handle_commit_inputs(void)
-{
- int err;
-
- if (!commit_inputs)
- return 0;
-
- err = damon_reclaim_apply_parameters();
- commit_inputs = false;
- return err;
-}
-
-static int damon_reclaim_after_aggregation(struct damon_ctx *c)
-{
- struct damos *s;
-
- /* update the stats parameter */
- damon_for_each_scheme(s, c)
- damon_reclaim_stat = s->stat;
-
- return damon_reclaim_handle_commit_inputs();
-}
-
-static int damon_reclaim_after_wmarks_check(struct damon_ctx *c)
-{
- return damon_reclaim_handle_commit_inputs();
-}
-
static int __init damon_reclaim_init(void)
{
int err = damon_modules_new_paddr_ctx_target(&ctx, &target);
if (err)
- return err;
+ goto out;
- ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check;
- ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
+ call_control.data = ctx;
/* 'enabled' has been set before this function, probably via command line */
if (enabled)
err = damon_reclaim_turn(true);
+out:
+ if (err && enabled)
+ enabled = false;
return err;
}
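The DAMON_RECLAIM changes above drop the ->after_aggregation()/->after_wmarks_check() context callbacks in favor of a repeating damon_call(). A minimal sketch of that pattern, using only the fields this patch itself touches (.fn, .data, .repeat); the example_* names are placeholders:

static int example_repeat_fn(void *data)
{
	/* 'data' is the damon_ctx set below; read stats, commit inputs, etc. */
	return 0;
}

static struct damon_call_control example_call_control = {
	.fn = example_repeat_fn,
	.repeat = true,	/* keep being invoked while the kdamond runs */
};

static int example_turn_on(struct damon_ctx *ctx)
{
	int err = damon_start(&ctx, 1, true);

	if (err)
		return err;
	example_call_control.data = ctx;
	return damon_call(ctx, &example_call_control);
}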
diff --git a/mm/damon/stat.c b/mm/damon/stat.c
new file mode 100644
index 000000000000..d12e423da9c2
--- /dev/null
+++ b/mm/damon/stat.c
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shows data access monitoring results in simple metrics.
+ */
+
+#define pr_fmt(fmt) "damon-stat: " fmt
+
+#include <linux/damon.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sort.h>
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "damon_stat."
+
+static int damon_stat_enabled_store(
+ const char *val, const struct kernel_param *kp);
+
+static const struct kernel_param_ops enabled_param_ops = {
+ .set = damon_stat_enabled_store,
+ .get = param_get_bool,
+};
+
+static bool enabled __read_mostly = IS_ENABLED(
+ CONFIG_DAMON_STAT_ENABLED_DEFAULT);
+module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
+MODULE_PARM_DESC(enabled, "Enable or disable DAMON_STAT");
+
+static unsigned long estimated_memory_bandwidth __read_mostly;
+module_param(estimated_memory_bandwidth, ulong, 0400);
+MODULE_PARM_DESC(estimated_memory_bandwidth,
+ "Estimated memory bandwidth usage in bytes per second");
+
+static unsigned long memory_idle_ms_percentiles[101] __read_mostly = {0,};
+module_param_array(memory_idle_ms_percentiles, ulong, NULL, 0400);
+MODULE_PARM_DESC(memory_idle_ms_percentiles,
+ "Memory idle time percentiles in milliseconds");
+
+static struct damon_ctx *damon_stat_context;
+
+static unsigned long damon_stat_last_refresh_jiffies;
+
+static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ unsigned long access_bytes = 0;
+
+ damon_for_each_target(t, c) {
+ damon_for_each_region(r, t)
+ access_bytes += (r->ar.end - r->ar.start) *
+ r->nr_accesses;
+ }
+ estimated_memory_bandwidth = access_bytes * USEC_PER_MSEC *
+ MSEC_PER_SEC / c->attrs.aggr_interval;
+}
+
+static unsigned int damon_stat_idletime(const struct damon_region *r)
+{
+ if (r->nr_accesses)
+ return 0;
+ return r->age + 1;
+}
+
+static int damon_stat_cmp_regions(const void *a, const void *b)
+{
+ const struct damon_region *ra = *(const struct damon_region **)a;
+ const struct damon_region *rb = *(const struct damon_region **)b;
+
+ return damon_stat_idletime(ra) - damon_stat_idletime(rb);
+}
+
+static int damon_stat_sort_regions(struct damon_ctx *c,
+ struct damon_region ***sorted_ptr, int *nr_regions_ptr,
+ unsigned long *total_sz_ptr)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ struct damon_region **region_pointers;
+ unsigned int nr_regions = 0;
+ unsigned long total_sz = 0;
+
+ damon_for_each_target(t, c) {
+ /* there is only one target */
+ region_pointers = kmalloc_array(damon_nr_regions(t),
+ sizeof(*region_pointers), GFP_KERNEL);
+ if (!region_pointers)
+ return -ENOMEM;
+ damon_for_each_region(r, t) {
+ region_pointers[nr_regions++] = r;
+ total_sz += r->ar.end - r->ar.start;
+ }
+ }
+ sort(region_pointers, nr_regions, sizeof(*region_pointers),
+ damon_stat_cmp_regions, NULL);
+ *sorted_ptr = region_pointers;
+ *nr_regions_ptr = nr_regions;
+ *total_sz_ptr = total_sz;
+ return 0;
+}
+
+static void damon_stat_set_idletime_percentiles(struct damon_ctx *c)
+{
+ struct damon_region **sorted_regions, *region;
+ int nr_regions;
+ unsigned long total_sz, accounted_bytes = 0;
+ int err, i, next_percentile = 0;
+
+ err = damon_stat_sort_regions(c, &sorted_regions, &nr_regions,
+ &total_sz);
+ if (err)
+ return;
+ for (i = 0; i < nr_regions; i++) {
+ region = sorted_regions[i];
+ accounted_bytes += region->ar.end - region->ar.start;
+ while (next_percentile <= accounted_bytes * 100 / total_sz)
+ memory_idle_ms_percentiles[next_percentile++] =
+ damon_stat_idletime(region) *
+ c->attrs.aggr_interval / USEC_PER_MSEC;
+ }
+ kfree(sorted_regions);
+}
+
+static int damon_stat_damon_call_fn(void *data)
+{
+ struct damon_ctx *c = data;
+
+ /* avoid unnecessarily frequent stat update */
+ if (time_before_eq(jiffies, damon_stat_last_refresh_jiffies +
+ msecs_to_jiffies(5 * MSEC_PER_SEC)))
+ return 0;
+ damon_stat_last_refresh_jiffies = jiffies;
+
+ damon_stat_set_estimated_memory_bandwidth(c);
+ damon_stat_set_idletime_percentiles(c);
+ return 0;
+}
+
+static struct damon_ctx *damon_stat_build_ctx(void)
+{
+ struct damon_ctx *ctx;
+ struct damon_attrs attrs;
+ struct damon_target *target;
+ unsigned long start = 0, end = 0;
+
+ ctx = damon_new_ctx();
+ if (!ctx)
+ return NULL;
+ attrs = (struct damon_attrs) {
+ .sample_interval = 5 * USEC_PER_MSEC,
+ .aggr_interval = 100 * USEC_PER_MSEC,
+ .ops_update_interval = 60 * USEC_PER_MSEC * MSEC_PER_SEC,
+ .min_nr_regions = 10,
+ .max_nr_regions = 1000,
+ };
+ /*
+	 * auto-tune the sampling and aggregation intervals, aiming for a 4%
+	 * DAMON-observed access ratio and keeping the sampling interval in the
+	 * [5ms, 10s] range.
+ */
+ attrs.intervals_goal = (struct damon_intervals_goal) {
+ .access_bp = 400, .aggrs = 3,
+ .min_sample_us = 5000, .max_sample_us = 10000000,
+ };
+ if (damon_set_attrs(ctx, &attrs))
+ goto free_out;
+
+ /*
+	 * auto-tune the sampling and aggregation intervals, aiming for a 4%
+	 * DAMON-observed access ratio and keeping the sampling interval in the
+	 * [5ms, 10s] range.
+ */
+ ctx->attrs.intervals_goal = (struct damon_intervals_goal) {
+ .access_bp = 400, .aggrs = 3,
+ .min_sample_us = 5000, .max_sample_us = 10000000,
+ };
+ if (damon_select_ops(ctx, DAMON_OPS_PADDR))
+ goto free_out;
+
+ target = damon_new_target();
+ if (!target)
+ goto free_out;
+ damon_add_target(ctx, target);
+ if (damon_set_region_biggest_system_ram_default(target, &start, &end))
+ goto free_out;
+ return ctx;
+free_out:
+ damon_destroy_ctx(ctx);
+ return NULL;
+}
+
+static struct damon_call_control call_control = {
+ .fn = damon_stat_damon_call_fn,
+ .repeat = true,
+};
+
+static int damon_stat_start(void)
+{
+ int err;
+
+ damon_stat_context = damon_stat_build_ctx();
+ if (!damon_stat_context)
+ return -ENOMEM;
+ err = damon_start(&damon_stat_context, 1, true);
+ if (err)
+ return err;
+
+ damon_stat_last_refresh_jiffies = jiffies;
+ call_control.data = damon_stat_context;
+ return damon_call(damon_stat_context, &call_control);
+}
+
+static void damon_stat_stop(void)
+{
+ damon_stop(&damon_stat_context, 1);
+ damon_destroy_ctx(damon_stat_context);
+}
+
+static bool damon_stat_init_called;
+
+static int damon_stat_enabled_store(
+ const char *val, const struct kernel_param *kp)
+{
+ bool is_enabled = enabled;
+ int err;
+
+ err = kstrtobool(val, &enabled);
+ if (err)
+ return err;
+
+ if (is_enabled == enabled)
+ return 0;
+
+ if (!damon_stat_init_called)
+ /*
+		 * Probably called from command line parsing (parse_args()).
+		 * Cannot call damon_new_ctx() yet; let damon_stat_init() handle it.
+ */
+ return 0;
+
+ if (enabled) {
+ err = damon_stat_start();
+ if (err)
+ enabled = false;
+ return err;
+ }
+ damon_stat_stop();
+ return 0;
+}
+
+static int __init damon_stat_init(void)
+{
+ int err = 0;
+
+ damon_stat_init_called = true;
+
+ /* probably set via command line */
+ if (enabled)
+ err = damon_stat_start();
+
+ if (err && enabled)
+ enabled = false;
+ return err;
+}
+
+module_init(damon_stat_init);
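As a worked example of the estimate in damon_stat_set_estimated_memory_bandwidth() above (made-up numbers, using the module's initial 100 ms aggregation interval before auto-tuning): a single 64 MiB region observed with nr_accesses == 3 yields

/*
 *	access_bytes = 64 MiB * 3
 *	bandwidth    = access_bytes * USEC_PER_MSEC * MSEC_PER_SEC / aggr_interval
 *		     = 64 MiB * 3 * 1,000,000 / 100,000	(aggr_interval is in us)
 *		     = 1,920 MiB/s
 */

The percentile array is filled along similar lines: regions are sorted by idle time ((age + 1) aggregation intervals for unaccessed regions, 0 otherwise), and each percentile takes the idle time of the region at which the accumulated region size crosses that percentage of the total monitored size.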
diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c
index 70edf45c2174..ffaf285e241a 100644
--- a/mm/damon/sysfs-common.c
+++ b/mm/damon/sysfs-common.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Common Primitives for DAMON Sysfs Interface
+ * Common Code for DAMON Sysfs Interface
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
index 9a18f3c535d3..2099adee11d0 100644
--- a/mm/damon/sysfs-common.h
+++ b/mm/damon/sysfs-common.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Common Primitives for DAMON Sysfs Interface
+ * Common Code for DAMON Sysfs Interface
*
* Author: SeongJae Park <sj@kernel.org>
*/
@@ -45,19 +45,13 @@ void damon_sysfs_schemes_update_stats(
struct damon_sysfs_schemes *sysfs_schemes,
struct damon_ctx *ctx);
-int damon_sysfs_schemes_update_regions_start(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx, bool total_bytes_only);
-
-void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx);
-
-bool damos_sysfs_regions_upd_done(void);
-
-int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx);
+void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx, struct damon_target *t,
+ struct damon_region *r, struct damos *s,
+ bool total_bytes_only, unsigned long sz_filter_passed);
int damon_sysfs_schemes_clear_regions(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx);
+ struct damon_sysfs_schemes *sysfs_schemes);
int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes,
struct damon_ctx *ctx);
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index b095457380b5..6536f16006c9 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -19,6 +19,7 @@ struct damon_sysfs_scheme_region {
struct damon_addr_range ar;
unsigned int nr_accesses;
unsigned int age;
+ unsigned long sz_filter_passed;
struct list_head list;
};
@@ -74,6 +75,15 @@ static ssize_t age_show(struct kobject *kobj, struct kobj_attribute *attr,
return sysfs_emit(buf, "%u\n", region->age);
}
+static ssize_t sz_filter_passed_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_region *region = container_of(kobj,
+ struct damon_sysfs_scheme_region, kobj);
+
+ return sysfs_emit(buf, "%lu\n", region->sz_filter_passed);
+}
+
static void damon_sysfs_scheme_region_release(struct kobject *kobj)
{
struct damon_sysfs_scheme_region *region = container_of(kobj,
@@ -95,11 +105,15 @@ static struct kobj_attribute damon_sysfs_scheme_region_nr_accesses_attr =
static struct kobj_attribute damon_sysfs_scheme_region_age_attr =
__ATTR_RO_MODE(age, 0400);
+static struct kobj_attribute damon_sysfs_scheme_region_sz_filter_passed_attr =
+ __ATTR_RO_MODE(sz_filter_passed, 0400);
+
static struct attribute *damon_sysfs_scheme_region_attrs[] = {
&damon_sysfs_scheme_region_start_attr.attr,
&damon_sysfs_scheme_region_end_attr.attr,
&damon_sysfs_scheme_region_nr_accesses_attr.attr,
&damon_sysfs_scheme_region_age_attr.attr,
+ &damon_sysfs_scheme_region_sz_filter_passed_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_scheme_region);
@@ -114,55 +128,11 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = {
* scheme regions directory
*/
-/*
- * enum damos_sysfs_regions_upd_status - Represent DAMOS tried regions update
- * status
- * @DAMOS_TRIED_REGIONS_UPD_IDLE: Waiting for next request.
- * @DAMOS_TRIED_REGIONS_UPD_STARTED: Update started.
- * @DAMOS_TRIED_REGIONS_UPD_FINISHED: Update finished.
- *
- * Each DAMON-based operation scheme (&struct damos) has its own apply
- * interval, and we need to expose the scheme tried regions based on only
- * single snapshot. For this, we keep the tried regions update status for each
- * scheme. The status becomes 'idle' at the beginning.
- *
- * Once the tried regions update request is received, the request handling
- * start function (damon_sysfs_scheme_update_regions_start()) sets the status
- * of all schemes as 'idle' again, and register ->before_damos_apply()
- * callback.
- *
- * Then, the first followup ->before_damos_apply() callback
- * (damon_sysfs_before_damos_apply()) sets the status 'started'. The first
- * ->after_sampling() or ->after_aggregation() callback
- * (damon_sysfs_cmd_request_callback()) after the call is called only after
- * the scheme is completely applied to the given snapshot. Hence the callback
- * knows the situation by showing 'started' status, and sets the status as
- * 'finished'. Then, damon_sysfs_before_damos_apply() understands the
- * situation by showing the 'finished' status and do nothing.
- *
- * If DAMOS is not applied to any region due to any reasons including the
- * access pattern, the watermarks, the quotas, and the filters,
- * ->before_damos_apply() will not be called back. Until the situation is
- * changed, the update will not be finished. To avoid this,
- * damon_sysfs_after_sampling() set the status as 'finished' if more than two
- * apply intervals of the scheme is passed while the state is 'idle'.
- *
- * Finally, the tried regions request handling finisher function
- * (damon_sysfs_schemes_update_regions_stop()) unregisters the callbacks.
- */
-enum damos_sysfs_regions_upd_status {
- DAMOS_TRIED_REGIONS_UPD_IDLE,
- DAMOS_TRIED_REGIONS_UPD_STARTED,
- DAMOS_TRIED_REGIONS_UPD_FINISHED,
-};
-
struct damon_sysfs_scheme_regions {
struct kobject kobj;
struct list_head regions_list;
int nr_regions;
unsigned long total_bytes;
- enum damos_sysfs_regions_upd_status upd_status;
- unsigned long upd_timeout_jiffies;
};
static struct damon_sysfs_scheme_regions *
@@ -178,7 +148,6 @@ damon_sysfs_scheme_regions_alloc(void)
INIT_LIST_HEAD(&regions->regions_list);
regions->nr_regions = 0;
regions->total_bytes = 0;
- regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE;
return regions;
}
@@ -233,6 +202,7 @@ struct damon_sysfs_stats {
unsigned long sz_tried;
unsigned long nr_applied;
unsigned long sz_applied;
+ unsigned long sz_ops_filter_passed;
unsigned long qt_exceeds;
};
@@ -277,6 +247,15 @@ static ssize_t sz_applied_show(struct kobject *kobj,
return sysfs_emit(buf, "%lu\n", stats->sz_applied);
}
+static ssize_t sz_ops_filter_passed_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->sz_ops_filter_passed);
+}
+
static ssize_t qt_exceeds_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -303,6 +282,9 @@ static struct kobj_attribute damon_sysfs_stats_nr_applied_attr =
static struct kobj_attribute damon_sysfs_stats_sz_applied_attr =
__ATTR_RO_MODE(sz_applied, 0400);
+static struct kobj_attribute damon_sysfs_stats_sz_ops_filter_passed_attr =
+ __ATTR_RO_MODE(sz_ops_filter_passed, 0400);
+
static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr =
__ATTR_RO_MODE(qt_exceeds, 0400);
@@ -311,6 +293,7 @@ static struct attribute *damon_sysfs_stats_attrs[] = {
&damon_sysfs_stats_sz_tried_attr.attr,
&damon_sysfs_stats_nr_applied_attr.attr,
&damon_sysfs_stats_sz_applied_attr.attr,
+ &damon_sysfs_stats_sz_ops_filter_passed_attr.attr,
&damon_sysfs_stats_qt_exceeds_attr.attr,
NULL,
};
@@ -326,27 +309,77 @@ static const struct kobj_type damon_sysfs_stats_ktype = {
* filter directory
*/
+/*
+ * enum damos_sysfs_filter_handle_layer - Layers handling filters of a dir.
+ */
+enum damos_sysfs_filter_handle_layer {
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_CORE,
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_OPS,
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_BOTH,
+};
+
struct damon_sysfs_scheme_filter {
struct kobject kobj;
+ enum damos_sysfs_filter_handle_layer handle_layer;
enum damos_filter_type type;
bool matching;
+ bool allow;
char *memcg_path;
struct damon_addr_range addr_range;
+ struct damon_size_range sz_range;
int target_idx;
};
-static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void)
+static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(
+ enum damos_sysfs_filter_handle_layer layer)
{
- return kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL);
+ struct damon_sysfs_scheme_filter *filter;
+
+ filter = kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL);
+ if (filter)
+ filter->handle_layer = layer;
+ return filter;
}
-/* Should match with enum damos_filter_type */
-static const char * const damon_sysfs_scheme_filter_type_strs[] = {
- "anon",
- "memcg",
- "young",
- "addr",
- "target",
+struct damos_sysfs_filter_type_name {
+ enum damos_filter_type type;
+ char *name;
+};
+
+static const struct damos_sysfs_filter_type_name
+damos_sysfs_filter_type_names[] = {
+ {
+ .type = DAMOS_FILTER_TYPE_ANON,
+ .name = "anon",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_ACTIVE,
+ .name = "active",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_MEMCG,
+ .name = "memcg",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_YOUNG,
+ .name = "young",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_HUGEPAGE_SIZE,
+ .name = "hugepage_size",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_UNMAPPED,
+ .name = "unmapped",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_ADDR,
+ .name = "addr",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_TARGET,
+ .name = "target",
+ },
};
static ssize_t type_show(struct kobject *kobj,
@@ -354,9 +387,33 @@ static ssize_t type_show(struct kobject *kobj,
{
struct damon_sysfs_scheme_filter *filter = container_of(kobj,
struct damon_sysfs_scheme_filter, kobj);
+ int i;
- return sysfs_emit(buf, "%s\n",
- damon_sysfs_scheme_filter_type_strs[filter->type]);
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_filter_type_names); i++) {
+ const struct damos_sysfs_filter_type_name *type_name;
+
+ type_name = &damos_sysfs_filter_type_names[i];
+ if (type_name->type == filter->type)
+ return sysfs_emit(buf, "%s\n", type_name->name);
+ }
+ return -EINVAL;
+}
+
+static bool damos_sysfs_scheme_filter_valid_type(
+ enum damos_sysfs_filter_handle_layer layer,
+ enum damos_filter_type type)
+{
+ switch (layer) {
+ case DAMOS_SYSFS_FILTER_HANDLE_LAYER_BOTH:
+ return true;
+ case DAMOS_SYSFS_FILTER_HANDLE_LAYER_CORE:
+ return !damos_filter_for_ops(type);
+ case DAMOS_SYSFS_FILTER_HANDLE_LAYER_OPS:
+ return damos_filter_for_ops(type);
+ default:
+ break;
+ }
+ return false;
}
static ssize_t type_store(struct kobject *kobj,
@@ -364,13 +421,19 @@ static ssize_t type_store(struct kobject *kobj,
{
struct damon_sysfs_scheme_filter *filter = container_of(kobj,
struct damon_sysfs_scheme_filter, kobj);
- enum damos_filter_type type;
ssize_t ret = -EINVAL;
+ int i;
- for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) {
- if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[
- type])) {
- filter->type = type;
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_filter_type_names); i++) {
+ const struct damos_sysfs_filter_type_name *type_name;
+
+ type_name = &damos_sysfs_filter_type_names[i];
+ if (sysfs_streq(buf, type_name->name)) {
+ if (!damos_sysfs_scheme_filter_valid_type(
+ filter->handle_layer,
+ type_name->type))
+ break;
+ filter->type = type_name->type;
ret = count;
break;
}
@@ -402,6 +465,30 @@ static ssize_t matching_store(struct kobject *kobj,
return count;
}
+static ssize_t allow_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%c\n", filter->allow ? 'Y' : 'N');
+}
+
+static ssize_t allow_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ bool allow;
+ int err = kstrtobool(buf, &allow);
+
+ if (err)
+ return err;
+
+ filter->allow = allow;
+ return count;
+}
+
static ssize_t memcg_path_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -417,12 +504,14 @@ static ssize_t memcg_path_store(struct kobject *kobj,
{
struct damon_sysfs_scheme_filter *filter = container_of(kobj,
struct damon_sysfs_scheme_filter, kobj);
- char *path = kmalloc(sizeof(*path) * (count + 1), GFP_KERNEL);
+ char *path = kmalloc_array(size_add(count, 1), sizeof(*path),
+ GFP_KERNEL);
if (!path)
return -ENOMEM;
strscpy(path, buf, count + 1);
+ kfree(filter->memcg_path);
filter->memcg_path = path;
return count;
}
@@ -465,6 +554,44 @@ static ssize_t addr_end_store(struct kobject *kobj,
return err ? err : count;
}
+static ssize_t min_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%lu\n", filter->sz_range.min);
+}
+
+static ssize_t min_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ int err = kstrtoul(buf, 0, &filter->sz_range.min);
+
+ return err ? err : count;
+}
+
+static ssize_t max_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%lu\n", filter->sz_range.max);
+}
+
+static ssize_t max_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ int err = kstrtoul(buf, 0, &filter->sz_range.max);
+
+ return err ? err : count;
+}
+
static ssize_t damon_target_idx_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -499,6 +626,9 @@ static struct kobj_attribute damon_sysfs_scheme_filter_type_attr =
static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr =
__ATTR_RW_MODE(matching, 0600);
+static struct kobj_attribute damon_sysfs_scheme_filter_allow_attr =
+ __ATTR_RW_MODE(allow, 0600);
+
static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr =
__ATTR_RW_MODE(memcg_path, 0600);
@@ -508,15 +638,24 @@ static struct kobj_attribute damon_sysfs_scheme_filter_addr_start_attr =
static struct kobj_attribute damon_sysfs_scheme_filter_addr_end_attr =
__ATTR_RW_MODE(addr_end, 0600);
+static struct kobj_attribute damon_sysfs_scheme_filter_min_attr =
+ __ATTR_RW_MODE(min, 0600);
+
+static struct kobj_attribute damon_sysfs_scheme_filter_max_attr =
+ __ATTR_RW_MODE(max, 0600);
+
static struct kobj_attribute damon_sysfs_scheme_filter_damon_target_idx_attr =
__ATTR_RW_MODE(damon_target_idx, 0600);
static struct attribute *damon_sysfs_scheme_filter_attrs[] = {
&damon_sysfs_scheme_filter_type_attr.attr,
&damon_sysfs_scheme_filter_matching_attr.attr,
+ &damon_sysfs_scheme_filter_allow_attr.attr,
&damon_sysfs_scheme_filter_memcg_path_attr.attr,
&damon_sysfs_scheme_filter_addr_start_attr.attr,
&damon_sysfs_scheme_filter_addr_end_attr.attr,
+ &damon_sysfs_scheme_filter_min_attr.attr,
+ &damon_sysfs_scheme_filter_max_attr.attr,
&damon_sysfs_scheme_filter_damon_target_idx_attr.attr,
NULL,
};
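The new 'allow' file turns each filter into an explicit allow or reject rule, and damos_new_filter() grows a matching third argument (see the DAMON_RECLAIM hunk earlier in this patch). An illustration-only sketch of the in-kernel usage for the common "skip anonymous pages" case:

static int example_reject_anon(struct damos *scheme)
{
	struct damos_filter *filter;

	/* matching == true, allow == false: anonymous pages are rejected */
	filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false);
	if (!filter)
		return -ENOMEM;
	damos_add_filter(scheme, filter);
	return 0;
}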
@@ -534,14 +673,20 @@ static const struct kobj_type damon_sysfs_scheme_filter_ktype = {
struct damon_sysfs_scheme_filters {
struct kobject kobj;
+ enum damos_sysfs_filter_handle_layer handle_layer;
struct damon_sysfs_scheme_filter **filters_arr;
int nr;
};
static struct damon_sysfs_scheme_filters *
-damon_sysfs_scheme_filters_alloc(void)
+damon_sysfs_scheme_filters_alloc(enum damos_sysfs_filter_handle_layer layer)
{
- return kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL);
+ struct damon_sysfs_scheme_filters *filters;
+
+ filters = kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL);
+ if (filters)
+ filters->handle_layer = layer;
+ return filters;
}
static void damon_sysfs_scheme_filters_rm_dirs(
@@ -574,7 +719,8 @@ static int damon_sysfs_scheme_filters_add_dirs(
filters->filters_arr = filters_arr;
for (i = 0; i < nr_filters; i++) {
- filter = damon_sysfs_scheme_filter_alloc();
+ filter = damon_sysfs_scheme_filter_alloc(
+ filters->handle_layer);
if (!filter) {
damon_sysfs_scheme_filters_rm_dirs(filters);
return -ENOMEM;
@@ -678,10 +824,21 @@ static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc(
return watermarks;
}
-/* Should match with enum damos_wmark_metric */
-static const char * const damon_sysfs_wmark_metric_strs[] = {
- "none",
- "free_mem_rate",
+struct damos_sysfs_wmark_metric_name {
+ enum damos_wmark_metric metric;
+ char *name;
+};
+
+static const struct damos_sysfs_wmark_metric_name
+damos_sysfs_wmark_metric_names[] = {
+ {
+ .metric = DAMOS_WMARK_NONE,
+ .name = "none",
+ },
+ {
+ .metric = DAMOS_WMARK_FREE_MEM_RATE,
+ .name = "free_mem_rate",
+ },
};
static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -689,9 +846,16 @@ static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_watermarks *watermarks = container_of(kobj,
struct damon_sysfs_watermarks, kobj);
+ int i;
- return sysfs_emit(buf, "%s\n",
- damon_sysfs_wmark_metric_strs[watermarks->metric]);
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_wmark_metric_names); i++) {
+ const struct damos_sysfs_wmark_metric_name *metric_name;
+
+ metric_name = &damos_sysfs_wmark_metric_names[i];
+ if (metric_name->metric == watermarks->metric)
+ return sysfs_emit(buf, "%s\n", metric_name->name);
+ }
+ return -EINVAL;
}
static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -699,11 +863,14 @@ static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_watermarks *watermarks = container_of(kobj,
struct damon_sysfs_watermarks, kobj);
- enum damos_wmark_metric metric;
+ int i;
- for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) {
- if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) {
- watermarks->metric = metric;
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_wmark_metric_names); i++) {
+ const struct damos_sysfs_wmark_metric_name *metric_name;
+
+ metric_name = &damos_sysfs_wmark_metric_names[i];
+ if (sysfs_streq(buf, metric_name->name)) {
+ watermarks->metric = metric_name->metric;
return count;
}
}
@@ -831,12 +998,7 @@ struct damos_sysfs_quota_goal {
enum damos_quota_goal_metric metric;
unsigned long target_value;
unsigned long current_value;
-};
-
-/* This should match with enum damos_action */
-static const char * const damos_sysfs_quota_goal_metric_strs[] = {
- "user_input",
- "some_mem_psi_us",
+ int nid;
};
static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void)
@@ -844,14 +1006,46 @@ static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void)
return kzalloc(sizeof(struct damos_sysfs_quota_goal), GFP_KERNEL);
}
+struct damos_sysfs_qgoal_metric_name {
+ enum damos_quota_goal_metric metric;
+ char *name;
+};
+
+static
+struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = {
+ {
+ .metric = DAMOS_QUOTA_USER_INPUT,
+ .name = "user_input",
+ },
+ {
+ .metric = DAMOS_QUOTA_SOME_MEM_PSI_US,
+ .name = "some_mem_psi_us",
+ },
+ {
+ .metric = DAMOS_QUOTA_NODE_MEM_USED_BP,
+ .name = "node_mem_used_bp",
+ },
+ {
+ .metric = DAMOS_QUOTA_NODE_MEM_FREE_BP,
+ .name = "node_mem_free_bp",
+ },
+};
+
static ssize_t target_metric_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct damos_sysfs_quota_goal *goal = container_of(kobj,
struct damos_sysfs_quota_goal, kobj);
+ int i;
- return sysfs_emit(buf, "%s\n",
- damos_sysfs_quota_goal_metric_strs[goal->metric]);
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_qgoal_metric_names); i++) {
+ struct damos_sysfs_qgoal_metric_name *metric_name;
+
+ metric_name = &damos_sysfs_qgoal_metric_names[i];
+ if (metric_name->metric == goal->metric)
+ return sysfs_emit(buf, "%s\n", metric_name->name);
+ }
+ return -EINVAL;
}
static ssize_t target_metric_store(struct kobject *kobj,
@@ -859,11 +1053,14 @@ static ssize_t target_metric_store(struct kobject *kobj,
{
struct damos_sysfs_quota_goal *goal = container_of(kobj,
struct damos_sysfs_quota_goal, kobj);
- enum damos_quota_goal_metric m;
+ int i;
- for (m = 0; m < NR_DAMOS_QUOTA_GOAL_METRICS; m++) {
- if (sysfs_streq(buf, damos_sysfs_quota_goal_metric_strs[m])) {
- goal->metric = m;
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_qgoal_metric_names); i++) {
+ struct damos_sysfs_qgoal_metric_name *metric_name;
+
+ metric_name = &damos_sysfs_qgoal_metric_names[i];
+ if (sysfs_streq(buf, metric_name->name)) {
+ goal->metric = metric_name->metric;
return count;
}
}
@@ -909,6 +1106,28 @@ static ssize_t current_value_store(struct kobject *kobj,
return err ? err : count;
}
+static ssize_t nid_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damos_sysfs_quota_goal *goal = container_of(kobj, struct
+ damos_sysfs_quota_goal, kobj);
+
+ /* todo: return error if the goal is not using nid */
+
+ return sysfs_emit(buf, "%d\n", goal->nid);
+}
+
+static ssize_t nid_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damos_sysfs_quota_goal *goal = container_of(kobj, struct
+ damos_sysfs_quota_goal, kobj);
+ int err = kstrtoint(buf, 0, &goal->nid);
+
+ /* feed callback should check existence of this file and read value */
+ return err ? err : count;
+}
+
static void damos_sysfs_quota_goal_release(struct kobject *kobj)
{
/* or, notify this release to the feed callback */
@@ -924,10 +1143,14 @@ static struct kobj_attribute damos_sysfs_quota_goal_target_value_attr =
static struct kobj_attribute damos_sysfs_quota_goal_current_value_attr =
__ATTR_RW_MODE(current_value, 0600);
+static struct kobj_attribute damos_sysfs_quota_goal_nid_attr =
+ __ATTR_RW_MODE(nid, 0600);
+
static struct attribute *damos_sysfs_quota_goal_attrs[] = {
&damos_sysfs_quota_goal_target_metric_attr.attr,
&damos_sysfs_quota_goal_target_value_attr.attr,
&damos_sysfs_quota_goal_current_value_attr.attr,
+ &damos_sysfs_quota_goal_nid_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damos_sysfs_quota_goal);
@@ -1367,7 +1590,7 @@ static int damon_sysfs_access_pattern_add_range_dir(
if (!range)
return -ENOMEM;
err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype,
- &access_pattern->kobj, name);
+ &access_pattern->kobj, "%s", name);
if (err)
kobject_put(&range->kobj);
else
@@ -1433,6 +1656,204 @@ static const struct kobj_type damon_sysfs_access_pattern_ktype = {
};
/*
+ * dest (action destination) directory
+ */
+
+struct damos_sysfs_dest {
+ struct kobject kobj;
+ unsigned int id;
+ unsigned int weight;
+};
+
+static struct damos_sysfs_dest *damos_sysfs_dest_alloc(void)
+{
+ return kzalloc(sizeof(struct damos_sysfs_dest), GFP_KERNEL);
+}
+
+static ssize_t id_show(
+ struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ struct damos_sysfs_dest *dest = container_of(kobj,
+ struct damos_sysfs_dest, kobj);
+
+ return sysfs_emit(buf, "%u\n", dest->id);
+}
+
+static ssize_t id_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damos_sysfs_dest *dest = container_of(kobj,
+ struct damos_sysfs_dest, kobj);
+ int err = kstrtouint(buf, 0, &dest->id);
+
+ return err ? err : count;
+}
+
+static ssize_t weight_show(
+ struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ struct damos_sysfs_dest *dest = container_of(kobj,
+ struct damos_sysfs_dest, kobj);
+
+ return sysfs_emit(buf, "%u\n", dest->weight);
+}
+
+static ssize_t weight_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damos_sysfs_dest *dest = container_of(kobj,
+ struct damos_sysfs_dest, kobj);
+ int err = kstrtouint(buf, 0, &dest->weight);
+
+ return err ? err : count;
+}
+
+static void damos_sysfs_dest_release(struct kobject *kobj)
+{
+ struct damos_sysfs_dest *dest = container_of(kobj,
+ struct damos_sysfs_dest, kobj);
+ kfree(dest);
+}
+
+static struct kobj_attribute damos_sysfs_dest_id_attr =
+ __ATTR_RW_MODE(id, 0600);
+
+static struct kobj_attribute damos_sysfs_dest_weight_attr =
+ __ATTR_RW_MODE(weight, 0600);
+
+static struct attribute *damos_sysfs_dest_attrs[] = {
+ &damos_sysfs_dest_id_attr.attr,
+ &damos_sysfs_dest_weight_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damos_sysfs_dest);
+
+static const struct kobj_type damos_sysfs_dest_ktype = {
+ .release = damos_sysfs_dest_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damos_sysfs_dest_groups,
+};
+
+/*
+ * dests (action destinations) directory
+ */
+
+struct damos_sysfs_dests {
+ struct kobject kobj;
+ struct damos_sysfs_dest **dests_arr;
+ int nr;
+};
+
+static struct damos_sysfs_dests *
+damos_sysfs_dests_alloc(void)
+{
+ return kzalloc(sizeof(struct damos_sysfs_dests), GFP_KERNEL);
+}
+
+static void damos_sysfs_dests_rm_dirs(
+ struct damos_sysfs_dests *dests)
+{
+ struct damos_sysfs_dest **dests_arr = dests->dests_arr;
+ int i;
+
+ for (i = 0; i < dests->nr; i++)
+ kobject_put(&dests_arr[i]->kobj);
+ dests->nr = 0;
+ kfree(dests_arr);
+ dests->dests_arr = NULL;
+}
+
+static int damos_sysfs_dests_add_dirs(
+ struct damos_sysfs_dests *dests, int nr_dests)
+{
+ struct damos_sysfs_dest **dests_arr, *dest;
+ int err, i;
+
+ damos_sysfs_dests_rm_dirs(dests);
+ if (!nr_dests)
+ return 0;
+
+ dests_arr = kmalloc_array(nr_dests, sizeof(*dests_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!dests_arr)
+ return -ENOMEM;
+ dests->dests_arr = dests_arr;
+
+ for (i = 0; i < nr_dests; i++) {
+ dest = damos_sysfs_dest_alloc();
+ if (!dest) {
+ damos_sysfs_dests_rm_dirs(dests);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&dest->kobj,
+ &damos_sysfs_dest_ktype,
+ &dests->kobj, "%d", i);
+ if (err) {
+ kobject_put(&dest->kobj);
+ damos_sysfs_dests_rm_dirs(dests);
+ return err;
+ }
+
+ dests_arr[i] = dest;
+ dests->nr++;
+ }
+ return 0;
+}
+
+static ssize_t nr_dests_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damos_sysfs_dests *dests = container_of(kobj,
+ struct damos_sysfs_dests, kobj);
+
+ return sysfs_emit(buf, "%d\n", dests->nr);
+}
+
+static ssize_t nr_dests_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damos_sysfs_dests *dests;
+ int nr, err = kstrtoint(buf, 0, &nr);
+
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ dests = container_of(kobj, struct damos_sysfs_dests, kobj);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damos_sysfs_dests_add_dirs(dests, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damos_sysfs_dests_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damos_sysfs_dests, kobj));
+}
+
+static struct kobj_attribute damos_sysfs_dests_nr_attr =
+ __ATTR_RW_MODE(nr_dests, 0600);
+
+static struct attribute *damos_sysfs_dests_attrs[] = {
+ &damos_sysfs_dests_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damos_sysfs_dests);
+
+static const struct kobj_type damos_sysfs_dests_ktype = {
+ .release = damos_sysfs_dests_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damos_sysfs_dests_groups,
+};
+
+/*
* scheme directory
*/
@@ -1443,24 +1864,61 @@ struct damon_sysfs_scheme {
unsigned long apply_interval_us;
struct damon_sysfs_quotas *quotas;
struct damon_sysfs_watermarks *watermarks;
+ struct damon_sysfs_scheme_filters *core_filters;
+ struct damon_sysfs_scheme_filters *ops_filters;
struct damon_sysfs_scheme_filters *filters;
struct damon_sysfs_stats *stats;
struct damon_sysfs_scheme_regions *tried_regions;
int target_nid;
+ struct damos_sysfs_dests *dests;
+};
+
+struct damos_sysfs_action_name {
+ enum damos_action action;
+ char *name;
};
-/* This should match with enum damos_action */
-static const char * const damon_sysfs_damos_action_strs[] = {
- "willneed",
- "cold",
- "pageout",
- "hugepage",
- "nohugepage",
- "lru_prio",
- "lru_deprio",
- "migrate_hot",
- "migrate_cold",
- "stat",
+static struct damos_sysfs_action_name damos_sysfs_action_names[] = {
+ {
+ .action = DAMOS_WILLNEED,
+ .name = "willneed",
+ },
+ {
+ .action = DAMOS_COLD,
+ .name = "cold",
+ },
+ {
+ .action = DAMOS_PAGEOUT,
+ .name = "pageout",
+ },
+ {
+ .action = DAMOS_HUGEPAGE,
+ .name = "hugepage",
+ },
+ {
+ .action = DAMOS_NOHUGEPAGE,
+ .name = "nohugepage",
+ },
+ {
+ .action = DAMOS_LRU_PRIO,
+ .name = "lru_prio",
+ },
+ {
+ .action = DAMOS_LRU_DEPRIO,
+ .name = "lru_deprio",
+ },
+ {
+ .action = DAMOS_MIGRATE_HOT,
+ .name = "migrate_hot",
+ },
+ {
+ .action = DAMOS_MIGRATE_COLD,
+ .name = "migrate_cold",
+ },
+ {
+ .action = DAMOS_STAT,
+ .name = "stat",
+ },
};
static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc(
@@ -1503,6 +1961,22 @@ out:
return err;
}
+static int damos_sysfs_set_dests(struct damon_sysfs_scheme *scheme)
+{
+ struct damos_sysfs_dests *dests = damos_sysfs_dests_alloc();
+ int err;
+
+ if (!dests)
+ return -ENOMEM;
+ err = kobject_init_and_add(&dests->kobj, &damos_sysfs_dests_ktype,
+ &scheme->kobj, "dests");
+ if (err)
+ kobject_put(&dests->kobj);
+ else
+ scheme->dests = dests;
+ return err;
+}
+
static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme)
{
struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc();
@@ -1543,21 +2017,53 @@ static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme)
return err;
}
-static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme)
+static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme,
+ enum damos_sysfs_filter_handle_layer layer, const char *name,
+ struct damon_sysfs_scheme_filters **filters_ptr)
{
struct damon_sysfs_scheme_filters *filters =
- damon_sysfs_scheme_filters_alloc();
+ damon_sysfs_scheme_filters_alloc(layer);
int err;
if (!filters)
return -ENOMEM;
err = kobject_init_and_add(&filters->kobj,
&damon_sysfs_scheme_filters_ktype, &scheme->kobj,
- "filters");
+ "%s", name);
if (err)
kobject_put(&filters->kobj);
else
- scheme->filters = filters;
+ *filters_ptr = filters;
+ return err;
+}
+
+static int damos_sysfs_set_filter_dirs(struct damon_sysfs_scheme *scheme)
+{
+ int err;
+
+ err = damon_sysfs_scheme_set_filters(scheme,
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_BOTH, "filters",
+ &scheme->filters);
+ if (err)
+ return err;
+ err = damon_sysfs_scheme_set_filters(scheme,
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_CORE, "core_filters",
+ &scheme->core_filters);
+ if (err)
+ goto put_filters_out;
+ err = damon_sysfs_scheme_set_filters(scheme,
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_OPS, "ops_filters",
+ &scheme->ops_filters);
+ if (err)
+ goto put_core_filters_out;
+ return 0;
+
+put_core_filters_out:
+ kobject_put(&scheme->core_filters->kobj);
+ scheme->core_filters = NULL;
+put_filters_out:
+ kobject_put(&scheme->filters->kobj);
+ scheme->filters = NULL;
return err;
}
@@ -1603,13 +2109,16 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
err = damon_sysfs_scheme_set_access_pattern(scheme);
if (err)
return err;
- err = damon_sysfs_scheme_set_quotas(scheme);
+ err = damos_sysfs_set_dests(scheme);
if (err)
goto put_access_pattern_out;
+ err = damon_sysfs_scheme_set_quotas(scheme);
+ if (err)
+ goto put_dests_out;
err = damon_sysfs_scheme_set_watermarks(scheme);
if (err)
goto put_quotas_access_pattern_out;
- err = damon_sysfs_scheme_set_filters(scheme);
+ err = damos_sysfs_set_filter_dirs(scheme);
if (err)
goto put_watermarks_quotas_access_pattern_out;
err = damon_sysfs_scheme_set_stats(scheme);
@@ -1624,6 +2133,10 @@ put_tried_regions_out:
kobject_put(&scheme->tried_regions->kobj);
scheme->tried_regions = NULL;
put_filters_watermarks_quotas_access_pattern_out:
+ kobject_put(&scheme->ops_filters->kobj);
+ scheme->ops_filters = NULL;
+ kobject_put(&scheme->core_filters->kobj);
+ scheme->core_filters = NULL;
kobject_put(&scheme->filters->kobj);
scheme->filters = NULL;
put_watermarks_quotas_access_pattern_out:
@@ -1632,6 +2145,9 @@ put_watermarks_quotas_access_pattern_out:
put_quotas_access_pattern_out:
kobject_put(&scheme->quotas->kobj);
scheme->quotas = NULL;
+put_dests_out:
+ kobject_put(&scheme->dests->kobj);
+ scheme->dests = NULL;
put_access_pattern_out:
kobject_put(&scheme->access_pattern->kobj);
scheme->access_pattern = NULL;
@@ -1642,11 +2158,17 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme)
{
damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern);
kobject_put(&scheme->access_pattern->kobj);
+ damos_sysfs_dests_rm_dirs(scheme->dests);
+ kobject_put(&scheme->dests->kobj);
damon_sysfs_quotas_rm_dirs(scheme->quotas);
kobject_put(&scheme->quotas->kobj);
kobject_put(&scheme->watermarks->kobj);
damon_sysfs_scheme_filters_rm_dirs(scheme->filters);
kobject_put(&scheme->filters->kobj);
+ damon_sysfs_scheme_filters_rm_dirs(scheme->core_filters);
+ kobject_put(&scheme->core_filters->kobj);
+ damon_sysfs_scheme_filters_rm_dirs(scheme->ops_filters);
+ kobject_put(&scheme->ops_filters->kobj);
kobject_put(&scheme->stats->kobj);
damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions);
kobject_put(&scheme->tried_regions->kobj);
@@ -1657,9 +2179,16 @@ static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_scheme *scheme = container_of(kobj,
struct damon_sysfs_scheme, kobj);
+ int i;
- return sysfs_emit(buf, "%s\n",
- damon_sysfs_damos_action_strs[scheme->action]);
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_action_names); i++) {
+ struct damos_sysfs_action_name *action_name;
+
+ action_name = &damos_sysfs_action_names[i];
+ if (action_name->action == scheme->action)
+ return sysfs_emit(buf, "%s\n", action_name->name);
+ }
+ return -EINVAL;
}
static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1667,11 +2196,14 @@ static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_scheme *scheme = container_of(kobj,
struct damon_sysfs_scheme, kobj);
- enum damos_action action;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_action_names); i++) {
+ struct damos_sysfs_action_name *action_name;
- for (action = 0; action < NR_DAMOS_ACTIONS; action++) {
- if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) {
- scheme->action = action;
+ action_name = &damos_sysfs_action_names[i];
+ if (sysfs_streq(buf, action_name->name)) {
+ scheme->action = action_name->action;
return count;
}
}
@@ -1888,7 +2420,7 @@ static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id)
if (!memcg_path)
return -EINVAL;
- path = kmalloc(sizeof(*path) * PATH_MAX, GFP_KERNEL);
+ path = kmalloc_array(PATH_MAX, sizeof(*path), GFP_KERNEL);
if (!path)
return -ENOMEM;
@@ -1918,7 +2450,8 @@ static int damon_sysfs_add_scheme_filters(struct damos *scheme,
sysfs_filters->filters_arr[i];
struct damos_filter *filter =
damos_new_filter(sysfs_filter->type,
- sysfs_filter->matching);
+ sysfs_filter->matching,
+ sysfs_filter->allow);
int err;
if (!filter)
@@ -1940,6 +2473,13 @@ static int damon_sysfs_add_scheme_filters(struct damos *scheme,
filter->addr_range = sysfs_filter->addr_range;
} else if (filter->type == DAMOS_FILTER_TYPE_TARGET) {
filter->target_idx = sysfs_filter->target_idx;
+ } else if (filter->type == DAMOS_FILTER_TYPE_HUGEPAGE_SIZE) {
+ if (sysfs_filter->sz_range.min >
+ sysfs_filter->sz_range.max) {
+ damos_destroy_filter(filter);
+ return -EINVAL;
+ }
+ filter->sz_range = sysfs_filter->sz_range;
}
damos_add_filter(scheme, filter);
@@ -1965,8 +2505,17 @@ static int damos_sysfs_add_quota_score(
sysfs_goal->target_value);
if (!goal)
return -ENOMEM;
- if (sysfs_goal->metric == DAMOS_QUOTA_USER_INPUT)
+ switch (sysfs_goal->metric) {
+ case DAMOS_QUOTA_USER_INPUT:
goal->current_value = sysfs_goal->current_value;
+ break;
+ case DAMOS_QUOTA_NODE_MEM_USED_BP:
+ case DAMOS_QUOTA_NODE_MEM_FREE_BP:
+ goal->nid = sysfs_goal->nid;
+ break;
+ default:
+ break;
+ }
damos_add_quota_goal(quota, goal);
}
return 0;
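The two node memory goal metrics take their target node through the 'nid' file added above; how the core computes their current value is not part of this hunk. Presumably it is the node's used (or free) memory expressed in basis points of its total, along the lines of this sketch (the function name is illustrative; si_meminfo_node() is the stock helper):

static unsigned long example_node_mem_used_bp(int nid)
{
	struct sysinfo i;

	si_meminfo_node(&i, nid);
	if (!i.totalram)
		return 0;
	return (i.totalram - i.freeram) * 10000 / i.totalram;
}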
@@ -2027,6 +2576,29 @@ void damos_sysfs_update_effective_quotas(
}
}
+static int damos_sysfs_add_migrate_dest(struct damos *scheme,
+ struct damos_sysfs_dests *sysfs_dests)
+{
+ struct damos_migrate_dests *dests = &scheme->migrate_dests;
+ int i;
+
+ dests->node_id_arr = kmalloc_array(sysfs_dests->nr,
+ sizeof(*dests->node_id_arr), GFP_KERNEL);
+ if (!dests->node_id_arr)
+ return -ENOMEM;
+ dests->weight_arr = kmalloc_array(sysfs_dests->nr,
+ sizeof(*dests->weight_arr), GFP_KERNEL);
+ if (!dests->weight_arr)
+ /* ->node_id_arr will be freed by scheme destruction */
+ return -ENOMEM;
+ for (i = 0; i < sysfs_dests->nr; i++) {
+ dests->node_id_arr[i] = sysfs_dests->dests_arr[i]->id;
+ dests->weight_arr[i] = sysfs_dests->dests_arr[i]->weight;
+ }
+ dests->nr_dests = sysfs_dests->nr;
+ return 0;
+}
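damos_sysfs_add_migrate_dest() above only copies the user-provided node ids and weights into struct damos_migrate_dests; how an action implementation consumes them is outside this patch. One plausible reading of the weights, sketched with illustrative names (only the damos_migrate_dests fields come from the patch), is weight-proportional destination selection:

static int example_pick_dest_nid(struct damos_migrate_dests *dests,
		unsigned long counter)
{
	unsigned int total_weight = 0, i;

	for (i = 0; i < dests->nr_dests; i++)
		total_weight += dests->weight_arr[i];
	if (!total_weight)
		return NUMA_NO_NODE;
	counter %= total_weight;
	for (i = 0; i < dests->nr_dests; i++) {
		if (counter < dests->weight_arr[i])
			return dests->node_id_arr[i];
		counter -= dests->weight_arr[i];
	}
	return NUMA_NO_NODE;
}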
+
static struct damos *damon_sysfs_mk_scheme(
struct damon_sysfs_scheme *sysfs_scheme)
{
@@ -2035,8 +2607,6 @@ static struct damos *damon_sysfs_mk_scheme(
struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
- struct damon_sysfs_scheme_filters *sysfs_filters =
- sysfs_scheme->filters;
struct damos *scheme;
int err;
@@ -2076,7 +2646,22 @@ static struct damos *damon_sysfs_mk_scheme(
return NULL;
}
- err = damon_sysfs_add_scheme_filters(scheme, sysfs_filters);
+ err = damon_sysfs_add_scheme_filters(scheme, sysfs_scheme->core_filters);
+ if (err) {
+ damon_destroy_scheme(scheme);
+ return NULL;
+ }
+ err = damon_sysfs_add_scheme_filters(scheme, sysfs_scheme->ops_filters);
+ if (err) {
+ damon_destroy_scheme(scheme);
+ return NULL;
+ }
+ err = damon_sysfs_add_scheme_filters(scheme, sysfs_scheme->filters);
+ if (err) {
+ damon_destroy_scheme(scheme);
+ return NULL;
+ }
+ err = damos_sysfs_add_migrate_dest(scheme, sysfs_scheme->dests);
if (err) {
damon_destroy_scheme(scheme);
return NULL;
@@ -2122,32 +2707,32 @@ void damon_sysfs_schemes_update_stats(
sysfs_stats->sz_tried = scheme->stat.sz_tried;
sysfs_stats->nr_applied = scheme->stat.nr_applied;
sysfs_stats->sz_applied = scheme->stat.sz_applied;
+ sysfs_stats->sz_ops_filter_passed =
+ scheme->stat.sz_ops_filter_passed;
sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds;
}
}
-/*
- * damon_sysfs_schemes that need to update its schemes regions dir. Protected
- * by damon_sysfs_lock
- */
-static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback;
-static int damon_sysfs_schemes_region_idx;
-static bool damos_regions_upd_total_bytes_only;
-
-/*
- * DAMON callback that called before damos apply. While this callback is
- * registered, damon_sysfs_lock should be held to ensure the regions
- * directories exist.
+/**
+ * damos_sysfs_populate_region_dir() - Populate a schemes tried region dir.
+ * @sysfs_schemes:	Schemes directory to populate the region directories for.
+ * @ctx: Corresponding DAMON context.
+ * @t: DAMON target of @r.
+ * @r: DAMON region to populate the directory for.
+ * @s: Corresponding scheme.
+ * @total_bytes_only: Whether the request is for bytes update only.
+ * @sz_filter_passed: Bytes of @r that passed filters of @s.
+ *
+ * Called from a DAMOS walk callback while holding damon_sysfs_lock.
*/
-static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
- struct damon_target *t, struct damon_region *r,
- struct damos *s)
+void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx, struct damon_target *t,
+ struct damon_region *r, struct damos *s, bool total_bytes_only,
+ unsigned long sz_filter_passed)
{
struct damos *scheme;
struct damon_sysfs_scheme_regions *sysfs_regions;
struct damon_sysfs_scheme_region *region;
- struct damon_sysfs_schemes *sysfs_schemes =
- damon_sysfs_schemes_for_damos_callback;
int schemes_idx = 0;
damon_for_each_scheme(scheme, ctx) {
@@ -2158,152 +2743,39 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
/* user could have removed the scheme sysfs dir */
if (schemes_idx >= sysfs_schemes->nr)
- return 0;
+ return;
sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions;
- if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_FINISHED)
- return 0;
- if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_IDLE)
- sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_STARTED;
sysfs_regions->total_bytes += r->ar.end - r->ar.start;
- if (damos_regions_upd_total_bytes_only)
- return 0;
+ if (total_bytes_only)
+ return;
region = damon_sysfs_scheme_region_alloc(r);
if (!region)
- return 0;
+ return;
+ region->sz_filter_passed = sz_filter_passed;
list_add_tail(&region->list, &sysfs_regions->regions_list);
sysfs_regions->nr_regions++;
if (kobject_init_and_add(&region->kobj,
&damon_sysfs_scheme_region_ktype,
&sysfs_regions->kobj, "%d",
- damon_sysfs_schemes_region_idx++)) {
+ sysfs_regions->nr_regions++)) {
kobject_put(&region->kobj);
}
- return 0;
}
-/*
- * DAMON callback that called after each accesses sampling. While this
- * callback is registered, damon_sysfs_lock should be held to ensure the
- * regions directories exist.
- */
-void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx)
+int damon_sysfs_schemes_clear_regions(
+ struct damon_sysfs_schemes *sysfs_schemes)
{
- struct damon_sysfs_schemes *sysfs_schemes =
- damon_sysfs_schemes_for_damos_callback;
- struct damon_sysfs_scheme_regions *sysfs_regions;
int i;
for (i = 0; i < sysfs_schemes->nr; i++) {
- sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions;
- if (sysfs_regions->upd_status ==
- DAMOS_TRIED_REGIONS_UPD_STARTED ||
- time_after(jiffies,
- sysfs_regions->upd_timeout_jiffies))
- sysfs_regions->upd_status =
- DAMOS_TRIED_REGIONS_UPD_FINISHED;
- }
-}
-
-/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
-int damon_sysfs_schemes_clear_regions(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx)
-{
- struct damos *scheme;
- int schemes_idx = 0;
-
- damon_for_each_scheme(scheme, ctx) {
struct damon_sysfs_scheme *sysfs_scheme;
- /* user could have removed the scheme sysfs dir */
- if (schemes_idx >= sysfs_schemes->nr)
- break;
-
- sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++];
+ sysfs_scheme = sysfs_schemes->schemes_arr[i];
damon_sysfs_scheme_regions_rm_dirs(
sysfs_scheme->tried_regions);
sysfs_scheme->tried_regions->total_bytes = 0;
}
return 0;
}
-
-static struct damos *damos_sysfs_nth_scheme(int n, struct damon_ctx *ctx)
-{
- struct damos *scheme;
- int i = 0;
-
- damon_for_each_scheme(scheme, ctx) {
- if (i == n)
- return scheme;
- i++;
- }
- return NULL;
-}
-
-static void damos_tried_regions_init_upd_status(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx)
-{
- int i;
- struct damos *scheme;
- struct damon_sysfs_scheme_regions *sysfs_regions;
-
- for (i = 0; i < sysfs_schemes->nr; i++) {
- sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions;
- scheme = damos_sysfs_nth_scheme(i, ctx);
- if (!scheme) {
- sysfs_regions->upd_status =
- DAMOS_TRIED_REGIONS_UPD_FINISHED;
- continue;
- }
- sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE;
- sysfs_regions->upd_timeout_jiffies = jiffies +
- 2 * usecs_to_jiffies(scheme->apply_interval_us ?
- scheme->apply_interval_us :
- ctx->attrs.aggr_interval);
- }
-}
-
-/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
-int damon_sysfs_schemes_update_regions_start(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx, bool total_bytes_only)
-{
- damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx);
- damon_sysfs_schemes_for_damos_callback = sysfs_schemes;
- damos_tried_regions_init_upd_status(sysfs_schemes, ctx);
- damos_regions_upd_total_bytes_only = total_bytes_only;
- ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply;
- return 0;
-}
-
-bool damos_sysfs_regions_upd_done(void)
-{
- struct damon_sysfs_schemes *sysfs_schemes =
- damon_sysfs_schemes_for_damos_callback;
- struct damon_sysfs_scheme_regions *sysfs_regions;
- int i;
-
- for (i = 0; i < sysfs_schemes->nr; i++) {
- sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions;
- if (sysfs_regions->upd_status !=
- DAMOS_TRIED_REGIONS_UPD_FINISHED)
- return false;
- }
- return true;
-}
-
-/*
- * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller
- * should unlock damon_sysfs_lock which held before
- * damon_sysfs_schemes_update_regions_start()
- */
-int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx)
-{
- damon_sysfs_schemes_for_damos_callback = NULL;
- ctx->callback.before_damos_apply = NULL;
- damon_sysfs_schemes_region_idx = 0;
- return 0;
-}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 58145d59881d..ea2feddc4b88 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -409,6 +409,164 @@ static const struct kobj_type damon_sysfs_targets_ktype = {
};
/*
+ * intervals goal directory
+ */
+
+struct damon_sysfs_intervals_goal {
+ struct kobject kobj;
+ unsigned long access_bp;
+ unsigned long aggrs;
+ unsigned long min_sample_us;
+ unsigned long max_sample_us;
+};
+
+static struct damon_sysfs_intervals_goal *damon_sysfs_intervals_goal_alloc(
+ unsigned long access_bp, unsigned long aggrs,
+ unsigned long min_sample_us, unsigned long max_sample_us)
+{
+ struct damon_sysfs_intervals_goal *goal = kmalloc(sizeof(*goal),
+ GFP_KERNEL);
+
+ if (!goal)
+ return NULL;
+
+ goal->kobj = (struct kobject){};
+ goal->access_bp = access_bp;
+ goal->aggrs = aggrs;
+ goal->min_sample_us = min_sample_us;
+ goal->max_sample_us = max_sample_us;
+ return goal;
+}
+
+static ssize_t access_bp_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+
+ return sysfs_emit(buf, "%lu\n", goal->access_bp);
+}
+
+static ssize_t access_bp_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+ unsigned long nr;
+ int err = kstrtoul(buf, 0, &nr);
+
+ if (err)
+ return err;
+
+ goal->access_bp = nr;
+ return count;
+}
+
+static ssize_t aggrs_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+
+ return sysfs_emit(buf, "%lu\n", goal->aggrs);
+}
+
+static ssize_t aggrs_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+ unsigned long nr;
+ int err = kstrtoul(buf, 0, &nr);
+
+ if (err)
+ return err;
+
+ goal->aggrs = nr;
+ return count;
+}
+
+static ssize_t min_sample_us_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+
+ return sysfs_emit(buf, "%lu\n", goal->min_sample_us);
+}
+
+static ssize_t min_sample_us_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+ unsigned long nr;
+ int err = kstrtoul(buf, 0, &nr);
+
+ if (err)
+ return err;
+
+ goal->min_sample_us = nr;
+ return count;
+}
+
+static ssize_t max_sample_us_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+
+ return sysfs_emit(buf, "%lu\n", goal->max_sample_us);
+}
+
+static ssize_t max_sample_us_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+ unsigned long nr;
+ int err = kstrtoul(buf, 0, &nr);
+
+ if (err)
+ return err;
+
+ goal->max_sample_us = nr;
+ return count;
+}
+
+static void damon_sysfs_intervals_goal_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_intervals_goal, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_intervals_goal_access_bp_attr =
+ __ATTR_RW_MODE(access_bp, 0600);
+
+static struct kobj_attribute damon_sysfs_intervals_goal_aggrs_attr =
+ __ATTR_RW_MODE(aggrs, 0600);
+
+static struct kobj_attribute damon_sysfs_intervals_goal_min_sample_us_attr =
+ __ATTR_RW_MODE(min_sample_us, 0600);
+
+static struct kobj_attribute damon_sysfs_intervals_goal_max_sample_us_attr =
+ __ATTR_RW_MODE(max_sample_us, 0600);
+
+static struct attribute *damon_sysfs_intervals_goal_attrs[] = {
+ &damon_sysfs_intervals_goal_access_bp_attr.attr,
+ &damon_sysfs_intervals_goal_aggrs_attr.attr,
+ &damon_sysfs_intervals_goal_min_sample_us_attr.attr,
+ &damon_sysfs_intervals_goal_max_sample_us_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_intervals_goal);
+
+static const struct kobj_type damon_sysfs_intervals_goal_ktype = {
+ .release = damon_sysfs_intervals_goal_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_intervals_goal_groups,
+};
+
+/*
* intervals directory
*/
@@ -417,6 +575,7 @@ struct damon_sysfs_intervals {
unsigned long sample_us;
unsigned long aggr_us;
unsigned long update_us;
+ struct damon_sysfs_intervals_goal *intervals_goal;
};
static struct damon_sysfs_intervals *damon_sysfs_intervals_alloc(
@@ -436,6 +595,32 @@ static struct damon_sysfs_intervals *damon_sysfs_intervals_alloc(
return intervals;
}
+static int damon_sysfs_intervals_add_dirs(struct damon_sysfs_intervals *intervals)
+{
+ struct damon_sysfs_intervals_goal *goal;
+ int err;
+
+ goal = damon_sysfs_intervals_goal_alloc(0, 0, 0, 0);
+ if (!goal)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(&goal->kobj,
+ &damon_sysfs_intervals_goal_ktype, &intervals->kobj,
+ "intervals_goal");
+ if (err) {
+ kobject_put(&goal->kobj);
+ intervals->intervals_goal = NULL;
+ return err;
+ }
+ intervals->intervals_goal = goal;
+ return 0;
+}
+
+static void damon_sysfs_intervals_rm_dirs(struct damon_sysfs_intervals *intervals)
+{
+ kobject_put(&intervals->intervals_goal->kobj);
+}
+
static ssize_t sample_us_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -571,6 +756,9 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs)
"intervals");
if (err)
goto put_intervals_out;
+ err = damon_sysfs_intervals_add_dirs(intervals);
+ if (err)
+ goto put_intervals_out;
attrs->intervals = intervals;
nr_regions_range = damon_sysfs_ul_range_alloc(10, 1000);
@@ -599,6 +787,7 @@ put_intervals_out:
static void damon_sysfs_attrs_rm_dirs(struct damon_sysfs_attrs *attrs)
{
kobject_put(&attrs->nr_regions_range->kobj);
+ damon_sysfs_intervals_rm_dirs(attrs->intervals);
kobject_put(&attrs->intervals->kobj);
}
@@ -622,11 +811,24 @@ static const struct kobj_type damon_sysfs_attrs_ktype = {
* context directory
*/
-/* This should match with enum damon_ops_id */
-static const char * const damon_sysfs_ops_strs[] = {
- "vaddr",
- "fvaddr",
- "paddr",
+struct damon_sysfs_ops_name {
+ enum damon_ops_id ops_id;
+ char *name;
+};
+
+static const struct damon_sysfs_ops_name damon_sysfs_ops_names[] = {
+ {
+ .ops_id = DAMON_OPS_VADDR,
+ .name = "vaddr",
+ },
+ {
+ .ops_id = DAMON_OPS_FVADDR,
+ .name = "fvaddr",
+ },
+ {
+ .ops_id = DAMON_OPS_PADDR,
+ .name = "paddr",
+ },
};
struct damon_sysfs_context {
@@ -745,14 +947,16 @@ static void damon_sysfs_context_rm_dirs(struct damon_sysfs_context *context)
static ssize_t avail_operations_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- enum damon_ops_id id;
int len = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(damon_sysfs_ops_names); i++) {
+ const struct damon_sysfs_ops_name *ops_name;
- for (id = 0; id < NR_DAMON_OPS; id++) {
- if (!damon_is_registered_ops(id))
+ ops_name = &damon_sysfs_ops_names[i];
+ if (!damon_is_registered_ops(ops_name->ops_id))
continue;
- len += sysfs_emit_at(buf, len, "%s\n",
- damon_sysfs_ops_strs[id]);
+ len += sysfs_emit_at(buf, len, "%s\n", ops_name->name);
}
return len;
}
@@ -762,8 +966,16 @@ static ssize_t operations_show(struct kobject *kobj,
{
struct damon_sysfs_context *context = container_of(kobj,
struct damon_sysfs_context, kobj);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(damon_sysfs_ops_names); i++) {
+ const struct damon_sysfs_ops_name *ops_name;
- return sysfs_emit(buf, "%s\n", damon_sysfs_ops_strs[context->ops_id]);
+ ops_name = &damon_sysfs_ops_names[i];
+ if (ops_name->ops_id == context->ops_id)
+ return sysfs_emit(buf, "%s\n", ops_name->name);
+ }
+ return -EINVAL;
}
static ssize_t operations_store(struct kobject *kobj,
@@ -771,11 +983,14 @@ static ssize_t operations_store(struct kobject *kobj,
{
struct damon_sysfs_context *context = container_of(kobj,
struct damon_sysfs_context, kobj);
- enum damon_ops_id id;
+ int i;
- for (id = 0; id < NR_DAMON_OPS; id++) {
- if (sysfs_streq(buf, damon_sysfs_ops_strs[id])) {
- context->ops_id = id;
+ for (i = 0; i < ARRAY_SIZE(damon_sysfs_ops_names); i++) {
+ const struct damon_sysfs_ops_name *ops_name;
+
+ ops_name = &damon_sysfs_ops_names[i];
+ if (sysfs_streq(buf, ops_name->name)) {
+ context->ops_id = ops_name->ops_id;
return count;
}
}
@@ -940,6 +1155,7 @@ struct damon_sysfs_kdamond {
struct kobject kobj;
struct damon_sysfs_contexts *contexts;
struct damon_ctx *damon_ctx;
+ unsigned int refresh_ms;
};
static struct damon_sysfs_kdamond *damon_sysfs_kdamond_alloc(void)
@@ -974,16 +1190,6 @@ static void damon_sysfs_kdamond_rm_dirs(struct damon_sysfs_kdamond *kdamond)
kobject_put(&kdamond->contexts->kobj);
}
-static bool damon_sysfs_ctx_running(struct damon_ctx *ctx)
-{
- bool running;
-
- mutex_lock(&ctx->kdamond_lock);
- running = ctx->kdamond != NULL;
- mutex_unlock(&ctx->kdamond_lock);
- return running;
-}
-
/*
* enum damon_sysfs_cmd - Commands for a specific kdamond.
*/
@@ -1025,6 +1231,11 @@ enum damon_sysfs_cmd {
*/
DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS,
/*
+ * @DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS: Update the tuned monitoring
+ * intervals.
+ */
+ DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS,
+ /*
* @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands.
*/
NR_DAMON_SYSFS_CMDS,
@@ -1041,39 +1252,25 @@ static const char * const damon_sysfs_cmd_strs[] = {
"update_schemes_tried_regions",
"clear_schemes_tried_regions",
"update_schemes_effective_quotas",
+ "update_tuned_intervals",
};
-/*
- * struct damon_sysfs_cmd_request - A request to the DAMON callback.
- * @cmd: The command that needs to be handled by the callback.
- * @kdamond: The kobject wrapper that associated to the kdamond thread.
- *
- * This structure represents a sysfs command request that need to access some
- * DAMON context-internal data. Because DAMON context-internal data can be
- * safely accessed from DAMON callbacks without additional synchronization, the
- * request will be handled by the DAMON callback. None-``NULL`` @kdamond means
- * the request is valid.
- */
-struct damon_sysfs_cmd_request {
- enum damon_sysfs_cmd cmd;
- struct damon_sysfs_kdamond *kdamond;
-};
-
-/* Current DAMON callback request. Protected by damon_sysfs_lock. */
-static struct damon_sysfs_cmd_request damon_sysfs_cmd_request;
-
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
struct damon_sysfs_kdamond *kdamond = container_of(kobj,
struct damon_sysfs_kdamond, kobj);
- struct damon_ctx *ctx = kdamond->damon_ctx;
- bool running;
+ struct damon_ctx *ctx;
+ bool running = false;
- if (!ctx)
- running = false;
- else
- running = damon_sysfs_ctx_running(ctx);
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+
+ ctx = kdamond->damon_ctx;
+ if (ctx)
+ running = damon_is_running(ctx);
+
+ mutex_unlock(&damon_sysfs_lock);
return sysfs_emit(buf, "%s\n", running ?
damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] :
@@ -1084,11 +1281,18 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
struct damon_sysfs_attrs *sys_attrs)
{
struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals;
+ struct damon_sysfs_intervals_goal *sys_goal =
+ sys_intervals->intervals_goal;
struct damon_sysfs_ul_range *sys_nr_regions =
sys_attrs->nr_regions_range;
struct damon_attrs attrs = {
.sample_interval = sys_intervals->sample_us,
.aggr_interval = sys_intervals->aggr_us,
+ .intervals_goal = {
+ .access_bp = sys_goal->access_bp,
+ .aggrs = sys_goal->aggrs,
+ .min_sample_us = sys_goal->min_sample_us,
+ .max_sample_us = sys_goal->max_sample_us},
.ops_update_interval = sys_intervals->update_us,
.min_nr_regions = sys_nr_regions->min,
.max_nr_regions = sys_nr_regions->max,
@@ -1096,18 +1300,6 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
return damon_set_attrs(ctx, &attrs);
}
-static void damon_sysfs_destroy_targets(struct damon_ctx *ctx)
-{
- struct damon_target *t, *next;
- bool has_pid = damon_target_has_pid(ctx);
-
- damon_for_each_target_safe(t, next, ctx) {
- if (has_pid)
- put_pid(t->pid);
- damon_destroy_target(t);
- }
-}
-
static int damon_sysfs_set_regions(struct damon_target *t,
struct damon_sysfs_regions *sysfs_regions)
{
@@ -1142,7 +1334,6 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
struct damon_ctx *ctx)
{
struct damon_target *t = damon_new_target();
- int err = -EINVAL;
if (!t)
return -ENOMEM;
@@ -1150,16 +1341,10 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
if (damon_target_has_pid(ctx)) {
t->pid = find_get_pid(sys_target->pid);
if (!t->pid)
- goto destroy_targets_out;
+ /* caller will destroy targets */
+ return -EINVAL;
}
- err = damon_sysfs_set_regions(t, sys_target->regions);
- if (err)
- goto destroy_targets_out;
- return 0;
-
-destroy_targets_out:
- damon_sysfs_destroy_targets(ctx);
- return err;
+ return damon_sysfs_set_regions(t, sys_target->regions);
}
static int damon_sysfs_add_targets(struct damon_ctx *ctx,
@@ -1181,95 +1366,31 @@ static int damon_sysfs_add_targets(struct damon_ctx *ctx,
return 0;
}
-static bool damon_sysfs_schemes_regions_updating;
-
-static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
-{
- struct damon_target *t, *next;
- struct damon_sysfs_kdamond *kdamond;
- enum damon_sysfs_cmd cmd;
-
- /* damon_sysfs_schemes_update_regions_stop() might not yet called */
- kdamond = damon_sysfs_cmd_request.kdamond;
- cmd = damon_sysfs_cmd_request.cmd;
- if (kdamond && ctx == kdamond->damon_ctx &&
- (cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS ||
- cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES) &&
- damon_sysfs_schemes_regions_updating) {
- damon_sysfs_schemes_update_regions_stop(ctx);
- damon_sysfs_schemes_regions_updating = false;
- mutex_unlock(&damon_sysfs_lock);
- }
-
- if (!damon_target_has_pid(ctx))
- return;
-
- mutex_lock(&ctx->kdamond_lock);
- damon_for_each_target_safe(t, next, ctx) {
- put_pid(t->pid);
- damon_destroy_target(t);
- }
- mutex_unlock(&ctx->kdamond_lock);
-}
-
/*
* damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files.
- * @kdamond: The kobject wrapper that associated to the kdamond thread.
+ * @data: The kobject wrapper that is associated with the kdamond thread.
*
* This function reads the schemes stats of specific kdamond and update the
* related values for sysfs files. This function should be called from DAMON
- * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON
- * contexts-internal data and DAMON sysfs variables.
+ * worker thread, to safely access the DAMON context-internal data. The
+ * caller should also hold ``damon_sysfs_lock`` and ensure ->damon_ctx of
+ * @data is a valid, non-NULL pointer, to safely access DAMON sysfs variables.
*/
-static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond)
+static int damon_sysfs_upd_schemes_stats(void *data)
{
+ struct damon_sysfs_kdamond *kdamond = data;
struct damon_ctx *ctx = kdamond->damon_ctx;
- if (!ctx)
- return -EINVAL;
damon_sysfs_schemes_update_stats(
kdamond->contexts->contexts_arr[0]->schemes, ctx);
return 0;
}
-static int damon_sysfs_upd_schemes_regions_start(
- struct damon_sysfs_kdamond *kdamond, bool total_bytes_only)
-{
- struct damon_ctx *ctx = kdamond->damon_ctx;
-
- if (!ctx)
- return -EINVAL;
- return damon_sysfs_schemes_update_regions_start(
- kdamond->contexts->contexts_arr[0]->schemes, ctx,
- total_bytes_only);
-}
-
-static int damon_sysfs_upd_schemes_regions_stop(
- struct damon_sysfs_kdamond *kdamond)
-{
- struct damon_ctx *ctx = kdamond->damon_ctx;
-
- if (!ctx)
- return -EINVAL;
- return damon_sysfs_schemes_update_regions_stop(ctx);
-}
-
-static int damon_sysfs_clear_schemes_regions(
- struct damon_sysfs_kdamond *kdamond)
-{
- struct damon_ctx *ctx = kdamond->damon_ctx;
-
- if (!ctx)
- return -EINVAL;
- return damon_sysfs_schemes_clear_regions(
- kdamond->contexts->contexts_arr[0]->schemes, ctx);
-}
-
static inline bool damon_sysfs_kdamond_running(
struct damon_sysfs_kdamond *kdamond)
{
return kdamond->damon_ctx &&
- damon_sysfs_ctx_running(kdamond->damon_ctx);
+ damon_is_running(kdamond->damon_ctx);
}
static int damon_sysfs_apply_inputs(struct damon_ctx *ctx,
@@ -1296,11 +1417,12 @@ static struct damon_ctx *damon_sysfs_build_ctx(
* damon_sysfs_commit_input() - Commit user inputs to a running kdamond.
* @kdamond: The kobject wrapper for the associated kdamond.
*
- * If the sysfs input is wrong, the kdamond will be terminated.
+ * Returns error if the sysfs input is wrong.
*/
-static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond)
+static int damon_sysfs_commit_input(void *data)
{
- struct damon_ctx *param_ctx;
+ struct damon_sysfs_kdamond *kdamond = data;
+ struct damon_ctx *param_ctx, *test_ctx;
int err;
if (!damon_sysfs_kdamond_running(kdamond))
@@ -1312,15 +1434,22 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond)
param_ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]);
if (IS_ERR(param_ctx))
return PTR_ERR(param_ctx);
+ test_ctx = damon_new_ctx();
+ if (!test_ctx)
+ return -ENOMEM;
+ err = damon_commit_ctx(test_ctx, param_ctx);
+ if (err)
+ goto out;
err = damon_commit_ctx(kdamond->damon_ctx, param_ctx);
- damon_sysfs_destroy_targets(param_ctx);
+out:
+ damon_destroy_ctx(test_ctx);
damon_destroy_ctx(param_ctx);
return err;
}
-static int damon_sysfs_commit_schemes_quota_goals(
- struct damon_sysfs_kdamond *sysfs_kdamond)
+static int damon_sysfs_commit_schemes_quota_goals(void *data)
{
+ struct damon_sysfs_kdamond *sysfs_kdamond = data;
struct damon_ctx *ctx;
struct damon_sysfs_context *sysfs_ctx;
@@ -1338,128 +1467,33 @@ static int damon_sysfs_commit_schemes_quota_goals(
/*
* damon_sysfs_upd_schemes_effective_quotas() - Update schemes effective quotas
* sysfs files.
- * @kdamond: The kobject wrapper that associated to the kdamond thread.
+ * @data: The kobject wrapper that is associated with the kdamond thread.
*
* This function reads the schemes' effective quotas of specific kdamond and
* update the related values for sysfs files. This function should be called
* from DAMON callbacks while holding ``damon_syfs_lock``, to safely access the
* DAMON contexts-internal data and DAMON sysfs variables.
*/
-static int damon_sysfs_upd_schemes_effective_quotas(
- struct damon_sysfs_kdamond *kdamond)
+static int damon_sysfs_upd_schemes_effective_quotas(void *data)
{
+ struct damon_sysfs_kdamond *kdamond = data;
struct damon_ctx *ctx = kdamond->damon_ctx;
- if (!ctx)
- return -EINVAL;
damos_sysfs_update_effective_quotas(
kdamond->contexts->contexts_arr[0]->schemes, ctx);
return 0;
}
-
-/*
- * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests.
- * @c: The DAMON context of the callback.
- * @active: Whether @c is not deactivated due to watermarks.
- * @after_aggr: Whether this is called from after_aggregation() callback.
- *
- * This function is periodically called back from the kdamond thread for @c.
- * Then, it checks if there is a waiting DAMON sysfs request and handles it.
- */
-static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active,
- bool after_aggregation)
-{
- struct damon_sysfs_kdamond *kdamond;
- bool total_bytes_only = false;
- int err = 0;
-
- /* avoid deadlock due to concurrent state_store('off') */
- if (!damon_sysfs_schemes_regions_updating &&
- !mutex_trylock(&damon_sysfs_lock))
- return 0;
- kdamond = damon_sysfs_cmd_request.kdamond;
- if (!kdamond || kdamond->damon_ctx != c)
- goto out;
- switch (damon_sysfs_cmd_request.cmd) {
- case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS:
- err = damon_sysfs_upd_schemes_stats(kdamond);
- break;
- case DAMON_SYSFS_CMD_COMMIT:
- if (!after_aggregation)
- goto out;
- err = damon_sysfs_commit_input(kdamond);
- break;
- case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS:
- err = damon_sysfs_commit_schemes_quota_goals(kdamond);
- break;
- case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES:
- total_bytes_only = true;
- fallthrough;
- case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS:
- if (!damon_sysfs_schemes_regions_updating) {
- err = damon_sysfs_upd_schemes_regions_start(kdamond,
- total_bytes_only);
- if (!err) {
- damon_sysfs_schemes_regions_updating = true;
- goto keep_lock_out;
- }
- } else {
- damos_sysfs_mark_finished_regions_updates(c);
- /*
- * Continue regions updating if DAMON is till
- * active and the update for all schemes is not
- * finished.
- */
- if (active && !damos_sysfs_regions_upd_done())
- goto keep_lock_out;
- err = damon_sysfs_upd_schemes_regions_stop(kdamond);
- damon_sysfs_schemes_regions_updating = false;
- }
- break;
- case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS:
- err = damon_sysfs_clear_schemes_regions(kdamond);
- break;
- case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS:
- err = damon_sysfs_upd_schemes_effective_quotas(kdamond);
- break;
- default:
- break;
- }
- /* Mark the request as invalid now. */
- damon_sysfs_cmd_request.kdamond = NULL;
-out:
- if (!damon_sysfs_schemes_regions_updating)
- mutex_unlock(&damon_sysfs_lock);
-keep_lock_out:
- return err;
-}
-
-static int damon_sysfs_after_wmarks_check(struct damon_ctx *c)
-{
- /*
- * after_wmarks_check() is called back while the context is deactivated
- * by watermarks.
- */
- return damon_sysfs_cmd_request_callback(c, false, false);
-}
-
-static int damon_sysfs_after_sampling(struct damon_ctx *c)
+static int damon_sysfs_upd_tuned_intervals(void *data)
{
- /*
- * after_sampling() is called back only while the context is not
- * deactivated by watermarks.
- */
- return damon_sysfs_cmd_request_callback(c, true, false);
-}
+ struct damon_sysfs_kdamond *kdamond = data;
+ struct damon_ctx *ctx = kdamond->damon_ctx;
-static int damon_sysfs_after_aggregation(struct damon_ctx *c)
-{
- /*
- * after_aggregation() is called back only while the context is not
- * deactivated by watermarks.
- */
- return damon_sysfs_cmd_request_callback(c, true, true);
+ kdamond->contexts->contexts_arr[0]->attrs->intervals->sample_us =
+ ctx->attrs.sample_interval;
+ kdamond->contexts->contexts_arr[0]->attrs->intervals->aggr_us =
+ ctx->attrs.aggr_interval;
+ return 0;
}
static struct damon_ctx *damon_sysfs_build_ctx(
@@ -1477,22 +1511,39 @@ static struct damon_ctx *damon_sysfs_build_ctx(
return ERR_PTR(err);
}
- ctx->callback.after_wmarks_check = damon_sysfs_after_wmarks_check;
- ctx->callback.after_sampling = damon_sysfs_after_sampling;
- ctx->callback.after_aggregation = damon_sysfs_after_aggregation;
- ctx->callback.before_terminate = damon_sysfs_before_terminate;
return ctx;
}
+static unsigned long damon_sysfs_next_update_jiffies;
+
+static int damon_sysfs_repeat_call_fn(void *data)
+{
+ struct damon_sysfs_kdamond *sysfs_kdamond = data;
+
+ if (!sysfs_kdamond->refresh_ms)
+ return 0;
+ if (time_before(jiffies, damon_sysfs_next_update_jiffies))
+ return 0;
+ damon_sysfs_next_update_jiffies = jiffies +
+ msecs_to_jiffies(sysfs_kdamond->refresh_ms);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return 0;
+ damon_sysfs_upd_tuned_intervals(sysfs_kdamond);
+ damon_sysfs_upd_schemes_stats(sysfs_kdamond);
+ damon_sysfs_upd_schemes_effective_quotas(sysfs_kdamond);
+ mutex_unlock(&damon_sysfs_lock);
+ return 0;
+}
+
static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
{
struct damon_ctx *ctx;
+ struct damon_call_control *repeat_call_control;
int err;
if (damon_sysfs_kdamond_running(kdamond))
return -EBUSY;
- if (damon_sysfs_cmd_request.kdamond == kdamond)
- return -EBUSY;
/* TODO: support multiple contexts per kdamond */
if (kdamond->contexts->nr != 1)
return -EINVAL;
@@ -1501,15 +1552,32 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
damon_destroy_ctx(kdamond->damon_ctx);
kdamond->damon_ctx = NULL;
+ repeat_call_control = kmalloc(sizeof(*repeat_call_control),
+ GFP_KERNEL);
+ if (!repeat_call_control)
+ return -ENOMEM;
+
ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]);
- if (IS_ERR(ctx))
+ if (IS_ERR(ctx)) {
+ kfree(repeat_call_control);
return PTR_ERR(ctx);
+ }
err = damon_start(&ctx, 1, false);
if (err) {
+ kfree(repeat_call_control);
damon_destroy_ctx(ctx);
return err;
}
kdamond->damon_ctx = ctx;
+
+ damon_sysfs_next_update_jiffies =
+ jiffies + msecs_to_jiffies(kdamond->refresh_ms);
+
+ repeat_call_control->fn = damon_sysfs_repeat_call_fn;
+ repeat_call_control->data = kdamond;
+ repeat_call_control->repeat = true;
+ repeat_call_control->dealloc_on_cancel = true;
+ damon_call(ctx, repeat_call_control);
return err;
}
@@ -1525,63 +1593,104 @@ static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond)
*/
}
+static int damon_sysfs_damon_call(int (*fn)(void *data),
+ struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_call_control call_control = {};
+ int err;
+
+ if (!kdamond->damon_ctx)
+ return -EINVAL;
+ call_control.fn = fn;
+ call_control.data = kdamond;
+ err = damon_call(kdamond->damon_ctx, &call_control);
+ return err ? err : call_control.return_code;
+}
+
+struct damon_sysfs_schemes_walk_data {
+ struct damon_sysfs_kdamond *sysfs_kdamond;
+ bool total_bytes_only;
+};
+
+/* populate the region directory */
+static void damon_sysfs_schemes_tried_regions_upd_one(void *data, struct damon_ctx *ctx,
+ struct damon_target *t, struct damon_region *r,
+ struct damos *s, unsigned long sz_filter_passed)
+{
+ struct damon_sysfs_schemes_walk_data *walk_data = data;
+ struct damon_sysfs_kdamond *sysfs_kdamond = walk_data->sysfs_kdamond;
+
+ damos_sysfs_populate_region_dir(
+ sysfs_kdamond->contexts->contexts_arr[0]->schemes,
+ ctx, t, r, s, walk_data->total_bytes_only,
+ sz_filter_passed);
+}
+
+static int damon_sysfs_update_schemes_tried_regions(
+ struct damon_sysfs_kdamond *sysfs_kdamond, bool total_bytes_only)
+{
+ struct damon_sysfs_schemes_walk_data walk_data = {
+ .sysfs_kdamond = sysfs_kdamond,
+ .total_bytes_only = total_bytes_only,
+ };
+ struct damos_walk_control control = {
+ .walk_fn = damon_sysfs_schemes_tried_regions_upd_one,
+ .data = &walk_data,
+ };
+ struct damon_ctx *ctx = sysfs_kdamond->damon_ctx;
+
+ if (!ctx)
+ return -EINVAL;
+
+ damon_sysfs_schemes_clear_regions(
+ sysfs_kdamond->contexts->contexts_arr[0]->schemes);
+ return damos_walk(ctx, &control);
+}
+
/*
* damon_sysfs_handle_cmd() - Handle a command for a specific kdamond.
* @cmd: The command to handle.
* @kdamond: The kobject wrapper for the associated kdamond.
*
- * This function handles a DAMON sysfs command for a kdamond. For commands
- * that need to access running DAMON context-internal data, it requests
- * handling of the command to the DAMON callback
- * (@damon_sysfs_cmd_request_callback()) and wait until it is properly handled,
- * or the context is completed.
+ * This function handles a DAMON sysfs command for a kdamond.
*
* Return: 0 on success, negative error code otherwise.
*/
static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd,
struct damon_sysfs_kdamond *kdamond)
{
- bool need_wait = true;
-
- /* Handle commands that doesn't access DAMON context-internal data */
switch (cmd) {
case DAMON_SYSFS_CMD_ON:
return damon_sysfs_turn_damon_on(kdamond);
case DAMON_SYSFS_CMD_OFF:
return damon_sysfs_turn_damon_off(kdamond);
+ case DAMON_SYSFS_CMD_COMMIT:
+ return damon_sysfs_damon_call(
+ damon_sysfs_commit_input, kdamond);
+ case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS:
+ return damon_sysfs_damon_call(
+ damon_sysfs_commit_schemes_quota_goals,
+ kdamond);
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS:
+ return damon_sysfs_damon_call(
+ damon_sysfs_upd_schemes_stats, kdamond);
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES:
+ return damon_sysfs_update_schemes_tried_regions(kdamond, true);
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS:
+ return damon_sysfs_update_schemes_tried_regions(kdamond, false);
+ case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS:
+ return damon_sysfs_schemes_clear_regions(
+ kdamond->contexts->contexts_arr[0]->schemes);
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS:
+ return damon_sysfs_damon_call(
+ damon_sysfs_upd_schemes_effective_quotas,
+ kdamond);
+ case DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS:
+ return damon_sysfs_damon_call(
+ damon_sysfs_upd_tuned_intervals, kdamond);
default:
- break;
- }
-
- /* Pass the command to DAMON callback for safe DAMON context access */
- if (damon_sysfs_cmd_request.kdamond)
- return -EBUSY;
- if (!damon_sysfs_kdamond_running(kdamond))
return -EINVAL;
- damon_sysfs_cmd_request.cmd = cmd;
- damon_sysfs_cmd_request.kdamond = kdamond;
-
- /*
- * wait until damon_sysfs_cmd_request_callback() handles the request
- * from kdamond context
- */
- mutex_unlock(&damon_sysfs_lock);
- while (need_wait) {
- schedule_timeout_idle(msecs_to_jiffies(100));
- if (!mutex_trylock(&damon_sysfs_lock))
- continue;
- if (!damon_sysfs_cmd_request.kdamond) {
- /* damon_sysfs_cmd_request_callback() handled */
- need_wait = false;
- } else if (!damon_sysfs_kdamond_running(kdamond)) {
- /* kdamond has already finished */
- need_wait = false;
- damon_sysfs_cmd_request.kdamond = NULL;
- }
- mutex_unlock(&damon_sysfs_lock);
}
- mutex_lock(&damon_sysfs_lock);
- return 0;
}
static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1629,6 +1738,30 @@ out:
return sysfs_emit(buf, "%d\n", pid);
}
+static ssize_t refresh_ms_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_kdamond *kdamond = container_of(kobj,
+ struct damon_sysfs_kdamond, kobj);
+
+ return sysfs_emit(buf, "%u\n", kdamond->refresh_ms);
+}
+
+static ssize_t refresh_ms_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_kdamond *kdamond = container_of(kobj,
+ struct damon_sysfs_kdamond, kobj);
+ unsigned int nr;
+ int err = kstrtouint(buf, 0, &nr);
+
+ if (err)
+ return err;
+
+ kdamond->refresh_ms = nr;
+ return count;
+}
+
static void damon_sysfs_kdamond_release(struct kobject *kobj)
{
struct damon_sysfs_kdamond *kdamond = container_of(kobj,
@@ -1645,9 +1778,13 @@ static struct kobj_attribute damon_sysfs_kdamond_state_attr =
static struct kobj_attribute damon_sysfs_kdamond_pid_attr =
__ATTR_RO_MODE(pid, 0400);
+static struct kobj_attribute damon_sysfs_kdamond_refresh_ms_attr =
+ __ATTR_RW_MODE(refresh_ms, 0600);
+
static struct attribute *damon_sysfs_kdamond_attrs[] = {
&damon_sysfs_kdamond_state_attr.attr,
&damon_sysfs_kdamond_pid_attr.attr,
+ &damon_sysfs_kdamond_refresh_ms_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_kdamond);
@@ -1693,8 +1830,7 @@ static bool damon_sysfs_kdamonds_busy(struct damon_sysfs_kdamond **kdamonds,
int i;
for (i = 0; i < nr_kdamonds; i++) {
- if (damon_sysfs_kdamond_running(kdamonds[i]) ||
- damon_sysfs_cmd_request.kdamond == kdamonds[i])
+ if (damon_sysfs_kdamond_running(kdamonds[i]))
return true;
}
diff --git a/mm/damon/tests/.kunitconfig b/mm/damon/tests/.kunitconfig
index a73be044fc9b..36a450f57b58 100644
--- a/mm/damon/tests/.kunitconfig
+++ b/mm/damon/tests/.kunitconfig
@@ -13,10 +13,3 @@ CONFIG_DAMON_VADDR_KUNIT_TEST=y
CONFIG_SYSFS=y
CONFIG_DAMON_SYSFS=y
CONFIG_DAMON_SYSFS_KUNIT_TEST=y
-
-# for DAMON debugfs interface
-CONFIG_DEBUG_FS=y
-CONFIG_DAMON_PADDR=y
-CONFIG_DAMON_DBGFS_DEPRECATED=y
-CONFIG_DAMON_DBGFS=y
-CONFIG_DAMON_DBGFS_KUNIT_TEST=y
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index cf22e09a3507..dfedfff19940 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -58,7 +58,7 @@ static void damon_test_target(struct kunit *test)
damon_add_target(c, t);
KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(c));
- damon_destroy_target(t);
+ damon_destroy_target(t, c);
KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c));
damon_destroy_ctx(c);
@@ -310,7 +310,7 @@ static void damon_test_set_regions(struct kunit *test)
KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]);
KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]);
}
- damon_destroy_target(t);
+ damon_destroy_target(t, NULL);
}
static void damon_test_nr_accesses_to_accesses_bp(struct kunit *test)
@@ -348,19 +348,19 @@ static void damon_test_update_monitoring_result(struct kunit *test)
new_attrs = (struct damon_attrs){
.sample_interval = 100, .aggr_interval = 10000,};
- damon_update_monitoring_result(r, &old_attrs, &new_attrs);
+ damon_update_monitoring_result(r, &old_attrs, &new_attrs, false);
KUNIT_EXPECT_EQ(test, r->nr_accesses, 15);
KUNIT_EXPECT_EQ(test, r->age, 2);
new_attrs = (struct damon_attrs){
.sample_interval = 1, .aggr_interval = 1000};
- damon_update_monitoring_result(r, &old_attrs, &new_attrs);
+ damon_update_monitoring_result(r, &old_attrs, &new_attrs, false);
KUNIT_EXPECT_EQ(test, r->nr_accesses, 150);
KUNIT_EXPECT_EQ(test, r->age, 2);
new_attrs = (struct damon_attrs){
.sample_interval = 1, .aggr_interval = 100};
- damon_update_monitoring_result(r, &old_attrs, &new_attrs);
+ damon_update_monitoring_result(r, &old_attrs, &new_attrs, false);
KUNIT_EXPECT_EQ(test, r->nr_accesses, 150);
KUNIT_EXPECT_EQ(test, r->age, 20);
@@ -411,7 +411,7 @@ static void damos_test_new_filter(struct kunit *test)
{
struct damos_filter *filter;
- filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
+ filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false);
KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON);
KUNIT_EXPECT_EQ(test, filter->matching, true);
KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list);
@@ -425,7 +425,7 @@ static void damos_test_filter_out(struct kunit *test)
struct damon_region *r, *r2;
struct damos_filter *f;
- f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true);
+ f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true, false);
f->addr_range = (struct damon_addr_range){
.start = DAMON_MIN_REGION * 2, .end = DAMON_MIN_REGION * 6};
@@ -434,25 +434,25 @@ static void damos_test_filter_out(struct kunit *test)
damon_add_region(r, t);
/* region in the range */
- KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region before the range */
r->ar.start = DAMON_MIN_REGION * 1;
r->ar.end = DAMON_MIN_REGION * 2;
- KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region after the range */
r->ar.start = DAMON_MIN_REGION * 6;
r->ar.end = DAMON_MIN_REGION * 8;
- KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region started before the range */
r->ar.start = DAMON_MIN_REGION * 1;
r->ar.end = DAMON_MIN_REGION * 4;
- KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
/* filter should have split the region */
KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1);
KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2);
@@ -465,7 +465,7 @@ static void damos_test_filter_out(struct kunit *test)
/* region started in the range */
r->ar.start = DAMON_MIN_REGION * 2;
r->ar.end = DAMON_MIN_REGION * 8;
- KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f));
/* filter should have split the region */
KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2);
KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6);
@@ -510,6 +510,75 @@ static void damon_test_feed_loop_next_input(struct kunit *test)
damon_feed_loop_next_input(last_input, 2000));
}
+static void damon_test_set_filters_default_reject(struct kunit *test)
+{
+ struct damos scheme;
+ struct damos_filter *target_filter, *anon_filter;
+
+ INIT_LIST_HEAD(&scheme.filters);
+ INIT_LIST_HEAD(&scheme.ops_filters);
+
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * No filter is installed. Allow by default on both core and ops layer
+ * filtering stages, since there are no filters at all.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
+
+ target_filter = damos_new_filter(DAMOS_FILTER_TYPE_TARGET, true, true);
+ damos_add_filter(&scheme, target_filter);
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled allow-filter is installed.
+ * Rejct by default on core layer filtering stage due to the last
+ * core-layer-filter's behavior.
+ * Allow by default on ops layer filtering stage due to the absence of
+ * ops layer filters.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, true);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
+
+ target_filter->allow = false;
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled reject-filter is installed.
+ * Allow by default on core layer filtering stage due to the last
+ * core-layer-filter's behavior.
+ * Allow by default on ops layer filtering stage due to the absence of
+ * ops layer filters.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
+
+ anon_filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true);
+ damos_add_filter(&scheme, anon_filter);
+
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled reject-filter and ops-handled allow-filter are installed.
+ * Allow by default on core layer filtering stage due to the existence
+ * of the ops-handled filter.
+ * Reject by default on ops layer filtering stage due to the last
+ * ops-layer-filter's behavior.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true);
+
+ target_filter->allow = true;
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled allow-filter and ops-handled allow-filter are
+ * installed.
+ * Allow by default on core layer filtering stage due to the existence
+ * of the ops-handled filter.
+ * Reject by default on ops layer filtering stage due to the last
+ * ops-layer-filter's behavior.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true);
+}
+
static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_target),
KUNIT_CASE(damon_test_regions),
@@ -527,6 +596,7 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damos_test_new_filter),
KUNIT_CASE(damos_test_filter_out),
KUNIT_CASE(damon_test_feed_loop_next_input),
+ KUNIT_CASE(damon_test_set_filters_default_reject),
{},
};
diff --git a/mm/damon/tests/dbgfs-kunit.h b/mm/damon/tests/dbgfs-kunit.h
deleted file mode 100644
index 087e53f641a8..000000000000
--- a/mm/damon/tests/dbgfs-kunit.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * DAMON Debugfs Interface Unit Tests
- *
- * Author: SeongJae Park <sj@kernel.org>
- */
-
-#ifdef CONFIG_DAMON_DBGFS_KUNIT_TEST
-
-#ifndef _DAMON_DBGFS_TEST_H
-#define _DAMON_DBGFS_TEST_H
-
-#include <kunit/test.h>
-
-static void damon_dbgfs_test_str_to_ints(struct kunit *test)
-{
- char *question;
- int *answers;
- int expected[] = {12, 35, 46};
- ssize_t nr_integers = 0, i;
-
- question = "123";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers);
- KUNIT_EXPECT_EQ(test, 123, answers[0]);
- kfree(answers);
-
- question = "123abc";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers);
- KUNIT_EXPECT_EQ(test, 123, answers[0]);
- kfree(answers);
-
- question = "a123";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
- kfree(answers);
-
- question = "12 35";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers);
- for (i = 0; i < nr_integers; i++)
- KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
- kfree(answers);
-
- question = "12 35 46";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers);
- for (i = 0; i < nr_integers; i++)
- KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
- kfree(answers);
-
- question = "12 35 abc 46";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers);
- for (i = 0; i < 2; i++)
- KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
- kfree(answers);
-
- question = "";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
- kfree(answers);
-
- question = "\n";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
- kfree(answers);
-}
-
-static void damon_dbgfs_test_set_targets(struct kunit *test)
-{
- struct damon_ctx *ctx = dbgfs_new_ctx();
- char buf[64];
-
- if (!damon_is_registered_ops(DAMON_OPS_PADDR)) {
- dbgfs_destroy_ctx(ctx);
- kunit_skip(test, "PADDR not registered");
- }
-
- /* Make DAMON consider target has no pid */
- damon_select_ops(ctx, DAMON_OPS_PADDR);
-
- dbgfs_set_targets(ctx, 0, NULL);
- sprint_target_ids(ctx, buf, 64);
- KUNIT_EXPECT_STREQ(test, (char *)buf, "\n");
-
- dbgfs_set_targets(ctx, 1, NULL);
- sprint_target_ids(ctx, buf, 64);
- KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n");
-
- dbgfs_set_targets(ctx, 0, NULL);
- sprint_target_ids(ctx, buf, 64);
- KUNIT_EXPECT_STREQ(test, (char *)buf, "\n");
-
- dbgfs_destroy_ctx(ctx);
-}
-
-static void damon_dbgfs_test_set_init_regions(struct kunit *test)
-{
- struct damon_ctx *ctx = damon_new_ctx();
- /* Each line represents one region in ``<target idx> <start> <end>`` */
- char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45",
- "1 10 20\n",
- "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n",
- ""};
- /* Reading the file again will show sorted, clean output */
- char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n",
- "1 10 20\n",
- "0 39 59\n0 70 134\n1 10 20\n1 20 25\n",
- ""};
- char * const invalid_inputs[] = {"3 10 20\n", /* target not exists */
- "1 10 20\n 1 14 26\n", /* regions overlap */
- "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */
- char *input, *expect;
- int i, rc;
- char buf[256];
-
- if (!damon_is_registered_ops(DAMON_OPS_PADDR)) {
- damon_destroy_ctx(ctx);
- kunit_skip(test, "PADDR not registered");
- }
-
- damon_select_ops(ctx, DAMON_OPS_PADDR);
-
- dbgfs_set_targets(ctx, 3, NULL);
-
- /* Put valid inputs and check the results */
- for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) {
- input = valid_inputs[i];
- expect = valid_expects[i];
-
- rc = set_init_regions(ctx, input, strnlen(input, 256));
- KUNIT_EXPECT_EQ(test, rc, 0);
-
- memset(buf, 0, 256);
- sprint_init_regions(ctx, buf, 256);
-
- KUNIT_EXPECT_STREQ(test, (char *)buf, expect);
- }
- /* Put invalid inputs and check the return error code */
- for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) {
- input = invalid_inputs[i];
- pr_info("input: %s\n", input);
- rc = set_init_regions(ctx, input, strnlen(input, 256));
- KUNIT_EXPECT_EQ(test, rc, -EINVAL);
-
- memset(buf, 0, 256);
- sprint_init_regions(ctx, buf, 256);
-
- KUNIT_EXPECT_STREQ(test, (char *)buf, "");
- }
-
- dbgfs_set_targets(ctx, 0, NULL);
- damon_destroy_ctx(ctx);
-}
-
-static struct kunit_case damon_test_cases[] = {
- KUNIT_CASE(damon_dbgfs_test_str_to_ints),
- KUNIT_CASE(damon_dbgfs_test_set_targets),
- KUNIT_CASE(damon_dbgfs_test_set_init_regions),
- {},
-};
-
-static struct kunit_suite damon_test_suite = {
- .name = "damon-dbgfs",
- .test_cases = damon_test_cases,
-};
-kunit_test_suite(damon_test_suite);
-
-#endif /* _DAMON_DBGFS_TEST_H */
-
-#endif /* CONFIG_DAMON_KUNIT_TEST */
diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h
index b9fe3bc8472b..d2b37ccf2cc0 100644
--- a/mm/damon/tests/vaddr-kunit.h
+++ b/mm/damon/tests/vaddr-kunit.h
@@ -68,7 +68,7 @@ static void damon_test_three_regions_in_vmas(struct kunit *test)
static struct mm_struct mm;
struct damon_addr_range regions[3] = {0};
/* 10-20-25, 200-210-220, 300-305, 307-330 */
- struct vm_area_struct vmas[] = {
+ static struct vm_area_struct vmas[] = {
(struct vm_area_struct) {.vm_start = 10, .vm_end = 20},
(struct vm_area_struct) {.vm_start = 20, .vm_end = 25},
(struct vm_area_struct) {.vm_start = 200, .vm_end = 210},
@@ -149,7 +149,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]);
}
- damon_destroy_target(t);
+ damon_destroy_target(t, NULL);
}
/*
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index b9eaa20b73b9..1172390b76b4 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * DAMON Primitives for Virtual Address Spaces
+ * DAMON Code for Virtual Address Spaces
*
* Author: SeongJae Park <sj@kernel.org>
*/
@@ -15,6 +15,7 @@
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
+#include "../internal.h"
#include "ops-common.h"
#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
@@ -327,10 +328,8 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
}
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (!pte) {
- walk->action = ACTION_AGAIN;
+ if (!pte)
return 0;
- }
if (!pte_present(ptep_get(pte)))
goto out;
damon_ptep_mkold(pte, walk->vma, addr);
@@ -480,10 +479,8 @@ regular_page:
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (!pte) {
- walk->action = ACTION_AGAIN;
+ if (!pte)
return 0;
- }
ptent = ptep_get(pte);
if (!pte_present(ptent))
goto out;
@@ -610,6 +607,187 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
return max_nr_accesses;
}
+static bool damos_va_filter_young_match(struct damos_filter *filter,
+ struct folio *folio, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, pmd_t *pmdp)
+{
+ bool young = false;
+
+ if (ptep)
+ young = pte_young(ptep_get(ptep));
+ else if (pmdp)
+ young = pmd_young(pmdp_get(pmdp));
+
+ young = young || !folio_test_idle(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr);
+
+ if (young && ptep)
+ damon_ptep_mkold(ptep, vma, addr);
+ else if (young && pmdp)
+ damon_pmdp_mkold(pmdp, vma, addr);
+
+ return young == filter->matching;
+}
+
+static bool damos_va_filter_out(struct damos *scheme, struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pmd_t *pmdp)
+{
+ struct damos_filter *filter;
+ bool matched;
+
+ if (scheme->core_filters_allowed)
+ return false;
+
+ damos_for_each_ops_filter(filter, scheme) {
+ /*
+ * damos_folio_filter_match checks the young filter by doing an
+ * rmap on the folio to find its page table. However, since this is the
+ * vaddr scheme, we have direct access to the page tables, so use that
+ * instead.
+ */
+ if (filter->type == DAMOS_FILTER_TYPE_YOUNG)
+ matched = damos_va_filter_young_match(filter, folio,
+ vma, addr, ptep, pmdp);
+ else
+ matched = damos_folio_filter_match(filter, folio);
+
+ if (matched)
+ return !filter->allow;
+ }
+ return scheme->ops_filters_default_reject;
+}
+
+struct damos_va_migrate_private {
+ struct list_head *migration_lists;
+ struct damos *scheme;
+};
+
+/*
+ * Place the given folio in the migration_list corresponding to where the folio
+ * should be migrated.
+ *
+ * The algorithm used here is similar to weighted_interleave_nid()
+ */
+static void damos_va_migrate_dests_add(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr,
+ struct damos_migrate_dests *dests,
+ struct list_head *migration_lists)
+{
+ pgoff_t ilx;
+ int order;
+ unsigned int target;
+ unsigned int weight_total = 0;
+ int i;
+
+ /*
+ * If dests is empty, there is only one migration list corresponding
+ * to s->target_nid.
+ */
+ if (!dests->nr_dests) {
+ i = 0;
+ goto isolate;
+ }
+
+ order = folio_order(folio);
+ ilx = vma->vm_pgoff >> order;
+ ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
+
+ for (i = 0; i < dests->nr_dests; i++)
+ weight_total += dests->weight_arr[i];
+
+ /* If the total weights are somehow 0, don't migrate at all */
+ if (!weight_total)
+ return;
+
+ target = ilx % weight_total;
+ for (i = 0; i < dests->nr_dests; i++) {
+ if (target < dests->weight_arr[i])
+ break;
+ target -= dests->weight_arr[i];
+ }
+
+ /* If the folio is already in the right node, don't do anything */
+ if (folio_nid(folio) == dests->node_id_arr[i])
+ return;
+
+isolate:
+ if (!folio_isolate_lru(folio))
+ return;
+
+ list_add(&folio->lru, &migration_lists[i]);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int damos_va_migrate_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct damos_va_migrate_private *priv = walk->private;
+ struct list_head *migration_lists = priv->migration_lists;
+ struct damos *s = priv->scheme;
+ struct damos_migrate_dests *dests = &s->migrate_dests;
+ struct folio *folio;
+ spinlock_t *ptl;
+ pmd_t pmde;
+
+ ptl = pmd_lock(walk->mm, pmd);
+ pmde = pmdp_get(pmd);
+
+ if (!pmd_present(pmde) || !pmd_trans_huge(pmde))
+ goto unlock;
+
+ /* Tell page walk code to not split the PMD */
+ walk->action = ACTION_CONTINUE;
+
+ folio = damon_get_folio(pmd_pfn(pmde));
+ if (!folio)
+ goto unlock;
+
+ if (damos_va_filter_out(s, folio, walk->vma, addr, NULL, pmd))
+ goto put_folio;
+
+ damos_va_migrate_dests_add(folio, walk->vma, addr, dests,
+ migration_lists);
+
+put_folio:
+ folio_put(folio);
+unlock:
+ spin_unlock(ptl);
+ return 0;
+}
+#else
+#define damos_va_migrate_pmd_entry NULL
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static int damos_va_migrate_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct damos_va_migrate_private *priv = walk->private;
+ struct list_head *migration_lists = priv->migration_lists;
+ struct damos *s = priv->scheme;
+ struct damos_migrate_dests *dests = &s->migrate_dests;
+ struct folio *folio;
+ pte_t ptent;
+
+ ptent = ptep_get(pte);
+ if (pte_none(ptent) || !pte_present(ptent))
+ return 0;
+
+ folio = damon_get_folio(pte_pfn(ptent));
+ if (!folio)
+ return 0;
+
+ if (damos_va_filter_out(s, folio, walk->vma, addr, pte, NULL))
+ goto put_folio;
+
+ damos_va_migrate_dests_add(folio, walk->vma, addr, dests,
+ migration_lists);
+
+put_folio:
+ folio_put(folio);
+ return 0;
+}
+
/*
* Functions for the target validity check and cleanup
*/
@@ -627,6 +805,11 @@ static bool damon_va_target_valid(struct damon_target *t)
return false;
}
+static void damon_va_cleanup_target(struct damon_target *t)
+{
+ put_pid(t->pid);
+}
+
#ifndef CONFIG_ADVISE_SYSCALLS
static unsigned long damos_madvise(struct damon_target *target,
struct damon_region *r, int behavior)
@@ -653,9 +836,59 @@ static unsigned long damos_madvise(struct damon_target *target,
}
#endif /* CONFIG_ADVISE_SYSCALLS */
+static unsigned long damos_va_migrate(struct damon_target *target,
+ struct damon_region *r, struct damos *s,
+ unsigned long *sz_filter_passed)
+{
+ LIST_HEAD(folio_list);
+ struct damos_va_migrate_private priv;
+ struct mm_struct *mm;
+ int nr_dests;
+ int nid;
+ bool use_target_nid;
+ unsigned long applied = 0;
+ struct damos_migrate_dests *dests = &s->migrate_dests;
+ struct mm_walk_ops walk_ops = {
+ .pmd_entry = damos_va_migrate_pmd_entry,
+ .pte_entry = damos_va_migrate_pte_entry,
+ .walk_lock = PGWALK_RDLOCK,
+ };
+
+ use_target_nid = dests->nr_dests == 0;
+ nr_dests = use_target_nid ? 1 : dests->nr_dests;
+ priv.scheme = s;
+ priv.migration_lists = kmalloc_array(nr_dests,
+ sizeof(*priv.migration_lists), GFP_KERNEL);
+ if (!priv.migration_lists)
+ return 0;
+
+ for (int i = 0; i < nr_dests; i++)
+ INIT_LIST_HEAD(&priv.migration_lists[i]);
+
+ mm = damon_get_mm(target);
+ if (!mm)
+ goto free_lists;
+
+ mmap_read_lock(mm);
+ walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv);
+ mmap_read_unlock(mm);
+ mmput(mm);
+
+ for (int i = 0; i < nr_dests; i++) {
+ nid = use_target_nid ? s->target_nid : dests->node_id_arr[i];
+ applied += damon_migrate_pages(&priv.migration_lists[i], nid);
+ cond_resched();
+ }
+
+free_lists:
+ kfree(priv.migration_lists);
+ return applied * PAGE_SIZE;
+}
+
static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
- struct damos *scheme)
+ struct damos *scheme, unsigned long *sz_filter_passed)
{
int madv_action;
@@ -675,6 +908,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
case DAMOS_NOHUGEPAGE:
madv_action = MADV_NOHUGEPAGE;
break;
+ case DAMOS_MIGRATE_HOT:
+ case DAMOS_MIGRATE_COLD:
+ return damos_va_migrate(t, r, scheme, sz_filter_passed);
case DAMOS_STAT:
return 0;
default:
@@ -695,6 +931,10 @@ static int damon_va_scheme_score(struct damon_ctx *context,
switch (scheme->action) {
case DAMOS_PAGEOUT:
return damon_cold_score(context, r, scheme);
+ case DAMOS_MIGRATE_HOT:
+ return damon_hot_score(context, r, scheme);
+ case DAMOS_MIGRATE_COLD:
+ return damon_cold_score(context, r, scheme);
default:
break;
}
@@ -710,8 +950,8 @@ static int __init damon_va_initcall(void)
.update = damon_va_update,
.prepare_access_checks = damon_va_prepare_access_checks,
.check_accesses = damon_va_check_accesses,
- .reset_aggregated = NULL,
.target_valid = damon_va_target_valid,
+ .cleanup_target = damon_va_cleanup_target,
.cleanup = NULL,
.apply_scheme = damon_va_apply_scheme,
.get_scheme_score = damon_va_scheme_score,
diff --git a/mm/debug.c b/mm/debug.c
index 95b6ab809c0e..b4388f4dcd4d 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -71,20 +71,27 @@ static void __dump_folio(struct folio *folio, struct page *page,
unsigned long pfn, unsigned long idx)
{
struct address_space *mapping = folio_mapping(folio);
- int mapcount = atomic_read(&page->_mapcount);
+ int mapcount = atomic_read(&page->_mapcount) + 1;
char *type = "";
- mapcount = page_mapcount_is_type(mapcount) ? 0 : mapcount + 1;
+ if (page_mapcount_is_type(mapcount))
+ mapcount = 0;
+
pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
folio_ref_count(folio), mapcount, mapping,
folio->index + idx, pfn);
if (folio_test_large(folio)) {
+ int pincount = 0;
+
+ if (folio_has_pincount(folio))
+ pincount = atomic_read(&folio->_pincount);
+
pr_warn("head: order:%u mapcount:%d entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
folio_order(folio),
folio_mapcount(folio),
folio_entire_mapcount(folio),
folio_nr_pages_mapped(folio),
- atomic_read(&folio->_pincount));
+ pincount);
}
#ifdef CONFIG_MEMCG
@@ -122,50 +129,19 @@ static void __dump_folio(struct folio *folio, struct page *page,
static void __dump_page(const struct page *page)
{
- struct folio *foliop, folio;
- struct page precise;
- unsigned long head;
- unsigned long pfn = page_to_pfn(page);
- unsigned long idx, nr_pages = 1;
- int loops = 5;
-
-again:
- memcpy(&precise, page, sizeof(*page));
- head = precise.compound_head;
- if ((head & 1) == 0) {
- foliop = (struct folio *)&precise;
- idx = 0;
- if (!folio_test_large(foliop))
- goto dump;
- foliop = (struct folio *)page;
- } else {
- foliop = (struct folio *)(head - 1);
- idx = folio_page_idx(foliop, page);
- }
+ struct page_snapshot ps;
- if (idx < MAX_FOLIO_NR_PAGES) {
- memcpy(&folio, foliop, 2 * sizeof(struct page));
- nr_pages = folio_nr_pages(&folio);
- foliop = &folio;
- }
-
- if (idx > nr_pages) {
- if (loops-- > 0)
- goto again;
+ snapshot_page(&ps, page);
+ if (!snapshot_page_is_faithful(&ps))
pr_warn("page does not match folio\n");
- precise.compound_head &= ~1UL;
- foliop = (struct folio *)&precise;
- idx = 0;
- }
-dump:
- __dump_folio(foliop, &precise, pfn, idx);
+ __dump_folio(&ps.folio_snapshot, &ps.page_snapshot, ps.pfn, ps.idx);
}
void dump_page(const struct page *page, const char *reason)
{
if (PagePoisoned(page))
- pr_warn("page:%p is uninitialized and poisoned", page);
+ pr_warn("page:%p is uninitialized and poisoned\n", page);
else
__dump_page(page);
if (reason)
@@ -181,11 +157,17 @@ void dump_vma(const struct vm_area_struct *vma)
pr_emerg("vma %px start %px end %px mm %px\n"
"prot %lx anon_vma %px vm_ops %px\n"
"pgoff %lx file %px private_data %px\n"
+#ifdef CONFIG_PER_VMA_LOCK
+ "refcnt %x\n"
+#endif
"flags: %#lx(%pGv)\n",
vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm,
(unsigned long)pgprot_val(vma->vm_page_prot),
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
vma->vm_file, vma->vm_private_data,
+#ifdef CONFIG_PER_VMA_LOCK
+ refcount_read(&vma->vm_refcnt),
+#endif
vma->vm_flags, &vma->vm_flags);
}
EXPORT_SYMBOL(dump_vma);
@@ -249,6 +231,83 @@ void dump_mm(const struct mm_struct *mm)
}
EXPORT_SYMBOL(dump_mm);
+void dump_vmg(const struct vma_merge_struct *vmg, const char *reason)
+{
+ if (reason)
+ pr_warn("vmg %px dumped because: %s\n", vmg, reason);
+
+ if (!vmg) {
+ pr_warn("vmg %px state: (NULL)\n", vmg);
+ return;
+ }
+
+ pr_warn("vmg %px state: mm %px pgoff %lx\n"
+ "vmi %px [%lx,%lx)\n"
+ "prev %px middle %px next %px target %px\n"
+ "start %lx end %lx flags %lx\n"
+ "file %px anon_vma %px policy %px\n"
+ "uffd_ctx %px\n"
+ "anon_name %px\n"
+ "state %x\n"
+ "just_expand %d\n"
+ "__adjust_middle_start %d __adjust_next_start %d\n"
+ "__remove_middle %d __remove_next %d\n",
+ vmg, vmg->mm, vmg->pgoff,
+ vmg->vmi, vmg->vmi ? vma_iter_addr(vmg->vmi) : 0,
+ vmg->vmi ? vma_iter_end(vmg->vmi) : 0,
+ vmg->prev, vmg->middle, vmg->next, vmg->target,
+ vmg->start, vmg->end, vmg->vm_flags,
+ vmg->file, vmg->anon_vma, vmg->policy,
+#ifdef CONFIG_USERFAULTFD
+ vmg->uffd_ctx.ctx,
+#else
+ (void *)0,
+#endif
+ vmg->anon_name,
+ (int)vmg->state,
+ vmg->just_expand,
+ vmg->__adjust_middle_start, vmg->__adjust_next_start,
+ vmg->__remove_middle, vmg->__remove_next);
+
+ if (vmg->mm) {
+ pr_warn("vmg %px mm:\n", vmg);
+ dump_mm(vmg->mm);
+ } else {
+ pr_warn("vmg %px mm: (NULL)\n", vmg);
+ }
+
+ if (vmg->prev) {
+ pr_warn("vmg %px prev:\n", vmg);
+ dump_vma(vmg->prev);
+ } else {
+ pr_warn("vmg %px prev: (NULL)\n", vmg);
+ }
+
+ if (vmg->middle) {
+ pr_warn("vmg %px middle:\n", vmg);
+ dump_vma(vmg->middle);
+ } else {
+ pr_warn("vmg %px middle: (NULL)\n", vmg);
+ }
+
+ if (vmg->next) {
+ pr_warn("vmg %px next:\n", vmg);
+ dump_vma(vmg->next);
+ } else {
+ pr_warn("vmg %px next: (NULL)\n", vmg);
+ }
+
+#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
+ if (vmg->vmi) {
+ pr_warn("vmg %px vmi:\n", vmg);
+ vma_iter_dump_tree(vmg->vmi);
+ } else {
+ pr_warn("vmg %px vmi: (NULL)\n", vmg);
+ }
+#endif
+}
+EXPORT_SYMBOL(dump_vmg);
+
static bool page_init_poisoning __read_mostly = true;
static int __init setup_vm_debug(char *str)
diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c
index d46acf989dde..6a26eca546c3 100644
--- a/mm/debug_page_alloc.c
+++ b/mm/debug_page_alloc.c
@@ -23,7 +23,7 @@ static int __init debug_guardpage_minorder_setup(char *buf)
unsigned long res;
if (kstrtoul(buf, 10, &res) < 0 || res > MAX_PAGE_ORDER / 2) {
- pr_err("Bad debug_guardpage_minorder value\n");
+ pr_err("Bad debug_guardpage_minorder value: %s\n", buf);
return 0;
}
_debug_guardpage_minorder = res;
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index bc748f700a9e..830107b6dd08 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -20,7 +20,6 @@
#include <linux/mman.h>
#include <linux/mm_types.h>
#include <linux/module.h>
-#include <linux/pfn_t.h>
#include <linux/printk.h>
#include <linux/pgtable.h>
#include <linux/random.h>
@@ -73,6 +72,8 @@ struct pgtable_debug_args {
unsigned long fixed_pud_pfn;
unsigned long fixed_pmd_pfn;
unsigned long fixed_pte_pfn;
+
+ swp_entry_t swp_entry;
};
static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx)
@@ -348,12 +349,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
vaddr &= HPAGE_PUD_MASK;
pud = pfn_pud(args->pud_pfn, args->page_prot);
- /*
- * Some architectures have debug checks to make sure
- * huge pud mapping are only found with devmap entries
- * For now test with only devmap entries.
- */
- pud = pud_mkdevmap(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
flush_dcache_page(page);
pudp_set_wrprotect(args->mm, vaddr, args->pudp);
@@ -366,7 +361,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(args->pud_pfn, args->page_prot);
- pud = pud_mkdevmap(pud);
pud = pud_wrprotect(pud);
pud = pud_mkclean(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
@@ -384,7 +378,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
#endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(args->pud_pfn, args->page_prot);
- pud = pud_mkdevmap(pud);
pud = pud_mkyoung(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
flush_dcache_page(page);
@@ -693,53 +686,6 @@ static void __init pmd_protnone_tests(struct pgtable_debug_args *args)
static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
-static void __init pte_devmap_tests(struct pgtable_debug_args *args)
-{
- pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
-
- pr_debug("Validating PTE devmap\n");
- WARN_ON(!pte_devmap(pte_mkdevmap(pte)));
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_devmap_tests(struct pgtable_debug_args *args)
-{
- pmd_t pmd;
-
- if (!has_transparent_hugepage())
- return;
-
- pr_debug("Validating PMD devmap\n");
- pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
- WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd)));
-}
-
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_devmap_tests(struct pgtable_debug_args *args)
-{
- pud_t pud;
-
- if (!has_transparent_pud_hugepage())
- return;
-
- pr_debug("Validating PUD devmap\n");
- pud = pfn_pud(args->fixed_pud_pfn, args->page_prot);
- WARN_ON(!pud_devmap(pud_mkdevmap(pud)));
-}
-#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
-#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-#else /* CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { }
-static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-#else
-static void __init pte_devmap_tests(struct pgtable_debug_args *args) { }
-static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { }
-static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
-#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
-
static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args)
{
pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
@@ -754,12 +700,15 @@ static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args)
static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
+ pte_t pte;
if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
return;
pr_debug("Validating PTE swap soft dirty\n");
+ pte = swp_entry_to_pte(args->swp_entry);
+ WARN_ON(!is_swap_pte(pte));
+
WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte)));
WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte)));
}
@@ -793,7 +742,9 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args)
return;
pr_debug("Validating PMD swap soft dirty\n");
- pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
+ pmd = swp_entry_to_pmd(args->swp_entry);
+ WARN_ON(!is_swap_pmd(pmd));
+
WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
}
@@ -804,17 +755,11 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) {
static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
{
- unsigned long max_swap_offset;
swp_entry_t entry, entry2;
pte_t pte;
pr_debug("Validating PTE swap exclusive\n");
-
- /* See generic_max_swapfile_size(): probe the maximum offset */
- max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL))));
-
- /* Create a swp entry with all possible bits set */
- entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset);
+ entry = args->swp_entry;
pte = swp_entry_to_pte(entry);
WARN_ON(pte_swp_exclusive(pte));
@@ -838,30 +783,34 @@ static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
static void __init pte_swap_tests(struct pgtable_debug_args *args)
{
- swp_entry_t swp;
- pte_t pte;
+ swp_entry_t arch_entry;
+ pte_t pte1, pte2;
pr_debug("Validating PTE swap\n");
- pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
- swp = __pte_to_swp_entry(pte);
- pte = __swp_entry_to_pte(swp);
- WARN_ON(args->fixed_pte_pfn != pte_pfn(pte));
+ pte1 = swp_entry_to_pte(args->swp_entry);
+ WARN_ON(!is_swap_pte(pte1));
+
+ arch_entry = __pte_to_swp_entry(pte1);
+ pte2 = __swp_entry_to_pte(arch_entry);
+ WARN_ON(memcmp(&pte1, &pte2, sizeof(pte1)));
}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
static void __init pmd_swap_tests(struct pgtable_debug_args *args)
{
- swp_entry_t swp;
- pmd_t pmd;
+ swp_entry_t arch_entry;
+ pmd_t pmd1, pmd2;
if (!has_transparent_hugepage())
return;
pr_debug("Validating PMD swap\n");
- pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
- swp = __pmd_to_swp_entry(pmd);
- pmd = __swp_entry_to_pmd(swp);
- WARN_ON(args->fixed_pmd_pfn != pmd_pfn(pmd));
+ pmd1 = swp_entry_to_pmd(args->swp_entry);
+ WARN_ON(!is_swap_pmd(pmd1));
+
+ arch_entry = __pmd_to_swp_entry(pmd1);
+ pmd2 = __swp_entry_to_pmd(arch_entry);
+ WARN_ON(memcmp(&pmd1, &pmd2, sizeof(pmd1)));
}
#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
static void __init pmd_swap_tests(struct pgtable_debug_args *args) { }
@@ -910,26 +859,18 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args)
#ifdef CONFIG_HUGETLB_PAGE
static void __init hugetlb_basic_tests(struct pgtable_debug_args *args)
{
- struct page *page;
pte_t pte;
pr_debug("Validating HugeTLB basic\n");
- /*
- * Accessing the page associated with the pfn is safe here,
- * as it was previously derived from a real kernel symbol.
- */
- page = pfn_to_page(args->fixed_pmd_pfn);
- pte = mk_huge_pte(page, args->page_prot);
+ pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot);
+ pte = arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS);
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+ WARN_ON(!pte_huge(pte));
+#endif
WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte)));
WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte))));
WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte))));
-
-#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
- pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot);
-
- WARN_ON(!pte_huge(arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS)));
-#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
}
#else /* !CONFIG_HUGETLB_PAGE */
static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { }
@@ -1049,29 +990,34 @@ static void __init destroy_args(struct pgtable_debug_args *args)
/* Free page table entries */
if (args->start_ptep) {
+ pmd_clear(args->pmdp);
pte_free(args->mm, args->start_ptep);
mm_dec_nr_ptes(args->mm);
}
if (args->start_pmdp) {
+ pud_clear(args->pudp);
pmd_free(args->mm, args->start_pmdp);
mm_dec_nr_pmds(args->mm);
}
if (args->start_pudp) {
+ p4d_clear(args->p4dp);
pud_free(args->mm, args->start_pudp);
mm_dec_nr_puds(args->mm);
}
- if (args->start_p4dp)
+ if (args->start_p4dp) {
+ pgd_clear(args->pgdp);
p4d_free(args->mm, args->start_p4dp);
+ }
/* Free vma and mm struct */
if (args->vma)
vm_area_free(args->vma);
if (args->mm)
- mmdrop(args->mm);
+ mmput(args->mm);
}
static struct page * __init
@@ -1174,6 +1120,7 @@ static void __init init_fixed_pfns(struct pgtable_debug_args *args)
static int __init init_args(struct pgtable_debug_args *args)
{
+ unsigned long max_swap_offset;
struct page *page = NULL;
int ret = 0;
@@ -1256,6 +1203,11 @@ static int __init init_args(struct pgtable_debug_args *args)
init_fixed_pfns(args);
+ /* See generic_max_swapfile_size(): probe the maximum offset */
+ max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL))));
+ /* Create a swp entry with all possible bits set */
+ args->swp_entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset);
+
/*
* Allocate (huge) pages because some of the tests need to access
* the data in the pages. The corresponding tests will be skipped
@@ -1341,10 +1293,6 @@ static int __init debug_vm_pgtable(void)
pte_protnone_tests(&args);
pmd_protnone_tests(&args);
- pte_devmap_tests(&args);
- pmd_devmap_tests(&args);
- pud_devmap_tests(&args);
-
pte_soft_dirty_tests(&args);
pmd_soft_dirty_tests(&args);
pte_swap_soft_dirty_tests(&args);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index f0bfc6c490f4..5d8af6e29127 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -56,6 +56,7 @@ struct dma_pool { /* the pool */
unsigned int size;
unsigned int allocation;
unsigned int boundary;
+ int node;
char name[32];
struct list_head pools;
};
@@ -199,16 +200,17 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
/**
- * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
+ * dma_pool_create_node - Creates a pool of coherent DMA memory blocks.
* @name: name of pool, for diagnostics
* @dev: device that will be doing the DMA
* @size: size of the blocks in this pool.
* @align: alignment requirement for blocks; must be a power of two
* @boundary: returned blocks won't cross this power of two boundary
+ * @node: optional NUMA node to allocate structs 'dma_pool' and 'dma_page' on
* Context: not in_interrupt()
*
* Given one of these pools, dma_pool_alloc()
- * may be used to allocate memory. Such memory will all have "consistent"
+ * may be used to allocate memory. Such memory will all have coherent
* DMA mappings, accessible by the device and its driver without using
* cache flushing primitives. The actual size of blocks allocated may be
* larger than requested because of alignment.
@@ -221,8 +223,8 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
* Return: a dma allocation pool with the requested characteristics, or
* %NULL if one can't be created.
*/
-struct dma_pool *dma_pool_create(const char *name, struct device *dev,
- size_t size, size_t align, size_t boundary)
+struct dma_pool *dma_pool_create_node(const char *name, struct device *dev,
+ size_t size, size_t align, size_t boundary, int node)
{
struct dma_pool *retval;
size_t allocation;
@@ -251,7 +253,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
boundary = min(boundary, allocation);
- retval = kzalloc(sizeof(*retval), GFP_KERNEL);
+ retval = kzalloc_node(sizeof(*retval), GFP_KERNEL, node);
if (!retval)
return retval;
@@ -264,6 +266,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
retval->size = size;
retval->boundary = boundary;
retval->allocation = allocation;
+ retval->node = node;
INIT_LIST_HEAD(&retval->pools);
/*
@@ -295,7 +298,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
mutex_unlock(&pools_reg_lock);
return retval;
}
-EXPORT_SYMBOL(dma_pool_create);
+EXPORT_SYMBOL(dma_pool_create_node);
static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
{
@@ -335,7 +338,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
{
struct dma_page *page;
- page = kmalloc(sizeof(*page), mem_flags);
+ page = kmalloc_node(sizeof(*page), mem_flags, pool->node);
if (!page)
return NULL;
@@ -392,7 +395,7 @@ void dma_pool_destroy(struct dma_pool *pool)
EXPORT_SYMBOL(dma_pool_destroy);
/**
- * dma_pool_alloc - get a block of consistent memory
+ * dma_pool_alloc - get a block of coherent memory
* @pool: dma pool that will produce the block
* @mem_flags: GFP_* bitmask
* @handle: pointer to dma address of block
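A hedged usage sketch of the new NUMA-aware constructor (the device, sizes and error handling are illustrative; dma_pool_create() presumably remains available as a wrapper passing NUMA_NO_NODE, though that hunk is not shown here):

        /* Sketch only: keep the pool's bookkeeping on the device's node. */
        struct dma_pool *pool;
        dma_addr_t handle;
        void *vaddr;

        pool = dma_pool_create_node("ring-desc", dev, 256, 64, 0,
                                    dev_to_node(dev));
        if (!pool)
                return -ENOMEM;

        vaddr = dma_pool_alloc(pool, GFP_KERNEL, &handle);
        if (!vaddr) {
                dma_pool_destroy(pool);
                return -ENOMEM;
        }

        /* ... hand 'handle' to the device, use 'vaddr' from the CPU ... */

        dma_pool_free(pool, vaddr, handle);
        dma_pool_destroy(pool);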
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index ce06b2884789..ff35b84a7b50 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -245,7 +245,10 @@ early_memremap_prot(resource_size_t phys_addr, unsigned long size,
#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
-void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
+/*
+ * Return 0 on success, or -ENOMEM if early_memremap() cannot find a
+ * free slot to map the source range.
+ */
+int __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
{
unsigned long slop, clen;
char *p;
@@ -256,12 +259,15 @@ void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
if (clen > MAX_MAP_CHUNK - slop)
clen = MAX_MAP_CHUNK - slop;
p = early_memremap(src & PAGE_MASK, clen + slop);
+ if (!p)
+ return -ENOMEM;
memcpy(dest, p + slop, clen);
early_memunmap(p, clen + slop);
dest += clen;
src += clen;
size -= clen;
}
+ return 0;
}
#else /* CONFIG_MMU */
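Since copy_from_early_mem() now reports failure instead of returning void, callers are expected to check the result; a minimal hedged sketch (dest, src_phys and len are placeholders):

        int err;

        err = copy_from_early_mem(dest, src_phys, len);
        if (err)        /* -ENOMEM: early_memremap() found no free slot */
                return err;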
diff --git a/mm/execmem.c b/mm/execmem.c
index 317b6a8d35be..0822305413ec 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -93,8 +93,15 @@ struct execmem_cache {
struct mutex mutex;
struct maple_tree busy_areas;
struct maple_tree free_areas;
+ unsigned int pending_free_cnt; /* protected by mutex */
};
+/* delay to schedule asynchronous free if fast path free fails */
+#define FREE_DELAY (msecs_to_jiffies(10))
+
+/* mark entries in busy_areas that should be freed asynchronously */
+#define PENDING_FREE_MASK (1 << (PAGE_SHIFT - 1))
+
static struct execmem_cache execmem_cache = {
.mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
.busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
@@ -130,6 +137,27 @@ err_restore:
return err;
}
+static int execmem_force_rw(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+ int ret;
+
+ ret = set_memory_nx(addr, nr);
+ if (ret)
+ return ret;
+
+ return set_memory_rw(addr, nr);
+}
+
+int execmem_restore_rox(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+
+ return set_memory_rox(addr, nr);
+}
+
static void execmem_cache_clean(struct work_struct *work)
{
struct maple_tree *free_areas = &execmem_cache.free_areas;
@@ -155,20 +183,17 @@ static void execmem_cache_clean(struct work_struct *work)
static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
-static int execmem_cache_add(void *ptr, size_t size)
+static int execmem_cache_add_locked(void *ptr, size_t size, gfp_t gfp_mask)
{
struct maple_tree *free_areas = &execmem_cache.free_areas;
- struct mutex *mutex = &execmem_cache.mutex;
unsigned long addr = (unsigned long)ptr;
MA_STATE(mas, free_areas, addr - 1, addr + 1);
unsigned long lower, upper;
void *area = NULL;
- int err;
lower = addr;
upper = addr + size - 1;
- mutex_lock(mutex);
area = mas_walk(&mas);
if (area && mas.last == addr - 1)
lower = mas.index;
@@ -178,12 +203,14 @@ static int execmem_cache_add(void *ptr, size_t size)
upper = mas.last;
mas_set_range(&mas, lower, upper);
- err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL);
- mutex_unlock(mutex);
- if (err)
- return err;
+ return mas_store_gfp(&mas, (void *)lower, gfp_mask);
+}
- return 0;
+static int execmem_cache_add(void *ptr, size_t size, gfp_t gfp_mask)
+{
+ guard(mutex)(&execmem_cache.mutex);
+
+ return execmem_cache_add_locked(ptr, size, gfp_mask);
}
static bool within_range(struct execmem_range *range, struct ma_state *mas,
@@ -257,7 +284,6 @@ out_unlock:
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
- unsigned long start, end;
struct vm_struct *vm;
size_t alloc_size;
int err = -ENOMEM;
@@ -265,6 +291,11 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
alloc_size = round_up(size, PMD_SIZE);
p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+ if (!p) {
+ alloc_size = size;
+ p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+ }
+
if (!p)
return err;
@@ -273,28 +304,20 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
goto err_free_mem;
/* fill memory with instructions that will trap */
- execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
-
- start = (unsigned long)p;
- end = start + alloc_size;
+ execmem_fill_trapping_insns(p, alloc_size);
- vunmap_range(start, end);
-
- err = execmem_set_direct_map_valid(vm, false);
+ err = set_memory_rox((unsigned long)p, vm->nr_pages);
if (err)
goto err_free_mem;
- err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
- PMD_SHIFT);
+ err = execmem_cache_add(p, alloc_size, GFP_KERNEL);
if (err)
- goto err_free_mem;
-
- err = execmem_cache_add(p, alloc_size);
- if (err)
- goto err_free_mem;
+ goto err_reset_direct_map;
return 0;
+err_reset_direct_map:
+ execmem_set_direct_map_valid(vm, true);
err_free_mem:
vfree(p);
return err;
@@ -316,35 +339,117 @@ static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
return __execmem_cache_alloc(range, size);
}
+static inline bool is_pending_free(void *ptr)
+{
+ return ((unsigned long)ptr & PENDING_FREE_MASK);
+}
+
+static inline void *pending_free_set(void *ptr)
+{
+ return (void *)((unsigned long)ptr | PENDING_FREE_MASK);
+}
+
+static inline void *pending_free_clear(void *ptr)
+{
+ return (void *)((unsigned long)ptr & ~PENDING_FREE_MASK);
+}
+
+static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask)
+{
+ size_t size = mas_range_len(mas);
+ int err;
+
+ err = execmem_force_rw(ptr, size);
+ if (err)
+ return err;
+
+ execmem_fill_trapping_insns(ptr, size);
+ execmem_restore_rox(ptr, size);
+
+ err = execmem_cache_add_locked(ptr, size, gfp_mask);
+ if (err)
+ return err;
+
+ mas_store_gfp(mas, NULL, gfp_mask);
+ return 0;
+}
+
+static void execmem_cache_free_slow(struct work_struct *work);
+static DECLARE_DELAYED_WORK(execmem_cache_free_work, execmem_cache_free_slow);
+
+static void execmem_cache_free_slow(struct work_struct *work)
+{
+ struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+ MA_STATE(mas, busy_areas, 0, ULONG_MAX);
+ void *area;
+
+ guard(mutex)(&execmem_cache.mutex);
+
+ if (!execmem_cache.pending_free_cnt)
+ return;
+
+ mas_for_each(&mas, area, ULONG_MAX) {
+ if (!is_pending_free(area))
+ continue;
+
+ area = pending_free_clear(area);
+ if (__execmem_cache_free(&mas, area, GFP_KERNEL))
+ continue;
+
+ execmem_cache.pending_free_cnt--;
+ }
+
+ if (execmem_cache.pending_free_cnt)
+ schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
+ else
+ schedule_work(&execmem_cache_clean_work);
+}
+
static bool execmem_cache_free(void *ptr)
{
struct maple_tree *busy_areas = &execmem_cache.busy_areas;
- struct mutex *mutex = &execmem_cache.mutex;
unsigned long addr = (unsigned long)ptr;
MA_STATE(mas, busy_areas, addr, addr);
- size_t size;
void *area;
+ int err;
+
+ guard(mutex)(&execmem_cache.mutex);
- mutex_lock(mutex);
area = mas_walk(&mas);
- if (!area) {
- mutex_unlock(mutex);
+ if (!area)
return false;
- }
- size = mas_range_len(&mas);
-
- mas_store_gfp(&mas, NULL, GFP_KERNEL);
- mutex_unlock(mutex);
-
- execmem_fill_trapping_insns(ptr, size, /* writable = */ false);
- execmem_cache_add(ptr, size);
+ err = __execmem_cache_free(&mas, area, GFP_KERNEL | __GFP_NORETRY);
+ if (err) {
+ /*
+ * mas points to the exact slot we got the area from; nothing
+ * else can modify the tree because of the mutex, so there
+ * won't be any allocations in mas_store_gfp() and it will just
+ * change the pointer.
+ */
+ area = pending_free_set(area);
+ mas_store_gfp(&mas, area, GFP_KERNEL);
+ execmem_cache.pending_free_cnt++;
+ schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
+ return true;
+ }
schedule_work(&execmem_cache_clean_work);
return true;
}
+
#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
+/*
+ * When the ROX cache is not used, the permissions that architectures define
+ * for execmem ranges updated before use (e.g. EXECMEM_MODULE_TEXT) must
+ * already be writable, so there is nothing to enforce here.
+ */
+static inline int execmem_force_rw(void *ptr, size_t size)
+{
+ return 0;
+}
+
static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
return NULL;
@@ -362,7 +467,9 @@ void *execmem_alloc(enum execmem_type type, size_t size)
bool use_cache = range->flags & EXECMEM_ROX_CACHE;
unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
pgprot_t pgprot = range->pgprot;
- void *p;
+ void *p = NULL;
+
+ size = PAGE_ALIGN(size);
if (use_cache)
p = execmem_cache_alloc(range, size);
@@ -372,6 +479,21 @@ void *execmem_alloc(enum execmem_type type, size_t size)
return kasan_reset_tag(p);
}
+void *execmem_alloc_rw(enum execmem_type type, size_t size)
+{
+ void *p __free(execmem) = execmem_alloc(type, size);
+ int err;
+
+ if (!p)
+ return NULL;
+
+ err = execmem_force_rw(p, size);
+ if (err)
+ return NULL;
+
+ return no_free_ptr(p);
+}
+
void execmem_free(void *ptr)
{
/*
@@ -384,11 +506,6 @@ void execmem_free(void *ptr)
vfree(ptr);
}
-void *execmem_update_copy(void *dst, const void *src, size_t size)
-{
- return text_poke_copy(dst, src, size);
-}
-
bool execmem_is_rox(enum execmem_type type)
{
return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE);
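A hedged sketch of the intended allocation flow with the reworked API (EXECMEM_MODULE_TEXT and the instruction buffer are illustrative; without CONFIG_ARCH_HAS_EXECMEM_ROX the force-RW step is a no-op and execmem_restore_rox() is presumably stubbed out elsewhere):

        /* Sketch only: allocate writable, write the code, seal it back to ROX. */
        void *text;
        int err;

        text = execmem_alloc_rw(EXECMEM_MODULE_TEXT, text_size);
        if (!text)
                return -ENOMEM;

        memcpy(text, prepared_insns, text_size);        /* placeholder payload */

        err = execmem_restore_rox(text, text_size);
        if (err) {
                execmem_free(text);
                return err;
        }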
diff --git a/mm/filemap.c b/mm/filemap.c
index 440922a7d8f1..bef514b60e0b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -47,6 +47,7 @@
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
+#include <linux/sysctl.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -141,7 +142,7 @@ static void page_cache_delete(struct address_space *mapping,
xas_init_marks(&xas);
folio->mapping = NULL;
- /* Leave page->index set: truncation lookup relies upon it */
+ /* Leave folio->index set: truncation lookup relies upon it */
mapping->nrpages -= nr;
}
@@ -226,15 +227,12 @@ void __filemap_remove_folio(struct folio *folio, void *shadow)
void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
void (*free_folio)(struct folio *);
- int refs = 1;
free_folio = mapping->a_ops->free_folio;
if (free_folio)
free_folio(folio);
- if (folio_test_large(folio))
- refs = folio_nr_pages(folio);
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, folio_nr_pages(folio));
}
/**
@@ -441,6 +439,24 @@ int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
EXPORT_SYMBOL(filemap_fdatawrite_range);
/**
+ * filemap_fdatawrite_range_kick - start writeback on a range
+ * @mapping: target address_space
+ * @start: index to start writeback on
+ * @end: last (inclusive) index for writeback
+ *
+ * This is a non-integrity writeback helper, to start writing back folios
+ * for the indicated range.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
+ loff_t end)
+{
+ return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE);
+}
+EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);
+
+/**
* filemap_flush - mostly a non-blocking flush
* @mapping: target address_space
*
@@ -841,11 +857,10 @@ EXPORT_SYMBOL_GPL(replace_page_cache_folio);
noinline int __filemap_add_folio(struct address_space *mapping,
struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
- XA_STATE(xas, &mapping->i_pages, index);
- void *alloced_shadow = NULL;
- int alloced_order = 0;
+ XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
bool huge;
long nr;
+ unsigned int forder = folio_order(folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
@@ -854,7 +869,6 @@ noinline int __filemap_add_folio(struct address_space *mapping,
mapping_set_update(&xas, mapping);
VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
- xas_set_order(&xas, index, folio_order(folio));
huge = folio_test_hugetlb(folio);
nr = folio_nr_pages(folio);
@@ -864,7 +878,7 @@ noinline int __filemap_add_folio(struct address_space *mapping,
folio->index = xas.xa_index;
for (;;) {
- int order = -1, split_order = 0;
+ int order = -1;
void *entry, *old = NULL;
xas_lock_irq(&xas);
@@ -882,21 +896,25 @@ noinline int __filemap_add_folio(struct address_space *mapping,
order = xas_get_order(&xas);
}
- /* entry may have changed before we re-acquire the lock */
- if (alloced_order && (old != alloced_shadow || order != alloced_order)) {
- xas_destroy(&xas);
- alloced_order = 0;
- }
-
if (old) {
- if (order > 0 && order > folio_order(folio)) {
+ if (order > 0 && order > forder) {
+ unsigned int split_order = max(forder,
+ xas_try_split_min_order(order));
+
/* How to handle large swap entries? */
BUG_ON(shmem_mapping(mapping));
- if (!alloced_order) {
- split_order = order;
- goto unlock;
+
+ while (order > forder) {
+ xas_set_order(&xas, index, split_order);
+ xas_try_split(&xas, old, order);
+ if (xas_error(&xas))
+ goto unlock;
+ order = split_order;
+ split_order =
+ max(xas_try_split_min_order(
+ split_order),
+ forder);
}
- xas_split(&xas, old, order);
xas_reset(&xas);
}
if (shadowp)
@@ -920,17 +938,6 @@ noinline int __filemap_add_folio(struct address_space *mapping,
unlock:
xas_unlock_irq(&xas);
- /* split needed, alloc here and retry. */
- if (split_order) {
- xas_split_alloc(&xas, old, split_order, gfp);
- if (xas_error(&xas))
- goto error;
- alloced_shadow = old;
- alloced_order = split_order;
- xas_reset(&xas);
- continue;
- }
-
if (!xas_nomem(&xas, gfp))
break;
}
@@ -942,7 +949,7 @@ unlock:
return 0;
error:
folio->mapping = NULL;
- /* Leave page->index set: truncation relies upon it */
+ /* Leave folio->index set: truncation relies upon it */
folio_put_refs(folio, nr);
return xas_error(&xas);
}
@@ -1059,6 +1066,19 @@ static wait_queue_head_t *folio_waitqueue(struct folio *folio)
return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
}
+/* How many times do we accept lock stealing from under a waiter? */
+static int sysctl_page_lock_unfairness = 5;
+static const struct ctl_table filemap_sysctl_table[] = {
+ {
+ .procname = "page_lock_unfairness",
+ .data = &sysctl_page_lock_unfairness,
+ .maxlen = sizeof(sysctl_page_lock_unfairness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ }
+};
+
void __init pagecache_init(void)
{
int i;
@@ -1067,6 +1087,7 @@ void __init pagecache_init(void)
init_waitqueue_head(&folio_wait_table[i]);
page_writeback_init();
+ register_sysctl_init("vm", filemap_sysctl_table);
}
/*
@@ -1214,9 +1235,6 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
return true;
}
-/* How many times do we accept lock stealing from under a waiter? */
-int sysctl_page_lock_unfairness = 5;
-
static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
int state, enum behavior behavior)
{
@@ -1360,7 +1378,7 @@ repeat:
* @ptl: already locked ptl. This function will drop the lock.
*
* Wait for a migration entry referencing the given page to be removed. This is
- * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
+ * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except
* this can be called without taking a reference on the page. Instead this
* should be called while holding the ptl for the migration entry referencing
* the page.
@@ -1464,25 +1482,6 @@ static int folio_put_wait_locked(struct folio *folio, int state)
}
/**
- * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
- * @folio: Folio defining the wait queue of interest
- * @waiter: Waiter to add to the queue
- *
- * Add an arbitrary @waiter to the wait queue for the nominated @folio.
- */
-void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
-{
- wait_queue_head_t *q = folio_waitqueue(folio);
- unsigned long flags;
-
- spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue_entry_tail(q, waiter);
- folio_set_waiters(folio);
- spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL_GPL(folio_add_wait_queue);
-
-/**
* folio_unlock - Unlock a locked folio.
* @folio: The folio.
*
@@ -1590,6 +1589,43 @@ int folio_wait_private_2_killable(struct folio *folio)
}
EXPORT_SYMBOL(folio_wait_private_2_killable);
+static void filemap_end_dropbehind(struct folio *folio)
+{
+ struct address_space *mapping = folio->mapping;
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return;
+ if (!folio_test_clear_dropbehind(folio))
+ return;
+ if (mapping)
+ folio_unmap_invalidate(mapping, folio, 0);
+}
+
+/*
+ * If the folio was marked as dropbehind, then its pages should be dropped when
+ * writeback completes. Do that now. If we fail, it's likely because of a big
+ * folio; in that case just reset dropbehind and let later completions invalidate.
+ */
+static void filemap_end_dropbehind_write(struct folio *folio)
+{
+ if (!folio_test_dropbehind(folio))
+ return;
+
+ /*
+ * Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
+ * but can happen if normal writeback just happens to find dirty folios
+ * that were created as part of uncached writeback, and that writeback
+ * would otherwise not need non-IRQ handling. Just skip the
+ * invalidation in that case.
+ */
+ if (in_task() && folio_trylock(folio)) {
+ filemap_end_dropbehind(folio);
+ folio_unlock(folio);
+ }
+}
+
/**
* folio_end_writeback - End writeback against a folio.
* @folio: The folio.
@@ -1623,6 +1659,8 @@ void folio_end_writeback(struct folio *folio)
folio_get(folio);
if (__folio_end_writeback(folio))
folio_wake_bit(folio, PG_writeback);
+
+ filemap_end_dropbehind_write(folio);
acct_reclaim_writeback(folio);
folio_put(folio);
}
@@ -1740,8 +1778,9 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
{
XA_STATE(xas, &mapping->i_pages, index);
+ unsigned long nr = max_scan;
- while (max_scan--) {
+ while (nr--) {
void *entry = xas_next(&xas);
if (!entry || xa_is_value(entry))
return xas.xa_index;
@@ -1946,6 +1985,8 @@ no_page:
/* Init accessed so avoid atomic mark_page_accessed later */
if (fgp_flags & FGP_ACCESSED)
__folio_set_referenced(folio);
+ if (fgp_flags & FGP_DONTCACHE)
+ __folio_set_dropbehind(folio);
err = filemap_add_folio(mapping, folio, index, gfp);
if (!err)
@@ -1956,8 +1997,19 @@ no_page:
if (err == -EEXIST)
goto repeat;
- if (err)
+ if (err) {
+ /*
+ * When NOWAIT I/O fails to allocate folios this could
+ * be due to a nonblocking memory allocation and not
+ * because the system actually is out of memory.
+ * Return -EAGAIN so that the caller retries in a
+ * blocking fashion instead of propagating -ENOMEM
+ * to the application.
+ */
+ if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM)
+ err = -EAGAIN;
return ERR_PTR(err);
+ }
/*
* filemap_add_folio locks the page, and for mmap
* we expect an unlocked page.
@@ -1968,6 +2020,9 @@ no_page:
if (!folio)
return ERR_PTR(-ENOENT);
+ /* Not a dropbehind (uncached) lookup: clear dropbehind if it was set. */
+ if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))
+ folio_clear_dropbehind(folio);
return folio;
}
EXPORT_SYMBOL(__filemap_get_folio);
@@ -2201,6 +2256,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
*start = folio->index + nr;
goto out;
}
+ xas_advance(&xas, folio_next_index(folio) - 1);
continue;
put_folio:
folio_put(folio);
@@ -2450,18 +2506,22 @@ unlock_mapping:
return error;
}
-static int filemap_create_folio(struct file *file,
- struct address_space *mapping, loff_t pos,
- struct folio_batch *fbatch)
+static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch)
{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
struct folio *folio;
int error;
unsigned int min_order = mapping_min_folio_order(mapping);
pgoff_t index;
+ if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
+ return -EAGAIN;
+
folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order);
if (!folio)
return -ENOMEM;
+ if (iocb->ki_flags & IOCB_DONTCACHE)
+ __folio_set_dropbehind(folio);
/*
* Protect against truncate / hole punch. Grabbing invalidate_lock
@@ -2477,7 +2537,7 @@ static int filemap_create_folio(struct file *file,
* well to keep locking rules simple.
*/
filemap_invalidate_lock_shared(mapping);
- index = (pos >> (PAGE_SHIFT + min_order)) << min_order;
+ index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order;
error = filemap_add_folio(mapping, folio, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error == -EEXIST)
@@ -2485,7 +2545,8 @@ static int filemap_create_folio(struct file *file,
if (error)
goto error;
- error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
+ error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
+ folio);
if (error)
goto error;
@@ -2506,6 +2567,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file,
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
+ if (iocb->ki_flags & IOCB_DONTCACHE)
+ ractl.dropbehind = 1;
page_cache_async_ra(&ractl, folio, last_index - folio->index);
return 0;
}
@@ -2515,7 +2578,6 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count,
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
- struct file_ra_state *ra = &filp->f_ra;
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
pgoff_t last_index;
struct folio *folio;
@@ -2530,20 +2592,21 @@ retry:
filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
if (!folio_batch_count(fbatch)) {
+ DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index);
+
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
if (iocb->ki_flags & IOCB_NOWAIT)
flags = memalloc_noio_save();
- page_cache_sync_readahead(mapping, ra, filp, index,
- last_index - index);
+ if (iocb->ki_flags & IOCB_DONTCACHE)
+ ractl.dropbehind = 1;
+ page_cache_sync_ra(&ractl, last_index - index);
if (iocb->ki_flags & IOCB_NOWAIT)
memalloc_noio_restore(flags);
filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
}
if (!folio_batch_count(fbatch)) {
- if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
- return -EAGAIN;
- err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch);
+ err = filemap_create_folio(iocb, fbatch);
if (err == AOP_TRUNCATED_PAGE)
goto retry;
return err;
@@ -2584,6 +2647,18 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
return (pos1 >> shift == pos2 >> shift);
}
+static void filemap_end_dropbehind_read(struct folio *folio)
+{
+ if (!folio_test_dropbehind(folio))
+ return;
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return;
+ if (folio_trylock(folio)) {
+ filemap_end_dropbehind(folio);
+ folio_unlock(folio);
+ }
+}
+
/**
* filemap_read - Read data from the page cache.
* @iocb: The iocb to read.
@@ -2697,8 +2772,12 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
}
}
put_folios:
- for (i = 0; i < folio_batch_count(&fbatch); i++)
- folio_put(fbatch.folios[i]);
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+
+ filemap_end_dropbehind_read(folio);
+ folio_put(folio);
+ }
folio_batch_init(&fbatch);
} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
@@ -2839,8 +2918,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
size = min(size, folio_size(folio) - offset);
offset %= PAGE_SIZE;
- while (spliced < size &&
- !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ while (spliced < size && !pipe_is_full(pipe)) {
struct pipe_buffer *buf = pipe_head_buf(pipe);
size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
@@ -2897,7 +2975,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
iocb.ki_pos = *ppos;
/* Work out how much data we can actually add into the pipe */
- used = pipe_occupancy(pipe->head, pipe->tail);
+ used = pipe_buf_usage(pipe);
npages = max_t(ssize_t, pipe->max_usage - used, 0);
len = min_t(size_t, len, npages * PAGE_SIZE);
@@ -2957,7 +3035,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
total_spliced += n;
*ppos += n;
in->f_ra.prev_pos = *ppos;
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (pipe_is_full(pipe))
goto out;
}
@@ -3138,8 +3216,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct address_space *mapping = file->f_mapping;
DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
struct file *fpin = NULL;
- unsigned long vm_flags = vmf->vma->vm_flags;
- unsigned int mmap_miss;
+ vm_flags_t vm_flags = vmf->vma->vm_flags;
+ unsigned short mmap_miss;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Use the readahead code, even if readahead is disabled */
@@ -3154,13 +3232,17 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
if (!(vm_flags & VM_RAND_READ))
ra->size *= 2;
ra->async_size = HPAGE_PMD_NR;
- page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
+ ra->order = HPAGE_PMD_ORDER;
+ page_cache_ra_order(&ractl, ra);
return fpin;
}
#endif
- /* If we don't want any read-ahead, don't bother */
- if (vm_flags & VM_RAND_READ)
+ /*
+ * If we don't want any read-ahead, don't bother. The VM_EXEC case below
+ * is already intended for random access.
+ */
+ if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ)
return fpin;
if (!ra->ra_pages)
return fpin;
@@ -3183,15 +3265,43 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
if (mmap_miss > MMAP_LOTSAMISS)
return fpin;
- /*
- * mmap read-around
- */
+ if (vm_flags & VM_EXEC) {
+ /*
+ * Allow arch to request a preferred minimum folio order for
+ * executable memory. This can often be beneficial to
+ * performance if (e.g.) arm64 can contpte-map the folio.
+ * Executable memory rarely benefits from readahead, due to its
+ * random access nature, so set async_size to 0.
+ *
+ * Limit to the boundaries of the VMA to avoid reading in any
+ * pad that might exist between sections, which would be a waste
+ * of memory.
+ */
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long start = vma->vm_pgoff;
+ unsigned long end = start + vma_pages(vma);
+ unsigned long ra_end;
+
+ ra->order = exec_folio_order();
+ ra->start = round_down(vmf->pgoff, 1UL << ra->order);
+ ra->start = max(ra->start, start);
+ ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order);
+ ra_end = min(ra_end, end);
+ ra->size = ra_end - ra->start;
+ ra->async_size = 0;
+ } else {
+ /*
+ * mmap read-around
+ */
+ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
+ ra->size = ra->ra_pages;
+ ra->async_size = ra->ra_pages / 4;
+ ra->order = 0;
+ }
+
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
- ra->size = ra->ra_pages;
- ra->async_size = ra->ra_pages / 4;
ractl._index = ra->start;
- page_cache_ra_order(&ractl, ra, 0);
+ page_cache_ra_order(&ractl, ra);
return fpin;
}
@@ -3207,7 +3317,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct file_ra_state *ra = &file->f_ra;
DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
struct file *fpin = NULL;
- unsigned int mmap_miss;
+ unsigned short mmap_miss;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
@@ -3465,7 +3575,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
struct page *page = folio_file_page(folio, start);
- vm_fault_t ret = do_set_pmd(vmf, page);
+ vm_fault_t ret = do_set_pmd(vmf, folio, page);
if (!ret) {
/* The page is mapped successfully, reference consumed. */
folio_unlock(folio);
@@ -3527,7 +3637,7 @@ skip:
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
struct folio *folio, unsigned long start,
unsigned long addr, unsigned int nr_pages,
- unsigned long *rss, unsigned int *mmap_miss)
+ unsigned long *rss, unsigned short *mmap_miss)
{
vm_fault_t ret = 0;
struct page *page = folio_page(folio, start);
@@ -3589,7 +3699,7 @@ skip:
static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
struct folio *folio, unsigned long addr,
- unsigned long *rss, unsigned int *mmap_miss)
+ unsigned long *rss, unsigned short *mmap_miss)
{
vm_fault_t ret = 0;
struct page *page = &folio->page;
@@ -3631,14 +3741,29 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
struct folio *folio;
vm_fault_t ret = 0;
unsigned long rss = 0;
- unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type;
+ unsigned int nr_pages = 0, folio_type;
+ unsigned short mmap_miss = 0, mmap_miss_saved;
+ bool can_map_large;
rcu_read_lock();
folio = next_uptodate_folio(&xas, mapping, end_pgoff);
if (!folio)
goto out;
- if (filemap_map_pmd(vmf, folio, start_pgoff)) {
+ file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
+ end_pgoff = min(end_pgoff, file_end);
+
+ /*
+ * Do not allow mapping PTEs beyond i_size, or a PMD across i_size,
+ * in order to preserve SIGBUS semantics.
+ *
+ * Make an exception for shmem/tmpfs, which has long intentionally
+ * mapped PMDs across i_size.
+ */
+ can_map_large = shmem_mapping(mapping) ||
+ file_end >= folio_next_index(folio);
+
+ if (can_map_large && filemap_map_pmd(vmf, folio, start_pgoff)) {
ret = VM_FAULT_NOPAGE;
goto out;
}
@@ -3651,10 +3776,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
goto out;
}
- file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
- if (end_pgoff > file_end)
- end_pgoff = file_end;
-
folio_type = mm_counter_file(folio);
do {
unsigned long end;
@@ -3737,6 +3858,18 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
+int generic_file_mmap_prepare(struct vm_area_desc *desc)
+{
+ struct file *file = desc->file;
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->read_folio)
+ return -ENOEXEC;
+ file_accessed(file);
+ desc->vm_ops = &generic_file_vm_ops;
+ return 0;
+}
+
/*
* This is for filesystems which do not implement ->writepage.
*/
@@ -3746,6 +3879,13 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
return generic_file_mmap(file, vma);
}
+
+int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
+{
+ if (is_shared_maywrite(desc->vm_flags))
+ return -EINVAL;
+ return generic_file_mmap_prepare(desc);
+}
#else
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
@@ -3755,15 +3895,25 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
return -ENOSYS;
}
+int generic_file_mmap_prepare(struct vm_area_desc *desc)
+{
+ return -ENOSYS;
+}
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
return -ENOSYS;
}
+int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
+{
+ return -ENOSYS;
+}
#endif /* CONFIG_MMU */
EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
+EXPORT_SYMBOL(generic_file_mmap_prepare);
EXPORT_SYMBOL(generic_file_readonly_mmap);
+EXPORT_SYMBOL(generic_file_readonly_mmap_prepare);
static struct folio *do_read_cache_folio(struct address_space *mapping,
pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
@@ -4027,23 +4177,12 @@ retry:
bytes = min(chunk - offset, bytes);
balance_dirty_pages_ratelimited(mapping);
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- */
- if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
- status = -EFAULT;
- break;
- }
-
if (fatal_signal_pending(current)) {
status = -EINTR;
break;
}
- status = a_ops->write_begin(file, mapping, pos, bytes,
+ status = a_ops->write_begin(iocb, mapping, pos, bytes,
&folio, &fsdata);
if (unlikely(status < 0))
break;
@@ -4055,10 +4194,16 @@ retry:
if (mapping_writably_mapped(mapping))
flush_dcache_folio(folio);
+ /*
+ * Faults here on mmap()s can recurse into arbitrary
+ * filesystem code. Lots of locks are held that can
+ * deadlock. Use an atomic copy to avoid deadlocking
+ * in page fault handling.
+ */
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
flush_dcache_folio(folio);
- status = a_ops->write_end(file, mapping, pos, bytes, copied,
+ status = a_ops->write_end(iocb, mapping, pos, bytes, copied,
folio, fsdata);
if (unlikely(status != copied)) {
iov_iter_revert(i, copied - max(status, 0L));
@@ -4080,6 +4225,16 @@ retry:
bytes = copied;
goto retry;
}
+
+ /*
+ * 'folio' is now unlocked and faults on it can be
+ * handled. Ensure forward progress by trying to
+ * fault it in now.
+ */
+ if (fault_in_iov_iter_readable(i, bytes) == bytes) {
+ status = -EFAULT;
+ break;
+ }
} else {
pos += status;
written += status;
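A hedged kernel-side illustration of the new dropbehind plumbing (mapping and index are placeholders; userspace reaches the same behaviour through RWF_DONTCACHE reads and writes, which are translated to IOCB_DONTCACHE on the kiocb):

        /* Sketch only: look up or create a folio that is dropped after use. */
        struct folio *folio;

        folio = __filemap_get_folio(mapping, index,
                                    FGP_LOCK | FGP_CREAT | FGP_DONTCACHE,
                                    mapping_gfp_mask(mapping));
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        /* ... read or modify the folio contents ... */

        folio_unlock(folio);
        folio_put(folio);

Note also the FGP_NOWAIT change above: an -ENOMEM from folio allocation is now reported as -EAGAIN, so nonblocking callers retry in a blocking context rather than surfacing -ENOMEM to the application.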
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 1d1832e2a599..45540942d148 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -28,12 +28,6 @@ void wait_on_page_writeback(struct page *page)
}
EXPORT_SYMBOL_GPL(wait_on_page_writeback);
-void wait_for_stable_page(struct page *page)
-{
- return folio_wait_stable(page_folio(page));
-}
-EXPORT_SYMBOL_GPL(wait_for_stable_page);
-
void mark_page_accessed(struct page *page)
{
folio_mark_accessed(page_folio(page));
@@ -90,11 +84,3 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
return folio_file_page(folio, index);
}
EXPORT_SYMBOL(pagecache_get_page);
-
-struct page *grab_cache_page_write_begin(struct address_space *mapping,
- pgoff_t index)
-{
- return pagecache_get_page(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
-}
-EXPORT_SYMBOL(grab_cache_page_write_begin);
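With these compat wrappers gone, any remaining callers are expected to switch to the folio APIs the wrappers were forwarding to; a hedged sketch of the replacements:

        /* was: wait_for_stable_page(page); */
        folio_wait_stable(page_folio(page));

        /*
         * was: page = grab_cache_page_write_begin(mapping, index);
         * note that the replacement returns a folio, not a page.
         */
        folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
                                    mapping_gfp_mask(mapping));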
diff --git a/mm/gup.c b/mm/gup.c
index 569a4d82012d..0bc4d140fc07 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -26,6 +26,7 @@
#include <asm/tlbflush.h>
#include "internal.h"
+#include "swap.h"
struct follow_page_context {
struct dev_pagemap *pgmap;
@@ -63,11 +64,11 @@ static inline void sanity_check_pinned_pages(struct page **pages,
!folio_test_anon(folio))
continue;
if (!folio_test_large(folio) || folio_test_hugetlb(folio))
- VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
+ VM_WARN_ON_ONCE_FOLIO(!PageAnonExclusive(&folio->page), folio);
else
/* Either a PTE-mapped or a PMD-mapped THP. */
- VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
- !PageAnonExclusive(page), page);
+ VM_WARN_ON_ONCE_PAGE(!PageAnonExclusive(&folio->page) &&
+ !PageAnonExclusive(page), page);
}
}
@@ -96,8 +97,7 @@ retry:
* belongs to this folio.
*/
if (unlikely(page_folio(page) != folio)) {
- if (!put_devmap_managed_folio_refs(folio, refs))
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, refs);
goto retry;
}
@@ -110,14 +110,13 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
if (is_zero_folio(folio))
return;
node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
- if (folio_test_large(folio))
+ if (folio_has_pincount(folio))
atomic_sub(refs, &folio->_pincount);
else
refs *= GUP_PIN_COUNTING_BIAS;
}
- if (!put_devmap_managed_folio_refs(folio, refs))
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, refs);
}
/**
@@ -166,7 +165,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs,
* Increment the normal page refcount field at least once,
* so that the page really is pinned.
*/
- if (folio_test_large(folio)) {
+ if (folio_has_pincount(folio)) {
folio_ref_add(folio, refs);
atomic_add(refs, &folio->_pincount);
} else {
@@ -225,7 +224,7 @@ void folio_add_pin(struct folio *folio)
* page refcount field at least once, so that the page really is
* pinned.
*/
- if (folio_test_large(folio)) {
+ if (folio_has_pincount(folio)) {
WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1);
folio_ref_inc(folio);
atomic_inc(&folio->_pincount);
@@ -565,8 +564,7 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs,
*/
if (unlikely((flags & FOLL_LONGTERM) &&
!folio_is_longterm_pinnable(folio))) {
- if (!put_devmap_managed_folio_refs(folio, refs))
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, refs);
return NULL;
}
@@ -578,7 +576,7 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs,
* is pinned. That's why the refcount from the earlier
* try_get_folio() is left intact.
*/
- if (folio_test_large(folio))
+ if (folio_has_pincount(folio))
atomic_add(refs, &folio->_pincount);
else
folio_ref_add(folio,
@@ -596,6 +594,33 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs,
}
#endif /* CONFIG_HAVE_GUP_FAST */
+/* Common code for can_follow_write_* */
+static inline bool can_follow_write_common(struct page *page,
+ struct vm_area_struct *vma, unsigned int flags)
+{
+ /* Maybe FOLL_FORCE is set to override it? */
+ if (!(flags & FOLL_FORCE))
+ return false;
+
+ /* But FOLL_FORCE has no effect on shared mappings */
+ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+ return false;
+
+ /* ... or read-only private ones */
+ if (!(vma->vm_flags & VM_MAYWRITE))
+ return false;
+
+ /* ... or already writable ones that just need to take a write fault */
+ if (vma->vm_flags & VM_WRITE)
+ return false;
+
+ /*
+ * See can_change_pte_writable(): we broke COW and could map the page
+ * writable if we have an exclusive anonymous page ...
+ */
+ return page && PageAnon(page) && PageAnonExclusive(page);
+}
+
static struct page *no_page_table(struct vm_area_struct *vma,
unsigned int flags, unsigned long address)
{
@@ -622,6 +647,18 @@ static struct page *no_page_table(struct vm_area_struct *vma,
}
#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
+/* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */
+static inline bool can_follow_write_pud(pud_t pud, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int flags)
+{
+ /* If the pud is writable, we can write to the page. */
+ if (pud_write(pud))
+ return true;
+
+ return can_follow_write_common(page, vma, flags);
+}
+
static struct page *follow_huge_pud(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp,
int flags, struct follow_page_context *ctx)
@@ -634,38 +671,17 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma,
assert_spin_locked(pud_lockptr(mm, pudp));
- if ((flags & FOLL_WRITE) && !pud_write(pud))
+ if (!pud_present(pud))
return NULL;
- if (!pud_present(pud))
+ if ((flags & FOLL_WRITE) &&
+ !can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags))
return NULL;
pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
-
- if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
- pud_devmap(pud)) {
- /*
- * device mapped pages can only be returned if the caller
- * will manage the page reference count.
- *
- * At least one of FOLL_GET | FOLL_PIN must be set, so
- * assert that here:
- */
- if (!(flags & (FOLL_GET | FOLL_PIN)))
- return ERR_PTR(-EEXIST);
-
- if (flags & FOLL_TOUCH)
- touch_pud(vma, addr, pudp, flags & FOLL_WRITE);
-
- ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap);
- if (!ctx->pgmap)
- return ERR_PTR(-EFAULT);
- }
-
page = pfn_to_page(pfn);
- if (!pud_devmap(pud) && !pud_write(pud) &&
- gup_must_unshare(vma, flags, page))
+ if (!pud_write(pud) && gup_must_unshare(vma, flags, page))
return ERR_PTR(-EMLINK);
ret = try_grab_folio(page_folio(page), 1, flags);
@@ -686,27 +702,7 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
if (pmd_write(pmd))
return true;
- /* Maybe FOLL_FORCE is set to override it? */
- if (!(flags & FOLL_FORCE))
- return false;
-
- /* But FOLL_FORCE has no effect on shared mappings */
- if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
- return false;
-
- /* ... or read-only private ones */
- if (!(vma->vm_flags & VM_MAYWRITE))
- return false;
-
- /* ... or already writable ones that just need to take a write fault */
- if (vma->vm_flags & VM_WRITE)
- return false;
-
- /*
- * See can_change_pte_writable(): we broke COW and could map the page
- * writable if we have an exclusive anonymous page ...
- */
- if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ if (!can_follow_write_common(page, vma, flags))
return false;
/* ... and a write-fault isn't required for other reasons. */
@@ -742,8 +738,8 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma,
if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page))
return ERR_PTR(-EMLINK);
- VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
- !PageAnonExclusive(page), page);
+ VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+ !PageAnonExclusive(page), page);
ret = try_grab_folio(page_folio(page), 1, flags);
if (ret)
@@ -807,27 +803,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page,
if (pte_write(pte))
return true;
- /* Maybe FOLL_FORCE is set to override it? */
- if (!(flags & FOLL_FORCE))
- return false;
-
- /* But FOLL_FORCE has no effect on shared mappings */
- if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
- return false;
-
- /* ... or read-only private ones */
- if (!(vma->vm_flags & VM_MAYWRITE))
- return false;
-
- /* ... or already writable ones that just need to take a write fault */
- if (vma->vm_flags & VM_WRITE)
- return false;
-
- /*
- * See can_change_pte_writable(): we broke COW and could map the page
- * writable if we have an exclusive anonymous page ...
- */
- if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ if (!can_follow_write_common(page, vma, flags))
return false;
/* ... and a write-fault isn't required for other reasons. */
@@ -847,11 +823,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
pte_t *ptep, pte;
int ret;
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
- (FOLL_PIN | FOLL_GET)))
- return ERR_PTR(-EINVAL);
-
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!ptep)
return no_page_table(vma, flags, address);
@@ -864,8 +835,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
page = vm_normal_page(vma, address, pte);
/*
- * We only care about anon pages in can_follow_write_pte() and don't
- * have to worry about pte_devmap() because they are never anon.
+ * We only care about anon pages in can_follow_write_pte().
*/
if ((flags & FOLL_WRITE) &&
!can_follow_write_pte(pte, page, vma, flags)) {
@@ -873,18 +843,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
goto out;
}
- if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
- /*
- * Only return device mapping pages in the FOLL_GET or FOLL_PIN
- * case since they are only valid while holding the pgmap
- * reference.
- */
- *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
- if (*pgmap)
- page = pte_page(pte);
- else
- goto no_page;
- } else if (unlikely(!page)) {
+ if (unlikely(!page)) {
if (flags & FOLL_DUMP) {
/* Avoid special (like zero) pages in core dumps */
page = ERR_PTR(-EFAULT);
@@ -906,8 +865,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
goto out;
}
- VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
- !PageAnonExclusive(page), page);
+ VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+ !PageAnonExclusive(page), page);
/* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */
ret = try_grab_folio(folio, 1, flags);
@@ -966,14 +925,6 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
return no_page_table(vma, flags, address);
if (!pmd_present(pmdval))
return no_page_table(vma, flags, address);
- if (pmd_devmap(pmdval)) {
- ptl = pmd_lock(mm, pmd);
- page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
- spin_unlock(ptl);
- if (page)
- return page;
- return no_page_table(vma, flags, address);
- }
if (likely(!pmd_leaf(pmdval)))
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
@@ -1109,10 +1060,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
/* user gate pages are read-only */
if (gup_flags & FOLL_WRITE)
return -EFAULT;
- if (address > TASK_SIZE)
- pgd = pgd_offset_k(address);
- else
- pgd = pgd_offset_gate(mm, address);
+ pgd = pgd_offset(mm, address);
if (pgd_none(*pgd))
return -EFAULT;
p4d = p4d_offset(pgd, address);
@@ -1190,7 +1138,7 @@ static int faultin_page(struct vm_area_struct *vma,
if (unshare) {
fault_flags |= FAULT_FLAG_UNSHARE;
/* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
- VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE);
+ VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_WRITE);
}
ret = handle_mm_fault(vma, address, fault_flags, NULL);
@@ -1283,6 +1231,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
return -EOPNOTSUPP;
+ if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma))
+ return -EOPNOTSUPP;
+
if (vma_is_secretmem(vma))
return -EFAULT;
@@ -1294,9 +1245,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
- /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
- if (is_vm_hugetlb_page(vma))
- return -EFAULT;
/*
* We used to let the write,force case do COW in a
* VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
@@ -1435,7 +1383,11 @@ static long __get_user_pages(struct mm_struct *mm,
start = untagged_addr_remote(mm, start);
- VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
+ VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET));
do {
struct page *page;
@@ -1766,10 +1718,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
}
/* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
- if (!*locked) {
- BUG_ON(ret < 0);
- BUG_ON(ret >= nr_pages);
- }
+ VM_WARN_ON_ONCE(!*locked && (ret < 0 || ret >= nr_pages));
if (ret > 0) {
nr_pages -= ret;
@@ -1814,7 +1763,6 @@ retry:
ret = mmap_read_lock_killable(mm);
if (ret) {
- BUG_ON(ret > 0);
if (!pages_done)
pages_done = ret;
break;
@@ -1825,11 +1773,11 @@ retry:
pages, locked);
if (!*locked) {
/* Continue to retry until we succeeded */
- BUG_ON(ret != 0);
+ VM_WARN_ON_ONCE(ret != 0);
goto retry;
}
if (ret != 1) {
- BUG_ON(ret > 1);
+ VM_WARN_ON_ONCE(ret > 1);
if (!pages_done)
pages_done = ret;
break;
@@ -1891,10 +1839,10 @@ long populate_vma_page_range(struct vm_area_struct *vma,
int gup_flags;
long ret;
- VM_BUG_ON(!PAGE_ALIGNED(start));
- VM_BUG_ON(!PAGE_ALIGNED(end));
- VM_BUG_ON_VMA(start < vma->vm_start, vma);
- VM_BUG_ON_VMA(end > vma->vm_end, vma);
+ VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
+ VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
+ VM_WARN_ON_ONCE_VMA(start < vma->vm_start, vma);
+ VM_WARN_ON_ONCE_VMA(end > vma->vm_end, vma);
mmap_assert_locked(mm);
/*
@@ -1963,8 +1911,8 @@ long faultin_page_range(struct mm_struct *mm, unsigned long start,
int gup_flags;
long ret;
- VM_BUG_ON(!PAGE_ALIGNED(start));
- VM_BUG_ON(!PAGE_ALIGNED(end));
+ VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
+ VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
mmap_assert_locked(mm);
/*
@@ -2054,7 +2002,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
{
struct vm_area_struct *vma;
bool must_unlock = false;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
long i;
if (!nr_pages)
@@ -2117,28 +2065,22 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
*/
size_t fault_in_writeable(char __user *uaddr, size_t size)
{
- char __user *start = uaddr, *end;
+ const unsigned long start = (unsigned long)uaddr;
+ const unsigned long end = start + size;
+ unsigned long cur;
if (unlikely(size == 0))
return 0;
if (!user_write_access_begin(uaddr, size))
return size;
- if (!PAGE_ALIGNED(uaddr)) {
- unsafe_put_user(0, uaddr, out);
- uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
- }
- end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
- if (unlikely(end < start))
- end = NULL;
- while (uaddr != end) {
- unsafe_put_user(0, uaddr, out);
- uaddr += PAGE_SIZE;
- }
+ /* Stop once we overflow to 0. */
+ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
+ unsafe_put_user(0, (char __user *)cur, out);
out:
user_write_access_end();
- if (size > uaddr - start)
- return size - (uaddr - start);
+ if (size > cur - start)
+ return size - (cur - start);
return 0;
}
EXPORT_SYMBOL(fault_in_writeable);
@@ -2192,26 +2134,24 @@ EXPORT_SYMBOL(fault_in_subpage_writeable);
*/
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
{
- unsigned long start = (unsigned long)uaddr, end;
+ const unsigned long start = (unsigned long)uaddr;
+ const unsigned long end = start + size;
+ unsigned long cur;
struct mm_struct *mm = current->mm;
bool unlocked = false;
if (unlikely(size == 0))
return 0;
- end = PAGE_ALIGN(start + size);
- if (end < start)
- end = 0;
mmap_read_lock(mm);
- do {
- if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
+ /* Stop once we overflow to 0. */
+ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
+ if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked))
break;
- start = (start + PAGE_SIZE) & PAGE_MASK;
- } while (start != end);
mmap_read_unlock(mm);
- if (size > (unsigned long)uaddr - start)
- return size - ((unsigned long)uaddr - start);
+ if (size > cur - start)
+ return size - (cur - start);
return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);
@@ -2226,30 +2166,24 @@ EXPORT_SYMBOL(fault_in_safe_writeable);
*/
size_t fault_in_readable(const char __user *uaddr, size_t size)
{
- const char __user *start = uaddr, *end;
+ const unsigned long start = (unsigned long)uaddr;
+ const unsigned long end = start + size;
+ unsigned long cur;
volatile char c;
if (unlikely(size == 0))
return 0;
if (!user_read_access_begin(uaddr, size))
return size;
- if (!PAGE_ALIGNED(uaddr)) {
- unsafe_get_user(c, uaddr, out);
- uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
- }
- end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
- if (unlikely(end < start))
- end = NULL;
- while (uaddr != end) {
- unsafe_get_user(c, uaddr, out);
- uaddr += PAGE_SIZE;
- }
+ /* Stop once we overflow to 0. */
+ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
+ unsafe_get_user(c, (const char __user *)cur, out);
out:
user_read_access_end();
(void)c;
- if (size > uaddr - start)
- return size - (uaddr - start);
+ if (size > cur - start)
+ return size - (cur - start);
return 0;
}
EXPORT_SYMBOL(fault_in_readable);
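The three fault_in_*() helpers above now share one loop shape: probe the unaligned start address first, then every following page boundary, and stop early if the probe address wraps past the top of the address space; the final "size > cur - start" test turns the stop point back into a count of unprobed bytes. A standalone sketch of just that arithmetic follows (not part of the patch): PAGE_SIZE and PAGE_ALIGN_DOWN() are local stand-ins for the kernel macros, and the addresses are invented.

/*
 * Sketch of the page-stepping loop used by fault_in_writeable(),
 * fault_in_safe_writeable() and fault_in_readable() above.
 */
#include <stdio.h>

#define PAGE_SIZE		4096UL
#define PAGE_ALIGN_DOWN(x)	((x) & ~(PAGE_SIZE - 1))

int main(void)
{
	const unsigned long start = 0x7ffff800UL;	/* unaligned start */
	const unsigned long size = 2 * PAGE_SIZE;	/* spans three pages */
	const unsigned long end = start + size;
	unsigned long cur;

	/* First probe hits the unaligned start, later ones page boundaries;
	 * a wrap of "cur" to 0 would end the walk. */
	for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
		printf("probe %#lx\n", cur);

	if (size > cur - start)
		printf("%lu bytes were not probed\n", size - (cur - start));
	else
		printf("whole range probed\n");
	return 0;
}

Running it prints three probes (0x7ffff800, 0x80000000, 0x80001000) and "whole range probed", matching one touched byte per page of the range.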
@@ -2257,6 +2191,7 @@ EXPORT_SYMBOL(fault_in_readable);
/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
+ * @locked: a pointer to an int denoting whether the mmap sem is held
*
* Returns struct page pointer of user page pinned for dump,
* to be freed afterwards by put_page().
@@ -2269,13 +2204,12 @@ EXPORT_SYMBOL(fault_in_readable);
* Called without mmap_lock (takes and releases the mmap_lock by itself).
*/
#ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
+struct page *get_dump_page(unsigned long addr, int *locked)
{
struct page *page;
- int locked = 0;
int ret;
- ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked,
+ ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked,
FOLL_FORCE | FOLL_DUMP | FOLL_GET);
return (ret == 1) ? page : NULL;
}
@@ -2320,38 +2254,70 @@ static void pofs_unpin(struct pages_or_folios *pofs)
unpin_user_pages(pofs->pages, pofs->nr_entries);
}
+static struct folio *pofs_next_folio(struct folio *folio,
+ struct pages_or_folios *pofs, long *index_ptr)
+{
+ long i = *index_ptr + 1;
+
+ if (!pofs->has_folios && folio_test_large(folio)) {
+ const unsigned long start_pfn = folio_pfn(folio);
+ const unsigned long end_pfn = start_pfn + folio_nr_pages(folio);
+
+ for (; i < pofs->nr_entries; i++) {
+ unsigned long pfn = page_to_pfn(pofs->pages[i]);
+
+ /* Is this page part of this folio? */
+ if (pfn < start_pfn || pfn >= end_pfn)
+ break;
+ }
+ }
+
+ if (unlikely(i == pofs->nr_entries))
+ return NULL;
+ *index_ptr = i;
+
+ return pofs_get_folio(pofs, i);
+}
+
/*
* Returns the number of collected folios. Return value is always >= 0.
*/
-static void collect_longterm_unpinnable_folios(
+static unsigned long collect_longterm_unpinnable_folios(
struct list_head *movable_folio_list,
struct pages_or_folios *pofs)
{
- struct folio *prev_folio = NULL;
- bool drain_allow = true;
- unsigned long i;
-
- for (i = 0; i < pofs->nr_entries; i++) {
- struct folio *folio = pofs_get_folio(pofs, i);
+ unsigned long collected = 0;
+ struct folio *folio;
+ int drained = 0;
+ long i = 0;
- if (folio == prev_folio)
- continue;
- prev_folio = folio;
+ for (folio = pofs_get_folio(pofs, i); folio;
+ folio = pofs_next_folio(folio, pofs, &i)) {
if (folio_is_longterm_pinnable(folio))
continue;
+ collected++;
+
if (folio_is_device_coherent(folio))
continue;
if (folio_test_hugetlb(folio)) {
- isolate_hugetlb(folio, movable_folio_list);
+ folio_isolate_hugetlb(folio, movable_folio_list);
continue;
}
- if (!folio_test_lru(folio) && drain_allow) {
+ if (drained == 0 && folio_may_be_lru_cached(folio) &&
+ folio_ref_count(folio) !=
+ folio_expected_ref_count(folio) + 1) {
+ lru_add_drain();
+ drained = 1;
+ }
+ if (drained == 1 && folio_may_be_lru_cached(folio) &&
+ folio_ref_count(folio) !=
+ folio_expected_ref_count(folio) + 1) {
lru_add_drain_all();
- drain_allow = false;
+ drained = 2;
}
if (!folio_isolate_lru(folio))
@@ -2362,6 +2328,8 @@ static void collect_longterm_unpinnable_folios(
NR_ISOLATED_ANON + folio_is_file_lru(folio),
folio_nr_pages(folio));
}
+
+ return collected;
}
/*
@@ -2438,9 +2406,11 @@ static long
check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
{
LIST_HEAD(movable_folio_list);
+ unsigned long collected;
- collect_longterm_unpinnable_folios(&movable_folio_list, pofs);
- if (list_empty(&movable_folio_list))
+ collected = collect_longterm_unpinnable_folios(&movable_folio_list,
+ pofs);
+ if (!collected)
return 0;
return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
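pofs_next_folio() above exists so that collect_longterm_unpinnable_folios() inspects each large folio once rather than once per constituent page: after handling a folio it advances the index past every following entry whose pfn falls inside that folio's pfn range. A standalone sketch of just that advance (not part of the patch); the pfn values are made up and a fixed 4-page, 4-pfn-aligned "folio" geometry stands in for folio_pfn()/folio_nr_pages().

#include <stdio.h>

static long next_entry(const unsigned long *pfns, long nr_entries, long i,
		       unsigned long start_pfn, unsigned long nr_pages)
{
	const unsigned long end_pfn = start_pfn + nr_pages;

	/* Skip every following entry that still falls inside this folio. */
	for (i++; i < nr_entries; i++)
		if (pfns[i] < start_pfn || pfns[i] >= end_pfn)
			break;
	return i < nr_entries ? i : -1;
}

int main(void)
{
	/* Entries 0-3 are subpages of one folio starting at pfn 0x100. */
	const unsigned long pfns[] = { 0x100, 0x101, 0x102, 0x103, 0x200, 0x300 };
	const long n = sizeof(pfns) / sizeof(pfns[0]);
	long i = 0;

	while (i >= 0) {
		printf("inspect entry %ld (pfn %#lx)\n", i, pfns[i]);
		i = next_entry(pfns, n, i, pfns[i] & ~0x3UL, 4);
	}
	return 0;
}

Only entries 0, 4 and 5 are inspected; entries 1-3 are skipped as tail pages of the first folio, which is the effect the kernel loop achieves without the prev_folio comparison it used before.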
@@ -2760,7 +2730,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
*
* *) ptes can be read atomically by the architecture.
*
- * *) access_ok is sufficient to validate userspace address ranges.
+ * *) valid user addresses are below TASK_SIZE_MAX
*
* The last two assumptions can be relaxed by the addition of helper functions.
*
@@ -2842,9 +2812,9 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
return false;
/* Anonymous folios pose no problem. */
- mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS;
+ mapping_flags = (unsigned long)mapping & FOLIO_MAPPING_FLAGS;
if (mapping_flags)
- return mapping_flags & PAGE_MAPPING_ANON;
+ return mapping_flags & FOLIO_MAPPING_ANON;
/*
* At this point, we know the mapping is non-null and points to an
@@ -2892,7 +2862,7 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
int *nr)
{
struct dev_pagemap *pgmap = NULL;
- int nr_start = *nr, ret = 0;
+ int ret = 0;
pte_t *ptep, *ptem;
ptem = ptep = pte_offset_map(&pmd, addr);
@@ -2916,19 +2886,11 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
if (!pte_access_permitted(pte, flags & FOLL_WRITE))
goto pte_unmap;
- if (pte_devmap(pte)) {
- if (unlikely(flags & FOLL_LONGTERM))
- goto pte_unmap;
-
- pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
- if (unlikely(!pgmap)) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- goto pte_unmap;
- }
- } else if (pte_special(pte))
+ if (pte_special(pte))
goto pte_unmap;
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ /* If it's not marked as special it must have a valid memmap. */
+ VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
folio = try_grab_folio_fast(page, 1, flags);
@@ -2996,96 +2958,6 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
}
#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
-#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages, int *nr)
-{
- int nr_start = *nr;
- struct dev_pagemap *pgmap = NULL;
-
- do {
- struct folio *folio;
- struct page *page = pfn_to_page(pfn);
-
- pgmap = get_dev_pagemap(pfn, pgmap);
- if (unlikely(!pgmap)) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- break;
- }
-
- if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- break;
- }
-
- folio = try_grab_folio_fast(page, 1, flags);
- if (!folio) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- break;
- }
- folio_set_referenced(folio);
- pages[*nr] = page;
- (*nr)++;
- pfn++;
- } while (addr += PAGE_SIZE, addr != end);
-
- put_dev_pagemap(pgmap);
- return addr == end;
-}
-
-static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- unsigned long fault_pfn;
- int nr_start = *nr;
-
- fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr))
- return 0;
-
- if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- return 0;
- }
- return 1;
-}
-
-static int gup_fast_devmap_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- unsigned long fault_pfn;
- int nr_start = *nr;
-
- fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr))
- return 0;
-
- if (unlikely(pud_val(orig) != pud_val(*pudp))) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- return 0;
- }
- return 1;
-}
-#else
-static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- BUILD_BUG();
- return 0;
-}
-
-static int gup_fast_devmap_pud_leaf(pud_t pud, pud_t *pudp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- BUILD_BUG();
- return 0;
-}
-#endif
-
static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, unsigned int flags, struct page **pages,
int *nr)
@@ -3100,13 +2972,6 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
if (pmd_special(orig))
return 0;
- if (pmd_devmap(orig)) {
- if (unlikely(flags & FOLL_LONGTERM))
- return 0;
- return gup_fast_devmap_pmd_leaf(orig, pmdp, addr, end, flags,
- pages, nr);
- }
-
page = pmd_page(orig);
refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr);
@@ -3147,13 +3012,6 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
if (pud_special(orig))
return 0;
- if (pud_devmap(orig)) {
- if (unlikely(flags & FOLL_LONGTERM))
- return 0;
- return gup_fast_devmap_pud_leaf(orig, pudp, addr, end, flags,
- pages, nr);
- }
-
page = pud_page(orig);
refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr);
@@ -3181,46 +3039,6 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
return 1;
}
-static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- int refs;
- struct page *page;
- struct folio *folio;
-
- if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
- return 0;
-
- BUILD_BUG_ON(pgd_devmap(orig));
-
- page = pgd_page(orig);
- refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr);
-
- folio = try_grab_folio_fast(page, refs, flags);
- if (!folio)
- return 0;
-
- if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- if (!gup_fast_folio_allowed(folio, flags)) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- *nr += refs;
- folio_set_referenced(folio);
- return 1;
-}
-
static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
unsigned long end, unsigned int flags, struct page **pages,
int *nr)
@@ -3315,12 +3133,9 @@ static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
return;
- if (unlikely(pgd_leaf(pgd))) {
- if (!gup_fast_pgd_leaf(pgd, pgdp, addr, next, flags,
- pages, nr))
- return;
- } else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
- pages, nr))
+ BUILD_BUG_ON(pgd_leaf(pgd));
+ if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
+ pages, nr))
return;
} while (pgdp++, addr = next, addr != end);
}
@@ -3354,8 +3169,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
return 0;
if (gup_flags & FOLL_PIN) {
- seq = raw_read_seqcount(&current->mm->write_protect_seq);
- if (seq & 1)
+ if (!raw_seqcount_try_begin(&current->mm->write_protect_seq, seq))
return 0;
}
@@ -3368,7 +3182,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
* include/asm-generic/tlb.h for more details.
*
* We do not adopt an rcu_read_lock() here as we also want to block IPIs
- * that come from THPs splitting.
+ * that come from callers of tlb_remove_table_sync_one().
*/
local_irq_save(flags);
gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
@@ -3415,8 +3229,6 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
return -EOVERFLOW;
if (end > TASK_SIZE_MAX)
return -EFAULT;
- if (unlikely(!access_ok((void __user *)start, len)))
- return -EFAULT;
nr_pinned = gup_fast(start, end, gup_flags, pages);
if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
@@ -3658,7 +3470,7 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
{
unsigned int flags, nr_folios, nr_found;
unsigned int i, pgshift = PAGE_SHIFT;
- pgoff_t start_idx, end_idx, next_idx;
+ pgoff_t start_idx, end_idx;
struct folio *folio = NULL;
struct folio_batch fbatch;
struct hstate *h;
@@ -3708,20 +3520,8 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
folio = NULL;
}
- next_idx = 0;
for (i = 0; i < nr_found; i++) {
- /*
- * As there can be multiple entries for a
- * given folio in the batch returned by
- * filemap_get_folios_contig(), the below
- * check is to ensure that we pin and return a
- * unique set of folios between start and end.
- */
- if (next_idx &&
- next_idx != folio_index(fbatch.folios[i]))
- continue;
-
- folio = page_folio(&fbatch.folios[i]->page);
+ folio = fbatch.folios[i];
if (try_grab_folio(folio, 1, FOLL_PIN)) {
folio_batch_release(&fbatch);
@@ -3733,7 +3533,6 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
*offset = offset_in_folio(folio, start);
folios[nr_folios] = folio;
- next_idx = folio_next_index(folio);
if (++nr_folios == max_folios)
break;
}
diff --git a/mm/hmm.c b/mm/hmm.c
index 7e0229ae4a5a..d545e2494994 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -10,6 +10,7 @@
*/
#include <linux/pagewalk.h>
#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -23,6 +24,7 @@
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
+#include <linux/pci-p2pdma.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
@@ -39,13 +41,21 @@ enum {
HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};
+enum {
+ /* These flags are carried from input-to-output */
+ HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
+ HMM_PFN_P2PDMA_BUS,
+};
+
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
struct hmm_range *range, unsigned long cpu_flags)
{
unsigned long i = (addr - range->start) >> PAGE_SHIFT;
- for (; addr < end; addr += PAGE_SIZE, i++)
- range->hmm_pfns[i] = cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= cpu_flags;
+ }
return 0;
}
@@ -173,6 +183,7 @@ static inline unsigned long hmm_pfn_flags_order(unsigned long order)
return order << HMM_PFN_ORDER_SHIFT;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
pmd_t pmd)
{
@@ -183,7 +194,6 @@ static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
unsigned long end, unsigned long hmm_pfns[],
pmd_t pmd)
@@ -202,8 +212,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
return hmm_vma_fault(addr, end, required_fault, walk);
pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -230,14 +242,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
unsigned long cpu_flags;
pte_t pte = ptep_get(ptep);
uint64_t pfn_req_flags = *hmm_pfn;
+ uint64_t new_pfn_flags = 0;
if (pte_none_mostly(pte)) {
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
goto fault;
- *hmm_pfn = 0;
- return 0;
+ goto out;
}
if (!pte_present(pte)) {
@@ -248,21 +260,19 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
* just report the PFN.
*/
if (is_device_private_entry(entry) &&
- pfn_swap_entry_to_page(entry)->pgmap->owner ==
+ page_pgmap(pfn_swap_entry_to_page(entry))->owner ==
range->dev_private_owner) {
cpu_flags = HMM_PFN_VALID;
if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
- *hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
- return 0;
+ new_pfn_flags = swp_offset_pfn(entry) | cpu_flags;
+ goto out;
}
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
- if (!required_fault) {
- *hmm_pfn = 0;
- return 0;
- }
+ if (!required_fault)
+ goto out;
if (!non_swap_entry(entry))
goto fault;
@@ -292,23 +302,22 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
goto fault;
/*
- * Bypass devmap pte such as DAX page when all pfn requested
- * flags(pfn_req_flags) are fulfilled.
* Since each architecture defines a struct page for the zero page, just
* fall through and treat it like a normal page.
*/
if (!vm_normal_page(walk->vma, addr, pte) &&
- !pte_devmap(pte) &&
!is_zero_pfn(pte_pfn(pte))) {
if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
pte_unmap(ptep);
return -EFAULT;
}
- *hmm_pfn = HMM_PFN_ERROR;
- return 0;
+ new_pfn_flags = HMM_PFN_ERROR;
+ goto out;
}
- *hmm_pfn = pte_pfn(pte) | cpu_flags;
+ new_pfn_flags = pte_pfn(pte) | cpu_flags;
+out:
+ *hmm_pfn = (*hmm_pfn & HMM_PFN_INOUT_FLAGS) | new_pfn_flags;
return 0;
fault:
@@ -351,7 +360,7 @@ again:
return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
}
- if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
+ if (pmd_trans_huge(pmd)) {
/*
* No need to take pmd_lock here, even if some other thread
* is splitting the huge pmd we will get that event through
@@ -362,7 +371,7 @@ again:
* values.
*/
pmd = pmdp_get_lockless(pmdp);
- if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
+ if (!pmd_trans_huge(pmd))
goto again;
return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
@@ -396,8 +405,7 @@ again:
return 0;
}
-#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
- defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+#if defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
pud_t pud)
{
@@ -429,7 +437,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
return hmm_vma_walk_hole(start, end, -1, walk);
}
- if (pud_leaf(pud) && pud_devmap(pud)) {
+ if (pud_leaf(pud)) {
unsigned long i, npages, pfn;
unsigned int required_fault;
unsigned long *hmm_pfns;
@@ -448,8 +456,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
}
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- for (i = 0; i < npages; ++i, ++pfn)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; i < npages; ++i, ++pfn) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
goto out_unlock;
}
@@ -507,8 +517,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
}
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
- for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
- range->hmm_pfns[i] = pfn | cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= pfn | cpu_flags;
+ }
spin_unlock(ptl);
return 0;
@@ -607,3 +619,211 @@ int hmm_range_fault(struct hmm_range *range)
return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_dma_map_alloc - Allocate HMM map structure
+ * @dev: device to allocate structure for
+ * @map: HMM map to allocate
+ * @nr_entries: number of entries in the map
+ * @dma_entry_size: size of the DMA entry in the map
+ *
+ * Allocate the HMM map structure and all the lists it contains.
+ * Return 0 on success, -ENOMEM on failure.
+ */
+int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
+ size_t nr_entries, size_t dma_entry_size)
+{
+ bool dma_need_sync = false;
+ bool use_iova;
+
+ WARN_ON_ONCE(!(nr_entries * PAGE_SIZE / dma_entry_size));
+
+ /*
+ * The HMM API violates our normal DMA buffer ownership rules and can't
+ * transfer buffer ownership. The dma_addressing_limited() check is a
+ * best approximation to ensure no swiotlb buffering happens.
+ */
+#ifdef CONFIG_DMA_NEED_SYNC
+ dma_need_sync = !dev->dma_skip_sync;
+#endif /* CONFIG_DMA_NEED_SYNC */
+ if (dma_need_sync || dma_addressing_limited(dev))
+ return -EOPNOTSUPP;
+
+ map->dma_entry_size = dma_entry_size;
+ map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->pfn_list)
+ return -ENOMEM;
+
+ use_iova = dma_iova_try_alloc(dev, &map->state, 0,
+ nr_entries * PAGE_SIZE);
+ if (!use_iova && dma_need_unmap(dev)) {
+ map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->dma_list)
+ goto err_dma;
+ }
+ return 0;
+
+err_dma:
+ kvfree(map->pfn_list);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_alloc);
+
+/**
+ * hmm_dma_map_free - Free HMM map structure
+ * @dev: device to free structure from
+ * @map: HMM map containing the various lists and state
+ *
+ * Free the HMM map structure and all the lists it contains.
+ */
+void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map)
+{
+ if (dma_use_iova(&map->state))
+ dma_iova_free(dev, &map->state);
+ kvfree(map->pfn_list);
+ kvfree(map->dma_list);
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_free);
+
+/**
+ * hmm_dma_map_pfn - Map a physical HMM page to DMA address
+ * @dev: Device to map the page for
+ * @map: HMM map
+ * @idx: Index into the PFN and dma address arrays
+ * @p2pdma_state: PCI P2P state.
+ *
+ * Map the page backing the HMM PFN at @idx for DMA. If an IOVA range was
+ * allocated by hmm_dma_map_alloc(), the page is linked into it at the
+ * offset idx * dma_entry_size; otherwise it is mapped with dma_map_page().
+ * A PFN that already carries HMM_PFN_DMA_MAPPED (and is not a bus-address
+ * P2P mapping) only has its existing DMA address returned so that flags
+ * can be resynced without unlinking and relinking the page.
+ *
+ * Return: the DMA address on success, or DMA_MAPPING_ERROR on failure.
+ */
+dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
+ size_t idx,
+ struct pci_p2pdma_map_state *p2pdma_state)
+{
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ struct page *page = hmm_pfn_to_page(pfns[idx]);
+ phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]);
+ size_t offset = idx * map->dma_entry_size;
+ unsigned long attrs = 0;
+ dma_addr_t dma_addr;
+ int ret;
+
+ if ((pfns[idx] & HMM_PFN_DMA_MAPPED) &&
+ !(pfns[idx] & HMM_PFN_P2PDMA_BUS)) {
+ /*
+ * We are in this flow when there is a need to resync flags,
+ * for example when the page was already linked in a prefetch call
+ * with the READ flag and now we need to add the WRITE flag.
+ *
+ * This page was already programmed to HW and we don't want/need
+ * to unlink and link it again just to resync flags.
+ */
+ if (dma_use_iova(state))
+ return state->addr + offset;
+
+ /*
+ * Without dma_need_unmap, the dma_addrs array is NULL, thus we
+ * need to regenerate the address below even if there already
+ * was a mapping. But !dma_need_unmap implies that the
+ * mapping is stateless, so this is fine.
+ */
+ if (dma_need_unmap(dev))
+ return dma_addrs[idx];
+
+ /* Continue to remapping */
+ }
+
+ switch (pci_p2pdma_state(p2pdma_state, dev, page)) {
+ case PCI_P2PDMA_MAP_NONE:
+ break;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ pfns[idx] |= HMM_PFN_P2PDMA;
+ break;
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED;
+ return pci_p2pdma_bus_addr_map(p2pdma_state, paddr);
+ default:
+ return DMA_MAPPING_ERROR;
+ }
+
+ if (dma_use_iova(state)) {
+ ret = dma_iova_link(dev, state, paddr, offset,
+ map->dma_entry_size, DMA_BIDIRECTIONAL,
+ attrs);
+ if (ret)
+ goto error;
+
+ ret = dma_iova_sync(dev, state, offset, map->dma_entry_size);
+ if (ret) {
+ dma_iova_unlink(dev, state, offset, map->dma_entry_size,
+ DMA_BIDIRECTIONAL, attrs);
+ goto error;
+ }
+
+ dma_addr = state->addr + offset;
+ } else {
+ if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs))
+ goto error;
+
+ dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size,
+ DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(dev, dma_addr))
+ goto error;
+
+ if (dma_need_unmap(dev))
+ dma_addrs[idx] = dma_addr;
+ }
+ pfns[idx] |= HMM_PFN_DMA_MAPPED;
+ return dma_addr;
+error:
+ pfns[idx] &= ~HMM_PFN_P2PDMA;
+ return DMA_MAPPING_ERROR;
+
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_pfn);
+
+/**
+ * hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address
+ * @dev: Device to unmap the page from
+ * @map: HMM map
+ * @idx: Index of the PFN to unmap
+ *
+ * Returns true if the PFN was mapped and has been unmapped, false otherwise.
+ */
+bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx)
+{
+ const unsigned long valid_dma = HMM_PFN_VALID | HMM_PFN_DMA_MAPPED;
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ unsigned long attrs = 0;
+
+ if ((pfns[idx] & valid_dma) != valid_dma)
+ return false;
+
+ if (pfns[idx] & HMM_PFN_P2PDMA_BUS)
+ ; /* no need to unmap bus address P2P mappings */
+ else if (dma_use_iova(state)) {
+ if (pfns[idx] & HMM_PFN_P2PDMA)
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ dma_iova_unlink(dev, state, idx * map->dma_entry_size,
+ map->dma_entry_size, DMA_BIDIRECTIONAL, attrs);
+ } else if (dma_need_unmap(dev))
+ dma_unmap_page(dev, dma_addrs[idx], map->dma_entry_size,
+ DMA_BIDIRECTIONAL);
+
+ pfns[idx] &=
+ ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS);
+ return true;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn);
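The hmm_dma_* helpers above are meant to be driven from the same PFN array that hmm_range_fault() fills. What follows is a hedged sketch (not from this patch) of the expected call order: it assumes the caller points range->hmm_pfns at map->pfn_list, uses one PAGE_SIZE DMA entry per PFN, and it omits the mmu_interval_notifier locking and -EBUSY retry that a real user needs, so it shows the shape of the flow rather than a complete user. demo_map_range() and the -EIO error code are made-up choices.

#include <linux/hmm.h>
#include <linux/hmm-dma.h>
#include <linux/pci-p2pdma.h>
#include <linux/dma-mapping.h>

static int demo_map_range(struct device *dev, struct hmm_range *range,
			  struct hmm_dma_map *map, size_t nr_entries)
{
	struct pci_p2pdma_map_state p2pdma_state = {};
	size_t i;
	int ret;

	/* nr_entries is assumed to be (range->end - range->start) / PAGE_SIZE. */
	ret = hmm_dma_map_alloc(dev, map, nr_entries, PAGE_SIZE);
	if (ret)
		return ret;

	/* Let hmm_range_fault() fill the map's PFN list directly. */
	range->hmm_pfns = map->pfn_list;
	ret = hmm_range_fault(range);
	if (ret)
		goto err_free;

	for (i = 0; i < nr_entries; i++) {
		dma_addr_t dma = hmm_dma_map_pfn(dev, map, i, &p2pdma_state);

		if (dma == DMA_MAPPING_ERROR) {
			ret = -EIO;
			goto err_unmap;
		}
		/* hand "dma" to the device's page table / SGL here */
	}
	return 0;

err_unmap:
	while (i--)
		hmm_dma_unmap_pfn(dev, map, i);
err_free:
	hmm_dma_map_free(dev, map);
	return ret;
}

Teardown is the mirror image: hmm_dma_unmap_pfn() for each mapped index, then hmm_dma_map_free() once the device no longer references the range.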
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index db64116a4f84..5d63ee72c1f6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -22,7 +22,6 @@
#include <linux/mm_types.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
-#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
@@ -99,7 +98,7 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
}
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
- unsigned long vm_flags,
+ vm_flags_t vm_flags,
unsigned long tva_flags,
unsigned long orders)
{
@@ -166,7 +165,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
* own flags.
*/
if (!in_pf && shmem_file(vma->vm_file))
- return shmem_allowable_huge_orders(file_inode(vma->vm_file),
+ return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file),
vma, vma->vm_pgoff, 0,
!enforce_sysfs);
@@ -617,6 +616,8 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
+DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
+DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
#ifdef CONFIG_SHMEM
@@ -637,6 +638,8 @@ static struct attribute *anon_stats_attrs[] = {
#ifndef CONFIG_SHMEM
&zswpout_attr.attr,
&swpin_attr.attr,
+ &swpin_fallback_attr.attr,
+ &swpin_fallback_charge_attr.attr,
&swpout_attr.attr,
&swpout_fallback_attr.attr,
#endif
@@ -669,6 +672,8 @@ static struct attribute *any_stats_attrs[] = {
#ifdef CONFIG_SHMEM
&zswpout_attr.attr,
&swpin_attr.attr,
+ &swpin_fallback_attr.attr,
+ &swpin_fallback_charge_attr.attr,
&swpout_attr.attr,
&swpout_fallback_attr.attr,
#endif
@@ -1197,7 +1202,7 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
{
pmd_t entry;
- entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+ entry = folio_mk_pmd(folio, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
folio_add_lru_vma(folio, vma);
@@ -1303,10 +1308,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
struct folio *zero_folio)
{
pmd_t entry;
- if (!pmd_none(*pmd))
- return;
- entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
- entry = pmd_mkhuge(entry);
+ entry = folio_mk_pmd(zero_folio, vma->vm_page_prot);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
@@ -1369,20 +1371,31 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
return __do_huge_pmd_anonymous_page(vmf);
}
-static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
- pgtable_t pgtable)
+struct folio_or_pfn {
+ union {
+ struct folio *folio;
+ unsigned long pfn;
+ };
+ bool is_folio;
+};
+
+static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot,
+ bool write, pgtable_t pgtable)
{
struct mm_struct *mm = vma->vm_mm;
pmd_t entry;
- spinlock_t *ptl;
- ptl = pmd_lock(mm, pmd);
+ lockdep_assert_held(pmd_lockptr(mm, pmd));
+
if (!pmd_none(*pmd)) {
+ const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
+ fop.pfn;
+
if (write) {
- if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
+ if (pmd_pfn(*pmd) != pfn) {
WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
- goto out_unlock;
+ return -EEXIST;
}
entry = pmd_mkyoung(*pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1390,14 +1403,19 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
update_mmu_cache_pmd(vma, addr, pmd);
}
- goto out_unlock;
+ return -EEXIST;
}
- entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
- if (pfn_t_devmap(pfn))
- entry = pmd_mkdevmap(entry);
- else
+ if (fop.is_folio) {
+ entry = folio_mk_pmd(fop.folio, vma->vm_page_prot);
+
+ folio_get(fop.folio);
+ folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
+ add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR);
+ } else {
+ entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot));
entry = pmd_mkspecial(entry);
+ }
if (write) {
entry = pmd_mkyoung(pmd_mkdirty(entry));
entry = maybe_pmd_mkwrite(entry, vma);
@@ -1406,16 +1424,11 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
if (pgtable) {
pgtable_trans_huge_deposit(mm, pmd, pgtable);
mm_inc_nr_ptes(mm);
- pgtable = NULL;
}
set_pmd_at(mm, addr, pmd, entry);
update_mmu_cache_pmd(vma, addr, pmd);
-
-out_unlock:
- spin_unlock(ptl);
- if (pgtable)
- pte_free(mm, pgtable);
+ return 0;
}
/**
@@ -1428,20 +1441,25 @@ out_unlock:
*
* Return: vm_fault_t value.
*/
-vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
+ bool write)
{
unsigned long addr = vmf->address & PMD_MASK;
struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
+ struct folio_or_pfn fop = {
+ .pfn = pfn,
+ };
pgtable_t pgtable = NULL;
+ spinlock_t *ptl;
+ int error;
/*
* If we had pmd_special, we could avoid all these restrictions,
* but we need to be consistent with PTEs and architectures that
* can't support a 'special' bit.
*/
- BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
- !pfn_t_devmap(pfn));
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
@@ -1455,13 +1473,56 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
return VM_FAULT_OOM;
}
- track_pfn_insert(vma, &pgprot, pfn);
+ pfnmap_setup_cachemode_pfn(pfn, &pgprot);
+
+ ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ error = insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write,
+ pgtable);
+ spin_unlock(ptl);
+ if (error && pgtable)
+ pte_free(vma->vm_mm, pgtable);
- insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
+vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
+ bool write)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long addr = vmf->address & PMD_MASK;
+ struct mm_struct *mm = vma->vm_mm;
+ struct folio_or_pfn fop = {
+ .folio = folio,
+ .is_folio = true,
+ };
+ spinlock_t *ptl;
+ pgtable_t pgtable = NULL;
+ int error;
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
+ return VM_FAULT_SIGBUS;
+
+ if (arch_needs_pgtable_deposit()) {
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (!pgtable)
+ return VM_FAULT_OOM;
+ }
+
+ ptl = pmd_lock(mm, vmf->pmd);
+ error = insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot,
+ write, pgtable);
+ spin_unlock(ptl);
+ if (error && pgtable)
+ pte_free(mm, pgtable);
+
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);
+
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
@@ -1470,41 +1531,43 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
return pud;
}
-static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, pfn_t pfn, bool write)
+static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
+ pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write)
{
struct mm_struct *mm = vma->vm_mm;
- pgprot_t prot = vma->vm_page_prot;
pud_t entry;
- spinlock_t *ptl;
- ptl = pud_lock(mm, pud);
if (!pud_none(*pud)) {
+ const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
+ fop.pfn;
+
if (write) {
- if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn)))
- goto out_unlock;
+ if (WARN_ON_ONCE(pud_pfn(*pud) != pfn))
+ return;
entry = pud_mkyoung(*pud);
entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
if (pudp_set_access_flags(vma, addr, pud, entry, 1))
update_mmu_cache_pud(vma, addr, pud);
}
- goto out_unlock;
+ return;
}
- entry = pud_mkhuge(pfn_t_pud(pfn, prot));
- if (pfn_t_devmap(pfn))
- entry = pud_mkdevmap(entry);
- else
+ if (fop.is_folio) {
+ entry = folio_mk_pud(fop.folio, vma->vm_page_prot);
+
+ folio_get(fop.folio);
+ folio_add_file_rmap_pud(fop.folio, &fop.folio->page, vma);
+ add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PUD_NR);
+ } else {
+ entry = pud_mkhuge(pfn_pud(fop.pfn, prot));
entry = pud_mkspecial(entry);
+ }
if (write) {
entry = pud_mkyoung(pud_mkdirty(entry));
entry = maybe_pud_mkwrite(entry, vma);
}
set_pud_at(mm, addr, pud, entry);
update_mmu_cache_pud(vma, addr, pud);
-
-out_unlock:
- spin_unlock(ptl);
}
/**
@@ -1517,19 +1580,23 @@ out_unlock:
*
* Return: vm_fault_t value.
*/
-vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
+ bool write)
{
unsigned long addr = vmf->address & PUD_MASK;
struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
+ struct folio_or_pfn fop = {
+ .pfn = pfn,
+ };
+ spinlock_t *ptl;
/*
* If we had pud_special, we could avoid all these restrictions,
* but we need to be consistent with PTEs and architectures that
* can't support a 'special' bit.
*/
- BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
- !pfn_t_devmap(pfn));
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
@@ -1537,12 +1604,50 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
- track_pfn_insert(vma, &pgprot, pfn);
+ pfnmap_setup_cachemode_pfn(pfn, &pgprot);
+
+ ptl = pud_lock(vma->vm_mm, vmf->pud);
+ insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
+ spin_unlock(ptl);
- insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
+
+/**
+ * vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry
+ * @vmf: Structure describing the fault
+ * @folio: folio to insert
+ * @write: whether it's a write fault
+ *
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
+ bool write)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long addr = vmf->address & PUD_MASK;
+ pud_t *pud = vmf->pud;
+ struct mm_struct *mm = vma->vm_mm;
+ struct folio_or_pfn fop = {
+ .folio = folio,
+ .is_folio = true,
+ };
+ spinlock_t *ptl;
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
+ return VM_FAULT_SIGBUS;
+
+ ptl = pud_lock(mm, pud);
+ insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
+ spin_unlock(ptl);
+
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
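vmf_insert_folio_pmd() and vmf_insert_folio_pud() above take an already-resolved folio of exactly PMD_ORDER or PUD_ORDER and install it under the page-table lock, doing the file rmap and mm counter accounting themselves. A hedged sketch of how a huge_fault handler might dispatch on the requested order; demo_lookup_folio() and the surrounding reference handling are assumptions for illustration, not anything defined in this patch.

#include <linux/mm.h>
#include <linux/huge_mm.h>

/* Made-up placeholder: however the caller finds a folio of the right order. */
static struct folio *demo_lookup_folio(struct vm_fault *vmf, unsigned int order);

static vm_fault_t demo_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	const bool write = vmf->flags & FAULT_FLAG_WRITE;
	struct folio *folio = demo_lookup_folio(vmf, order);

	if (!folio)
		return VM_FAULT_FALLBACK;

	/* The helpers insist on an exact PMD_ORDER/PUD_ORDER folio. */
	if (order == PMD_ORDER)
		return vmf_insert_folio_pmd(vmf, folio, write);
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
	if (order == PUD_ORDER)
		return vmf_insert_folio_pud(vmf, folio, write);
#endif
	return VM_FAULT_FALLBACK;
	/* Reference/lock lifetime of "folio" is deliberately left out here. */
}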
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1558,46 +1663,6 @@ void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
update_mmu_cache_pmd(vma, addr, pmd);
}
-struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
-{
- unsigned long pfn = pmd_pfn(*pmd);
- struct mm_struct *mm = vma->vm_mm;
- struct page *page;
- int ret;
-
- assert_spin_locked(pmd_lockptr(mm, pmd));
-
- if (flags & FOLL_WRITE && !pmd_write(*pmd))
- return NULL;
-
- if (pmd_present(*pmd) && pmd_devmap(*pmd))
- /* pass */;
- else
- return NULL;
-
- if (flags & FOLL_TOUCH)
- touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
-
- /*
- * device mapped pages can only be returned if the
- * caller will manage the page reference count.
- */
- if (!(flags & (FOLL_GET | FOLL_PIN)))
- return ERR_PTR(-EEXIST);
-
- pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
- *pgmap = get_dev_pagemap(pfn, *pgmap);
- if (!*pgmap)
- return ERR_PTR(-EFAULT);
- page = pfn_to_page(pfn);
- ret = try_grab_folio(page_folio(page), 1, flags);
- if (ret)
- page = ERR_PTR(ret);
-
- return page;
-}
-
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
@@ -1692,13 +1757,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
src_folio = page_folio(src_page);
folio_get(src_folio);
- if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) {
+ if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) {
/* Page maybe pinned: split and retry the fault on PTEs. */
folio_put(src_folio);
pte_free(dst_mm, pgtable);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
- __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
+ __split_huge_pmd(src_vma, src_pmd, addr, false);
return -EAGAIN;
}
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -1749,7 +1814,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pud = *src_pud;
- if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
+ if (unlikely(!pud_trans_huge(pud)))
goto out_unlock;
/*
@@ -1920,7 +1985,7 @@ unlock_fallback:
folio_unlock(folio);
spin_unlock(vmf->ptl);
fallback:
- __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false);
return VM_FAULT_FALLBACK;
}
@@ -2003,7 +2068,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
spin_unlock(vmf->ptl);
writable = false;
- if (!migrate_misplaced_folio(folio, vma, target_nid)) {
+ if (!migrate_misplaced_folio(folio, target_nid)) {
flags |= TNF_MIGRATED;
nid = target_nid;
task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
@@ -2065,7 +2130,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* If other processes are mapping this folio, we couldn't discard
* the folio unless they all do MADV_FREE so let's skip the folio.
*/
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
goto out;
if (!folio_trylock(folio))
@@ -2135,12 +2200,13 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb->fullmm);
arch_check_zapped_pmd(vma, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- if (vma_is_special_huge(vma)) {
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
} else if (is_huge_zero_pmd(orig_pmd)) {
- zap_deposited_table(tlb->mm, pmd);
+ if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
+ zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
} else {
struct folio *folio = NULL;
@@ -2171,6 +2237,14 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, mm_counter_file(folio),
-HPAGE_PMD_NR);
+
+ /*
+ * Use flush_needed to indicate whether the PMD entry
+ * is present, instead of checking pmd_present() again.
+ */
+ if (flush_needed && pmd_young(orig_pmd) &&
+ likely(vma_has_recency(vma)))
+ folio_mark_accessed(folio);
}
spin_unlock(ptl);
@@ -2564,12 +2638,12 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
folio_move_anon_rmap(src_folio, dst_vma);
src_folio->index = linear_page_index(dst_vma, dst_addr);
- _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
+ _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
/* Follow mremap() behavior and treat the entry dirty after the move */
_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
} else {
src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
- _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
+ _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
}
set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
@@ -2602,8 +2676,7 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
spinlock_t *ptl;
ptl = pmd_lock(vma->vm_mm, pmd);
- if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
- pmd_devmap(*pmd)))
+ if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)))
return ptl;
spin_unlock(ptl);
return NULL;
@@ -2620,7 +2693,7 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
spinlock_t *ptl;
ptl = pud_lock(vma->vm_mm, pud);
- if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
+ if (likely(pud_trans_huge(*pud)))
return ptl;
spin_unlock(ptl);
return NULL;
@@ -2640,12 +2713,24 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
arch_check_zapped_pud(vma, orig_pud);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
- if (vma_is_special_huge(vma)) {
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
spin_unlock(ptl);
/* No zero page support yet */
} else {
- /* No support for anonymous PUD pages yet */
- BUG();
+ struct page *page = NULL;
+ struct folio *folio;
+
+ /* No support for anonymous PUD pages or migration yet */
+ VM_WARN_ON_ONCE(vma_is_anonymous(vma) ||
+ !pud_present(orig_pud));
+
+ page = pud_page(orig_pud);
+ folio = page_folio(page);
+ folio_remove_rmap_pud(folio, page, vma);
+ add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);
+
+ spin_unlock(ptl);
+ tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
}
return 1;
}
@@ -2653,14 +2738,33 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
unsigned long haddr)
{
+ struct folio *folio;
+ struct page *page;
+ pud_t old_pud;
+
VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
- VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
+ VM_BUG_ON(!pud_trans_huge(*pud));
count_vm_event(THP_SPLIT_PUD);
- pudp_huge_clear_flush(vma, haddr, pud);
+ old_pud = pudp_huge_clear_flush(vma, haddr, pud);
+
+ if (!vma_is_dax(vma))
+ return;
+
+ page = pud_page(old_pud);
+ folio = page_folio(page);
+
+ if (!folio_test_dirty(folio) && pud_dirty(old_pud))
+ folio_mark_dirty(folio);
+ if (!folio_test_referenced(folio) && pud_young(old_pud))
+ folio_set_referenced(folio);
+ folio_remove_rmap_pud(folio, page, vma);
+ folio_put(folio);
+ add_mm_counter(vma->vm_mm, mm_counter_file(folio),
+ -HPAGE_PUD_NR);
}
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
@@ -2674,7 +2778,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pud_lock(vma->vm_mm, pud);
- if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
+ if (unlikely(!pud_trans_huge(*pud)))
goto out;
__split_huge_pud_locked(vma, pud, range.start);
@@ -2747,8 +2851,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
- VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
- && !pmd_devmap(*pmd));
+ VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
count_vm_event(THP_SPLIT_PMD);
@@ -2760,13 +2863,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (arch_needs_pgtable_deposit())
zap_deposited_table(mm, pmd);
- if (vma_is_special_huge(vma))
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma))
return;
if (unlikely(is_pmd_migration_entry(old_pmd))) {
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
folio = pfn_swap_entry_folio(entry);
+ } else if (is_huge_zero_pmd(old_pmd)) {
+ return;
} else {
page = pmd_page(old_pmd);
folio = page_folio(page);
@@ -2951,28 +3056,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, bool freeze, struct folio *folio)
+ pmd_t *pmd, bool freeze)
{
- VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
- VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
- VM_BUG_ON(freeze && !folio);
-
- /*
- * When the caller requests to set up a migration entry, we
- * require a folio to check the PMD against. Otherwise, there
- * is a risk of replacing the wrong folio.
- */
- if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
- is_pmd_migration_entry(*pmd)) {
- if (folio && folio != pmd_folio(*pmd))
- return;
+ if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
__split_huge_pmd_locked(vma, pmd, address, freeze);
- }
}
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long address, bool freeze, struct folio *folio)
+ unsigned long address, bool freeze)
{
spinlock_t *ptl;
struct mmu_notifier_range range;
@@ -2982,20 +3074,20 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pmd_lock(vma->vm_mm, pmd);
- split_huge_pmd_locked(vma, range.start, pmd, freeze, folio);
+ split_huge_pmd_locked(vma, range.start, pmd, freeze);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
}
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
- bool freeze, struct folio *folio)
+ bool freeze)
{
pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
if (!pmd)
return;
- __split_huge_pmd(vma, pmd, address, freeze, folio);
+ __split_huge_pmd(vma, pmd, address, freeze);
}
static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
@@ -3007,13 +3099,13 @@ static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned
if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
ALIGN(address, HPAGE_PMD_SIZE)))
- split_huge_pmd_address(vma, address, false, NULL);
+ split_huge_pmd_address(vma, address, false);
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- long adjust_next)
+ unsigned long start,
+ unsigned long end,
+ struct vm_area_struct *next)
{
/* Check if we need to split start first. */
split_huge_pmd_if_needed(vma, start);
@@ -3021,16 +3113,9 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
/* Check if we need to split end next. */
split_huge_pmd_if_needed(vma, end);
- /*
- * If we're also updating the next vma vm_start,
- * check if we need to split it.
- */
- if (adjust_next > 0) {
- struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
- unsigned long nstart = next->vm_start;
- nstart += adjust_next;
- split_huge_pmd_if_needed(next, nstart);
- }
+ /* If we're incrementing next->vm_start, we might need to split it. */
+ if (next)
+ split_huge_pmd_if_needed(next, end);
}
static void unmap_folio(struct folio *folio)
@@ -3064,8 +3149,12 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
int ref_count, map_count;
pmd_t orig_pmd = *pmdp;
- if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
+ if (pmd_dirty(orig_pmd))
+ folio_set_dirty(folio);
+ if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
+ folio_set_swapbacked(folio);
return false;
+ }
orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
@@ -3092,8 +3181,15 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
*
* The only folio refs must be one from isolation plus the rmap(s).
*/
- if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
- ref_count != map_count + 1) {
+ if (pmd_dirty(orig_pmd))
+ folio_set_dirty(folio);
+ if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
+ folio_set_swapbacked(folio);
+ set_pmd_at(mm, addr, pmdp, orig_pmd);
+ return false;
+ }
+
+ if (ref_count != map_count + 1) {
set_pmd_at(mm, addr, pmdp, orig_pmd);
return false;
}
@@ -3113,12 +3209,11 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
{
VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+ VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
- if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
- return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
-
- return false;
+ return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
}
static void remap_page(struct folio *folio, unsigned long nr, int flags)
@@ -3137,350 +3232,418 @@ static void remap_page(struct folio *folio, unsigned long nr, int flags)
}
}
-static void lru_add_page_tail(struct folio *folio, struct page *tail,
+static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
struct lruvec *lruvec, struct list_head *list)
{
- VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
- VM_BUG_ON_FOLIO(PageLRU(tail), folio);
+ VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
lockdep_assert_held(&lruvec->lru_lock);
if (list) {
/* page reclaim is reclaiming a huge page */
VM_WARN_ON(folio_test_lru(folio));
- get_page(tail);
- list_add_tail(&tail->lru, list);
+ folio_get(new_folio);
+ list_add_tail(&new_folio->lru, list);
} else {
/* head is still on lru (and we have it frozen) */
VM_WARN_ON(!folio_test_lru(folio));
if (folio_test_unevictable(folio))
- tail->mlock_count = 0;
+ new_folio->mlock_count = 0;
else
- list_add_tail(&tail->lru, &folio->lru);
- SetPageLRU(tail);
+ list_add_tail(&new_folio->lru, &folio->lru);
+ folio_set_lru(new_folio);
}
}
-static void __split_huge_page_tail(struct folio *folio, int tail,
- struct lruvec *lruvec, struct list_head *list,
- unsigned int new_order)
+/* Racy check whether the huge page can be split */
+bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
{
- struct page *head = &folio->page;
- struct page *page_tail = head + tail;
- /*
- * Careful: new_folio is not a "real" folio before we cleared PageTail.
- * Don't pass it around before clear_compound_head().
- */
- struct folio *new_folio = (struct folio *)page_tail;
+ int extra_pins;
+
+ /* Additional pins from page cache */
+ if (folio_test_anon(folio))
+ extra_pins = folio_test_swapcache(folio) ?
+ folio_nr_pages(folio) : 0;
+ else
+ extra_pins = folio_nr_pages(folio);
+ if (pextra_pins)
+ *pextra_pins = extra_pins;
+ return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
+ caller_pins;
+}
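/*
 * Illustrative userspace sketch (not part of this patch) of the pin
 * accounting used by can_split_folio() above: a split is only allowed
 * when every folio reference is explained by the mappings, the
 * page/swap cache entries (extra_pins) and the caller's own pins.
 * The counts below are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

static bool can_split(long mapcount, long refcount, long extra_pins,
		      long caller_pins)
{
	/* Any unexplained reference (e.g. a GUP pin) blocks the split. */
	return mapcount == refcount - extra_pins - caller_pins;
}

int main(void)
{
	/* e.g. a PMD-sized anon folio in swapcache: 512 swap-cache refs */
	printf("%d\n", can_split(1, 1 + 512 + 1, 512, 1));	/* 1: splittable */
	printf("%d\n", can_split(1, 1 + 512 + 2, 512, 1));	/* 0: extra pin */
	return 0;
}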
- VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
+static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
+{
+ for (; nr_pages; page++, nr_pages--)
+ if (PageHWPoison(page))
+ return true;
+ return false;
+}
+/*
+ * It splits @folio into @new_order folios and copies the @folio metadata to
+ * all the resulting folios.
+ */
+static void __split_folio_to_order(struct folio *folio, int old_order,
+ int new_order)
+{
+	/* Scan poisoned pages when splitting a poisoned folio into large folios */
+ const bool handle_hwpoison = folio_test_has_hwpoisoned(folio) && new_order;
+ long new_nr_pages = 1 << new_order;
+ long nr_pages = 1 << old_order;
+ long i;
+
+ folio_clear_has_hwpoisoned(folio);
+
+ /* Check first new_nr_pages since the loop below skips them */
+ if (handle_hwpoison &&
+ page_range_has_hwpoisoned(folio_page(folio, 0), new_nr_pages))
+ folio_set_has_hwpoisoned(folio);
/*
- * Clone page flags before unfreezing refcount.
- *
- * After successful get_page_unless_zero() might follow flags change,
- * for example lock_page() which set PG_waiters.
- *
- * Note that for mapped sub-pages of an anonymous THP,
- * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
- * the migration entry instead from where remap_page() will restore it.
- * We can still have PG_anon_exclusive set on effectively unmapped and
- * unreferenced sub-pages of an anonymous THP: we can simply drop
- * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
+	 * Skip the first new_nr_pages, since the new folio built from them
+	 * already has all the flags from the original folio.
*/
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- page_tail->flags |= (head->flags &
- ((1L << PG_referenced) |
- (1L << PG_swapbacked) |
- (1L << PG_swapcache) |
- (1L << PG_mlocked) |
- (1L << PG_uptodate) |
- (1L << PG_active) |
- (1L << PG_workingset) |
- (1L << PG_locked) |
- (1L << PG_unevictable) |
+ for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) {
+ struct page *new_head = &folio->page + i;
+ /*
+ * Careful: new_folio is not a "real" folio before we cleared PageTail.
+ * Don't pass it around before clear_compound_head().
+ */
+ struct folio *new_folio = (struct folio *)new_head;
+
+ VM_BUG_ON_PAGE(atomic_read(&new_folio->_mapcount) != -1, new_head);
+
+ /*
+ * Clone page flags before unfreezing refcount.
+ *
+		 * A successful get_page_unless_zero() might be followed by a
+		 * flags change, for example lock_page() setting PG_waiters.
+ *
+ * Note that for mapped sub-pages of an anonymous THP,
+ * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
+ * the migration entry instead from where remap_page() will restore it.
+ * We can still have PG_anon_exclusive set on effectively unmapped and
+ * unreferenced sub-pages of an anonymous THP: we can simply drop
+ * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
+ */
+ new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ new_folio->flags |= (folio->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_swapcache) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate) |
+ (1L << PG_active) |
+ (1L << PG_workingset) |
+ (1L << PG_locked) |
+ (1L << PG_unevictable) |
#ifdef CONFIG_ARCH_USES_PG_ARCH_2
- (1L << PG_arch_2) |
+ (1L << PG_arch_2) |
#endif
#ifdef CONFIG_ARCH_USES_PG_ARCH_3
- (1L << PG_arch_3) |
+ (1L << PG_arch_3) |
#endif
- (1L << PG_dirty) |
- LRU_GEN_MASK | LRU_REFS_MASK));
+ (1L << PG_dirty) |
+ LRU_GEN_MASK | LRU_REFS_MASK));
- /* ->mapping in first and second tail page is replaced by other uses */
- VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
- page_tail);
- new_folio->mapping = folio->mapping;
- new_folio->index = folio->index + tail;
+ if (handle_hwpoison &&
+ page_range_has_hwpoisoned(new_head, new_nr_pages))
+ folio_set_has_hwpoisoned(new_folio);
- /*
- * page->private should not be set in tail pages. Fix up and warn once
- * if private is unexpectedly set.
- */
- if (unlikely(page_tail->private)) {
- VM_WARN_ON_ONCE_PAGE(true, page_tail);
- page_tail->private = 0;
- }
- if (folio_test_swapcache(folio))
- new_folio->swap.val = folio->swap.val + tail;
+ new_folio->mapping = folio->mapping;
+ new_folio->index = folio->index + i;
- /* Page flags must be visible before we make the page non-compound. */
- smp_wmb();
+ /*
+ * page->private should not be set in tail pages. Fix up and warn once
+ * if private is unexpectedly set.
+ */
+ if (unlikely(new_folio->private)) {
+ VM_WARN_ON_ONCE_PAGE(true, new_head);
+ new_folio->private = NULL;
+ }
- /*
- * Clear PageTail before unfreezing page refcount.
- *
- * After successful get_page_unless_zero() might follow put_page()
- * which needs correct compound_head().
- */
- clear_compound_head(page_tail);
- if (new_order) {
- prep_compound_page(page_tail, new_order);
- folio_set_large_rmappable(new_folio);
- }
+ if (folio_test_swapcache(folio))
+ new_folio->swap.val = folio->swap.val + i;
- /* Finally unfreeze refcount. Additional reference from page cache. */
- page_ref_unfreeze(page_tail,
- 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ?
- folio_nr_pages(new_folio) : 0));
+ /* Page flags must be visible before we make the page non-compound. */
+ smp_wmb();
- if (folio_test_young(folio))
- folio_set_young(new_folio);
- if (folio_test_idle(folio))
- folio_set_idle(new_folio);
+ /*
+ * Clear PageTail before unfreezing page refcount.
+ *
+ * After successful get_page_unless_zero() might follow put_page()
+ * which needs correct compound_head().
+ */
+ clear_compound_head(new_head);
+ if (new_order) {
+ prep_compound_page(new_head, new_order);
+ folio_set_large_rmappable(new_folio);
+ }
+
+ if (folio_test_young(folio))
+ folio_set_young(new_folio);
+ if (folio_test_idle(folio))
+ folio_set_idle(new_folio);
+#ifdef CONFIG_MEMCG
+ new_folio->memcg_data = folio->memcg_data;
+#endif
- folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
+ folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
+ }
- /*
- * always add to the tail because some iterators expect new
- * pages to show after the currently processed elements - e.g.
- * migrate_pages
- */
- lru_add_page_tail(folio, page_tail, lruvec, list);
+ if (new_order)
+ folio_set_order(folio, new_order);
+ else
+ ClearPageCompound(&folio->page);
}
-static void __split_huge_page(struct page *page, struct list_head *list,
- pgoff_t end, unsigned int new_order)
+/*
+ * It splits an unmapped @folio into smaller, lower-order folios in two ways.
+ * @folio: the to-be-split folio
+ * @new_order: the smallest order of the after split folios (since buddy
+ * allocator like split generates folios with orders from @folio's
+ * order - 1 to new_order).
+ * @split_at: in buddy allocator like split, the folio containing @split_at
+ * will be split until its order becomes @new_order.
+ * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller
+ * @mapping: @folio->mapping
+ * @uniform_split: if the split is uniform or not (buddy allocator like split)
+ *
+ * 1. uniform split: the given @folio is split into multiple @new_order small
+ * folios, where all small folios have the same order. This is done when
+ * uniform_split is true.
+ * 2. buddy allocator like (non-uniform) split: the given @folio is split in
+ * half, and the half containing @split_at is split in half again until the
+ * folio containing @split_at reaches @new_order. This is done when
+ * uniform_split is false.
+ *
+ * The high level flow for these two methods is:
+ * 1. uniform split: a single __split_folio_to_order() is called to split the
+ *    @folio into @new_order folios, then we traverse all the resulting folios
+ *    one by one in PFN ascending order and perform the stats, unfreeze,
+ *    list-add, and file mapping index operations on each.
+ * 2. non-uniform split: in general, folio_order - @new_order calls to
+ *    __split_folio_to_order() are made in a for loop to split the @folio
+ *    to one lower order at a time. The resulting small folios are processed
+ *    like those in the traversal in 1, except the one containing @split_at,
+ *    which is split again in the next iteration.
+ *
+ * After splitting, the caller's folio reference will be transferred to the
+ * folio containing @split_at. The caller needs to unlock and/or free after-split
+ * folios if necessary.
+ *
+ * For !uniform_split, when -ENOMEM is returned, the original folio might be
+ * split. The caller needs to check the input folio.
+ */
+static int __split_unmapped_folio(struct folio *folio, int new_order,
+ struct page *split_at, struct xa_state *xas,
+ struct address_space *mapping, bool uniform_split)
{
- struct folio *folio = page_folio(page);
- struct page *head = &folio->page;
- struct lruvec *lruvec;
- struct address_space *swap_cache = NULL;
- unsigned long offset = 0;
- int i, nr_dropped = 0;
- unsigned int new_nr = 1 << new_order;
int order = folio_order(folio);
- unsigned int nr = 1 << order;
+ int start_order = uniform_split ? new_order : order - 1;
+ bool stop_split = false;
+ struct folio *next;
+ int split_order;
+ int ret = 0;
- /* complete memcg works before add pages to LRU */
- split_page_memcg(head, order, new_order);
+ if (folio_test_anon(folio))
+ mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
- if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
- offset = swap_cache_index(folio->swap);
- swap_cache = swap_address_space(folio->swap);
- xa_lock(&swap_cache->i_pages);
- }
+ /*
+ * split to new_order one order at a time. For uniform split,
+ * folio is split to new_order directly.
+ */
+ for (split_order = start_order;
+ split_order >= new_order && !stop_split;
+ split_order--) {
+ struct folio *end_folio = folio_next(folio);
+ int old_order = folio_order(folio);
+ struct folio *new_folio;
+
+ /* order-1 anonymous folio is not supported */
+ if (folio_test_anon(folio) && split_order == 1)
+ continue;
+ if (uniform_split && split_order != new_order)
+ continue;
- /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
- lruvec = folio_lruvec_lock(folio);
-
- ClearPageHasHWPoisoned(head);
-
- for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
- struct folio *tail;
- __split_huge_page_tail(folio, i, lruvec, list, new_order);
- tail = page_folio(head + i);
- /* Some pages can be beyond EOF: drop them from page cache */
- if (tail->index >= end) {
- if (shmem_mapping(folio->mapping))
- nr_dropped++;
- else if (folio_test_clear_dirty(tail))
- folio_account_cleaned(tail,
- inode_to_wb(folio->mapping->host));
- __filemap_remove_folio(tail, NULL);
- folio_put(tail);
- } else if (!folio_test_anon(folio)) {
- __xa_store(&folio->mapping->i_pages, tail->index,
- tail, 0);
- } else if (swap_cache) {
- __xa_store(&swap_cache->i_pages, offset + i,
- tail, 0);
+ if (mapping) {
+ /*
+ * uniform split has xas_split_alloc() called before
+ * irq is disabled to allocate enough memory, whereas
+ * non-uniform split can handle ENOMEM.
+ */
+ if (uniform_split)
+ xas_split(xas, folio, old_order);
+ else {
+ xas_set_order(xas, folio->index, split_order);
+ xas_try_split(xas, folio, old_order);
+ if (xas_error(xas)) {
+ ret = xas_error(xas);
+ stop_split = true;
+ }
+ }
}
- }
- if (!new_order)
- ClearPageCompound(head);
- else {
- struct folio *new_folio = (struct folio *)head;
+ if (!stop_split) {
+ folio_split_memcg_refs(folio, old_order, split_order);
+ split_page_owner(&folio->page, old_order, split_order);
+ pgalloc_tag_split(folio, old_order, split_order);
- folio_set_order(new_folio, new_order);
- }
- unlock_page_lruvec(lruvec);
- /* Caller disabled irqs, so they are still disabled here */
-
- split_page_owner(head, order, new_order);
- pgalloc_tag_split(folio, order, new_order);
+ __split_folio_to_order(folio, old_order, split_order);
+ }
- /* See comment in __split_huge_page_tail() */
- if (folio_test_anon(folio)) {
- /* Additional pin to swap cache */
- if (folio_test_swapcache(folio)) {
- folio_ref_add(folio, 1 + new_nr);
- xa_unlock(&swap_cache->i_pages);
- } else {
- folio_ref_inc(folio);
+ /*
+ * Iterate through after-split folios and update folio stats.
+ * But in buddy allocator like split, the folio
+ * containing the specified page is skipped until its order
+		 * is new_order, since the folio will be worked on in the next
+ * iteration.
+ */
+ for (new_folio = folio; new_folio != end_folio; new_folio = next) {
+ next = folio_next(new_folio);
+ /*
+			 * For buddy allocator like split, the new_folio
+			 * containing @split_at could be split again, thus
+			 * do not change stats yet. Wait until new_folio's
+			 * order is @new_order or stop_split is set to true
+			 * by the xas_try_split() failure above.
+ */
+ if (new_folio == page_folio(split_at)) {
+ folio = new_folio;
+ if (split_order != new_order && !stop_split)
+ continue;
+ }
+ if (folio_test_anon(new_folio))
+ mod_mthp_stat(folio_order(new_folio),
+ MTHP_STAT_NR_ANON, 1);
}
- } else {
- /* Additional pin to page cache */
- folio_ref_add(folio, 1 + new_nr);
- xa_unlock(&folio->mapping->i_pages);
}
- local_irq_enable();
-
- if (nr_dropped)
- shmem_uncharge(folio->mapping->host, nr_dropped);
- remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0);
- /*
- * set page to its compound_head when split to non order-0 pages, so
- * we can skip unlocking it below, since PG_locked is transferred to
- * the compound_head of the page and the caller will unlock it.
- */
- if (new_order)
- page = compound_head(page);
-
- for (i = 0; i < nr; i += new_nr) {
- struct page *subpage = head + i;
- struct folio *new_folio = page_folio(subpage);
- if (subpage == page)
- continue;
- folio_unlock(new_folio);
+ return ret;
+}
+bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
+ bool warns)
+{
+ if (folio_test_anon(folio)) {
+ /* order-1 is not supported for anonymous THP. */
+ VM_WARN_ONCE(warns && new_order == 1,
+ "Cannot split to order-1 folio");
+ if (new_order == 1)
+ return false;
+ } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ !mapping_large_folio_support(folio->mapping)) {
/*
- * Subpages may be freed if there wasn't any mapping
- * like if add_to_swap() is running on a lru page that
- * had its mapping zapped. And freeing these pages
- * requires taking the lru_lock so we do the put_page
- * of the tail pages after the split is complete.
+ * No split if the file system does not support large folio.
+ * Note that we might still have THPs in such mappings due to
+ * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
+ * does not actually support large folios properly.
*/
- free_page_and_swap_cache(subpage);
+ VM_WARN_ONCE(warns,
+ "Cannot split file folio to non-0 order");
+ return false;
+ }
+
+ /* Only swapping a whole PMD-mapped folio is supported */
+ if (folio_test_swapcache(folio)) {
+ VM_WARN_ONCE(warns,
+ "Cannot split swapcache folio to non-0 order");
+ return false;
}
+
+ return true;
}
-/* Racy check whether the huge page can be split */
-bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
+/* See comments in non_uniform_split_supported() */
+bool uniform_split_supported(struct folio *folio, unsigned int new_order,
+ bool warns)
{
- int extra_pins;
+ if (folio_test_anon(folio)) {
+ VM_WARN_ONCE(warns && new_order == 1,
+ "Cannot split to order-1 folio");
+ if (new_order == 1)
+ return false;
+ } else if (new_order) {
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ !mapping_large_folio_support(folio->mapping)) {
+ VM_WARN_ONCE(warns,
+ "Cannot split file folio to non-0 order");
+ return false;
+ }
+ }
- /* Additional pins from page cache */
- if (folio_test_anon(folio))
- extra_pins = folio_test_swapcache(folio) ?
- folio_nr_pages(folio) : 0;
- else
- extra_pins = folio_nr_pages(folio);
- if (pextra_pins)
- *pextra_pins = extra_pins;
- return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
- caller_pins;
+ if (new_order && folio_test_swapcache(folio)) {
+ VM_WARN_ONCE(warns,
+ "Cannot split swapcache folio to non-0 order");
+ return false;
+ }
+
+ return true;
}
/*
- * This function splits a large folio into smaller folios of order @new_order.
- * @page can point to any page of the large folio to split. The split operation
- * does not change the position of @page.
- *
- * Prerequisites:
- *
- * 1) The caller must hold a reference on the @page's owning folio, also known
- * as the large folio.
+ * __folio_split: split a folio at @split_at to a @new_order folio
+ * @folio: folio to split
+ * @new_order: the order of the new folio
+ * @split_at: a page within the new folio
+ * @lock_at: a page within @folio to be left locked to caller
+ * @list: after-split folios will be put on it if non-NULL
+ * @uniform_split: perform uniform split or not (non-uniform split)
*
- * 2) The large folio must be locked.
+ * It calls __split_unmapped_folio() to perform uniform and non-uniform split.
+ * It is in charge of checking whether the split is supported or not and
+ * preparing @folio for __split_unmapped_folio().
*
- * 3) The folio must not be pinned. Any unexpected folio references, including
- * GUP pins, will result in the folio not getting split; instead, the caller
- * will receive an -EAGAIN.
- *
- * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
- * supported for non-file-backed folios, because folio->_deferred_list, which
- * is used by partially mapped folios, is stored in subpage 2, but an order-1
- * folio only has subpages 0 and 1. File-backed order-1 folios are supported,
- * since they do not use _deferred_list.
- *
- * After splitting, the caller's folio reference will be transferred to @page,
- * resulting in a raised refcount of @page after this call. The other pages may
- * be freed if they are not mapped.
- *
- * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
- *
- * Pages in @new_order will inherit the mapping, flags, and so on from the
- * huge page.
+ * After splitting, the after-split folio containing @lock_at remains locked
+ * and others are unlocked:
+ * 1. for uniform split, @lock_at points to one of @folio's subpages;
+ * 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio.
*
- * Returns 0 if the huge page was split successfully.
- *
- * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
- * the folio was concurrently removed from the page cache.
- *
- * Returns -EBUSY when trying to split the huge zeropage, if the folio is
- * under writeback, if fs-specific folio metadata cannot currently be
- * released, or if some unexpected race happened (e.g., anon VMA disappeared,
- * truncation).
- *
- * Callers should ensure that the order respects the address space mapping
- * min-order if one is set for non-anonymous folios.
- *
- * Returns -EINVAL when trying to split to an order that is incompatible
- * with the folio. Splitting to order 0 is compatible with all folios.
+ * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
+ * split but not to @new_order, the caller needs to check)
*/
-int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
- unsigned int new_order)
+static int __folio_split(struct folio *folio, unsigned int new_order,
+ struct page *split_at, struct page *lock_at,
+ struct list_head *list, bool uniform_split)
{
- struct folio *folio = page_folio(page);
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
- /* reset xarray order to new order after split */
- XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
+ XA_STATE(xas, &folio->mapping->i_pages, folio->index);
+ struct folio *end_folio = folio_next(folio);
bool is_anon = folio_test_anon(folio);
struct address_space *mapping = NULL;
struct anon_vma *anon_vma = NULL;
int order = folio_order(folio);
+ struct folio *new_folio, *next;
+ int nr_shmem_dropped = 0;
+ int remap_flags = 0;
int extra_pins, ret;
pgoff_t end;
bool is_hzp;
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
+
+ if (folio != page_folio(split_at) || folio != page_folio(lock_at))
+ return -EINVAL;
+
+ /*
+ * Folios that just got truncated cannot get split. Signal to the
+ * caller that there was a race.
+ *
+ * TODO: this will also currently refuse shmem folios that are in the
+ * swapcache.
+ */
+ if (!is_anon && !folio->mapping)
+ return -EBUSY;
if (new_order >= folio_order(folio))
return -EINVAL;
- if (is_anon) {
- /* order-1 is not supported for anonymous THP. */
- if (new_order == 1) {
- VM_WARN_ONCE(1, "Cannot split to order-1 folio");
- return -EINVAL;
- }
- } else if (new_order) {
- /* Split shmem folio to non-zero order not supported */
- if (shmem_mapping(folio->mapping)) {
- VM_WARN_ONCE(1,
- "Cannot split shmem folio to non-0 order");
- return -EINVAL;
- }
- /*
- * No split if the file system does not support large folio.
- * Note that we might still have THPs in such mappings due to
- * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
- * does not actually support large folios properly.
- */
- if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
- !mapping_large_folio_support(folio->mapping)) {
- VM_WARN_ONCE(1,
- "Cannot split file folio to non-0 order");
- return -EINVAL;
- }
- }
+ if (uniform_split && !uniform_split_supported(folio, new_order, true))
+ return -EINVAL;
- /* Only swapping a whole PMD-mapped folio is supported */
- if (folio_test_swapcache(folio) && new_order)
+ if (!uniform_split &&
+ !non_uniform_split_supported(folio, new_order, true))
return -EINVAL;
is_hzp = is_huge_zero_folio(folio);
@@ -3506,7 +3669,6 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
ret = -EBUSY;
goto out;
}
- end = -1;
mapping = NULL;
anon_vma_lock_write(anon_vma);
} else {
@@ -3514,17 +3676,8 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
gfp_t gfp;
mapping = folio->mapping;
-
- /* Truncated ? */
- if (!mapping) {
- ret = -EBUSY;
- goto out;
- }
-
min_order = mapping_min_folio_order(folio->mapping);
if (new_order < min_order) {
- VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u",
- min_order);
ret = -EINVAL;
goto out;
}
@@ -3537,21 +3690,24 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
goto out;
}
- xas_split_alloc(&xas, folio, folio_order(folio), gfp);
- if (xas_error(&xas)) {
- ret = xas_error(&xas);
- goto out;
+ if (uniform_split) {
+ xas_set_order(&xas, folio->index, new_order);
+ xas_split_alloc(&xas, folio, folio_order(folio), gfp);
+ if (xas_error(&xas)) {
+ ret = xas_error(&xas);
+ goto out;
+ }
}
anon_vma = NULL;
i_mmap_lock_read(mapping);
/*
- *__split_huge_page() may need to trim off pages beyond EOF:
- * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
- * which cannot be nested inside the page tree lock. So note
- * end now: i_size itself may be changed at any moment, but
- * folio lock is good enough to serialize the trimming.
+ *__split_unmapped_folio() may need to trim off pages beyond
+ * EOF: but on 32-bit, i_size_read() takes an irq-unsafe
+ * seqlock, which cannot be nested inside the page tree lock.
+ * So note end now: i_size itself may be changed at any moment,
+ * but folio lock is good enough to serialize the trimming.
*/
end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
if (shmem_mapping(mapping))
@@ -3578,13 +3734,19 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
*/
xas_lock(&xas);
xas_reset(&xas);
- if (xas_load(&xas) != folio)
+ if (xas_load(&xas) != folio) {
+ ret = -EAGAIN;
goto fail;
+ }
}
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
if (folio_ref_freeze(folio, 1 + extra_pins)) {
+ struct address_space *swap_cache = NULL;
+ struct lruvec *lruvec;
+ int expected_refs;
+
if (folio_order(folio) > 1 &&
!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
@@ -3605,7 +3767,6 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
if (mapping) {
int nr = folio_nr_pages(folio);
- xas_split(&xas, folio, folio_order(folio));
if (folio_test_pmd_mappable(folio) &&
new_order < HPAGE_PMD_ORDER) {
if (folio_test_swapbacked(folio)) {
@@ -3619,21 +3780,122 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
}
}
- if (is_anon) {
- mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
- mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order));
+ if (folio_test_swapcache(folio)) {
+ if (mapping) {
+ VM_WARN_ON_ONCE_FOLIO(mapping, folio);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ swap_cache = swap_address_space(folio->swap);
+ xa_lock(&swap_cache->i_pages);
}
- __split_huge_page(page, list, end, new_order);
- ret = 0;
+
+ /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
+ lruvec = folio_lruvec_lock(folio);
+
+ ret = __split_unmapped_folio(folio, new_order, split_at, &xas,
+ mapping, uniform_split);
+
+ /*
+ * Unfreeze after-split folios and put them back to the right
+		 * list. @folio should be kept frozen until page cache
+ * entries are updated with all the other after-split folios
+ * to prevent others seeing stale page cache entries.
+ * As a result, new_folio starts from the next folio of
+ * @folio.
+ */
+ for (new_folio = folio_next(folio); new_folio != end_folio;
+ new_folio = next) {
+ unsigned long nr_pages = folio_nr_pages(new_folio);
+
+ next = folio_next(new_folio);
+
+ expected_refs = folio_expected_ref_count(new_folio) + 1;
+ folio_ref_unfreeze(new_folio, expected_refs);
+
+ lru_add_split_folio(folio, new_folio, lruvec, list);
+
+ /*
+ * Anonymous folio with swap cache.
+ * NOTE: shmem in swap cache is not supported yet.
+ */
+ if (swap_cache) {
+ __xa_store(&swap_cache->i_pages,
+ swap_cache_index(new_folio->swap),
+ new_folio, 0);
+ continue;
+ }
+
+ /* Anonymous folio without swap cache */
+ if (!mapping)
+ continue;
+
+ /* Add the new folio to the page cache. */
+ if (new_folio->index < end) {
+ __xa_store(&mapping->i_pages, new_folio->index,
+ new_folio, 0);
+ continue;
+ }
+
+ /* Drop folio beyond EOF: ->index >= end */
+ if (shmem_mapping(mapping))
+ nr_shmem_dropped += nr_pages;
+ else if (folio_test_clear_dirty(new_folio))
+ folio_account_cleaned(
+ new_folio, inode_to_wb(mapping->host));
+ __filemap_remove_folio(new_folio, NULL);
+ folio_put_refs(new_folio, nr_pages);
+ }
+ /*
+ * Unfreeze @folio only after all page cache entries, which
+ * used to point to it, have been updated with new folios.
+ * Otherwise, a parallel folio_try_get() can grab @folio
+ * and its caller can see stale page cache entries.
+ */
+ expected_refs = folio_expected_ref_count(folio) + 1;
+ folio_ref_unfreeze(folio, expected_refs);
+
+ unlock_page_lruvec(lruvec);
+
+ if (swap_cache)
+ xa_unlock(&swap_cache->i_pages);
} else {
spin_unlock(&ds_queue->split_queue_lock);
-fail:
- if (mapping)
- xas_unlock(&xas);
- local_irq_enable();
- remap_page(folio, folio_nr_pages(folio), 0);
ret = -EAGAIN;
}
+fail:
+ if (mapping)
+ xas_unlock(&xas);
+
+ local_irq_enable();
+
+ if (nr_shmem_dropped)
+ shmem_uncharge(mapping->host, nr_shmem_dropped);
+
+ if (!ret && is_anon)
+ remap_flags = RMP_USE_SHARED_ZEROPAGE;
+ remap_page(folio, 1 << order, remap_flags);
+
+ /*
+ * Unlock all after-split folios except the one containing
+ * @lock_at page. If @folio is not split, it will be kept locked.
+ */
+ for (new_folio = folio; new_folio != end_folio; new_folio = next) {
+ next = folio_next(new_folio);
+ if (new_folio == page_folio(lock_at))
+ continue;
+
+ folio_unlock(new_folio);
+ /*
+		 * Subpages may be freed if there wasn't any mapping,
+		 * e.g. if add_to_swap() is running on an LRU page that
+		 * had its mapping zapped. Freeing these pages requires
+		 * taking the lru_lock, so we do the put_page of the
+		 * tail pages after the split is complete.
+ */
+ free_folio_and_swap_cache(new_folio);
+ }
out_unlock:
if (anon_vma) {
@@ -3650,6 +3912,90 @@ out:
return ret;
}
+/*
+ * This function splits a large folio into smaller folios of order @new_order.
+ * @page can point to any page of the large folio to split. The split operation
+ * does not change the position of @page.
+ *
+ * Prerequisites:
+ *
+ * 1) The caller must hold a reference on the @page's owning folio, also known
+ * as the large folio.
+ *
+ * 2) The large folio must be locked.
+ *
+ * 3) The folio must not be pinned. Any unexpected folio references, including
+ * GUP pins, will result in the folio not getting split; instead, the caller
+ * will receive an -EAGAIN.
+ *
+ * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
+ * supported for non-file-backed folios, because folio->_deferred_list, which
+ * is used by partially mapped folios, is stored in subpage 2, but an order-1
+ * folio only has subpages 0 and 1. File-backed order-1 folios are supported,
+ * since they do not use _deferred_list.
+ *
+ * After splitting, the caller's folio reference will be transferred to @page,
+ * resulting in a raised refcount of @page after this call. The other pages may
+ * be freed if they are not mapped.
+ *
+ * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
+ *
+ * Pages in @new_order will inherit the mapping, flags, and so on from the
+ * huge page.
+ *
+ * Returns 0 if the huge page was split successfully.
+ *
+ * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
+ * the folio was concurrently removed from the page cache.
+ *
+ * Returns -EBUSY when trying to split the huge zeropage, if the folio is
+ * under writeback, if fs-specific folio metadata cannot currently be
+ * released, or if some unexpected race happened (e.g., anon VMA disappeared,
+ * truncation).
+ *
+ * Callers should ensure that the order respects the address space mapping
+ * min-order if one is set for non-anonymous folios.
+ *
+ * Returns -EINVAL when trying to split to an order that is incompatible
+ * with the folio. Splitting to order 0 is compatible with all folios.
+ */
+int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+ unsigned int new_order)
+{
+ struct folio *folio = page_folio(page);
+
+ return __folio_split(folio, new_order, &folio->page, page, list, true);
+}
+
+/*
+ * folio_split: split a folio at @split_at to a @new_order folio
+ * @folio: folio to split
+ * @new_order: the order of the new folio
+ * @split_at: a page within the new folio
+ *
+ * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
+ * split but not to @new_order, the caller needs to check)
+ *
+ * It has the same prerequisites and returns as
+ * split_huge_page_to_list_to_order().
+ *
+ * Split a folio at @split_at to a new_order folio, leaving the
+ * remaining subpages of the original folio as large as possible. For example,
+ * in the case of splitting an order-9 folio at its third order-3 subpage to
+ * an order-3 folio, there are 2^(9-3)=64 order-3 subpages in the order-9 folio.
+ * After the split, there will be a group of folios with different orders and
+ * the new folio containing @split_at is marked in braces:
+ * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8].
+ *
+ * After the split, the folio is left locked for the caller.
+ */
+int folio_split(struct folio *folio, unsigned int new_order,
+ struct page *split_at, struct list_head *list)
+{
+ return __folio_split(folio, new_order, split_at, &folio->page, list,
+ false);
+}
+
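/*
 * Illustrative userspace sketch (not part of this patch) of the buddy
 * allocator like split described in the folio_split() comment above: the
 * range containing 'split_at' is repeatedly halved until it reaches
 * new_order, and the other half of each step is left as large as possible.
 * Sorted by start index, the pieces printed for split_at = 16 match the
 * example list above: [order-4, {order-3}, order-3, order-5, order-6,
 * order-7, order-8].
 */
#include <stdio.h>

int main(void)
{
	int old_order = 9, new_order = 3;
	long split_at = 16;	/* page index inside the third order-3 chunk */
	long start = 0;		/* start index of the piece still being split */
	int order;

	for (order = old_order - 1; order >= new_order; order--) {
		long half = 1L << order;

		if (split_at < start + half) {
			/* keep splitting the low half; the high half is final */
			printf("start %3ld: order-%d\n", start + half, order);
		} else {
			/* keep splitting the high half; the low half is final */
			printf("start %3ld: order-%d\n", start, order);
			start += half;
		}
	}
	printf("start %3ld: order-%d (contains split_at)\n", start, new_order);
	return 0;
}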
int min_order_for_split(struct folio *folio)
{
if (folio_test_anon(folio))
@@ -3666,12 +4012,7 @@ int min_order_for_split(struct folio *folio)
int split_folio_to_list(struct folio *folio, struct list_head *list)
{
- int ret = min_order_for_split(folio);
-
- if (ret < 0)
- return ret;
-
- return split_huge_page_to_list_to_order(&folio->page, list, ret);
+ return split_huge_page_to_list_to_order(&folio->page, list, 0);
}
/*
@@ -3734,7 +4075,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
/*
* Exclude swapcache: originally to avoid a corrupt deferred split
- * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
+ * queue. Nowadays that is fully prevented by memcg1_swapout();
* but if page reclaim is already handling the same folio, it is
* unnecessary to handle it again in the shrinker, so excluding
* swapcache here may still be a useful optimization.
@@ -3784,32 +4125,26 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
static bool thp_underused(struct folio *folio)
{
int num_zero_pages = 0, num_filled_pages = 0;
- void *kaddr;
int i;
if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
return false;
+ if (folio_contain_hwpoisoned_page(folio))
+ return false;
+
for (i = 0; i < folio_nr_pages(folio); i++) {
- kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
- if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
- num_zero_pages++;
- if (num_zero_pages > khugepaged_max_ptes_none) {
- kunmap_local(kaddr);
+ if (pages_identical(folio_page(folio, i), ZERO_PAGE(0))) {
+ if (++num_zero_pages > khugepaged_max_ptes_none)
return true;
- }
} else {
/*
* Another path for early exit once the number
* of non-zero filled pages exceeds threshold.
*/
- num_filled_pages++;
- if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) {
- kunmap_local(kaddr);
+ if (++num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none)
return false;
- }
}
- kunmap_local(kaddr);
}
return false;
}
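/*
 * Illustrative userspace sketch (not part of this patch) of the counting
 * logic in thp_underused() above: walk a PMD-sized buffer in 4 KiB steps,
 * count zero-filled pages, and bail out early in either direction.
 * 'max_ptes_none' mirrors khugepaged_max_ptes_none; the value used in
 * main() is hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SKETCH_PAGE_SIZE	4096UL
#define SKETCH_HPAGE_NR		512

static const unsigned char zero_page[SKETCH_PAGE_SIZE];

static bool buffer_underused(const unsigned char *buf, int max_ptes_none)
{
	int num_zero_pages = 0, num_filled_pages = 0;
	int i;

	for (i = 0; i < SKETCH_HPAGE_NR; i++) {
		if (!memcmp(buf + i * SKETCH_PAGE_SIZE, zero_page,
			    SKETCH_PAGE_SIZE)) {
			if (++num_zero_pages > max_ptes_none)
				return true;	/* mostly zero: underused */
		} else {
			if (++num_filled_pages >=
			    SKETCH_HPAGE_NR - max_ptes_none)
				return false;	/* enough non-zero data */
		}
	}
	return false;
}

int main(void)
{
	unsigned char *buf = calloc(SKETCH_HPAGE_NR, SKETCH_PAGE_SIZE);

	if (!buf)
		return 1;
	buf[0] = 1;	/* one non-zero page, the rest stays zero-filled */
	printf("underused: %d\n", buffer_underused(buf, 256));
	free(buf);
	return 0;
}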
@@ -3969,7 +4304,8 @@ static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
}
static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
- unsigned long vaddr_end, unsigned int new_order)
+ unsigned long vaddr_end, unsigned int new_order,
+ long in_folio_offset)
{
int ret = 0;
struct task_struct *task;
@@ -4053,8 +4389,16 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
if (!folio_test_anon(folio) && folio->mapping != mapping)
goto unlock;
- if (!split_folio_to_order(folio, target_order))
- split++;
+ if (in_folio_offset < 0 ||
+ in_folio_offset >= folio_nr_pages(folio)) {
+ if (!split_folio_to_order(folio, target_order))
+ split++;
+ } else {
+ struct page *split_at = folio_page(folio,
+ in_folio_offset);
+ if (!folio_split(folio, target_order, split_at, NULL))
+ split++;
+ }
unlock:
@@ -4077,7 +4421,8 @@ out:
}
static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
- pgoff_t off_end, unsigned int new_order)
+ pgoff_t off_end, unsigned int new_order,
+ long in_folio_offset)
{
struct filename *file;
struct file *candidate;
@@ -4126,8 +4471,15 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
if (folio->mapping != mapping)
goto unlock;
- if (!split_folio_to_order(folio, target_order))
- split++;
+ if (in_folio_offset < 0 || in_folio_offset >= nr_pages) {
+ if (!split_folio_to_order(folio, target_order))
+ split++;
+ } else {
+ struct page *split_at = folio_page(folio,
+ in_folio_offset);
+ if (!folio_split(folio, target_order, split_at, NULL))
+ split++;
+ }
unlock:
folio_unlock(folio);
@@ -4160,6 +4512,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
int pid;
unsigned long vaddr_start, vaddr_end;
unsigned int new_order = 0;
+ long in_folio_offset = -1;
ret = mutex_lock_interruptible(&split_debug_mutex);
if (ret)
@@ -4175,42 +4528,46 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
if (input_buf[0] == '/') {
char *tok;
- char *buf = input_buf;
+ char *tok_buf = input_buf;
char file_path[MAX_INPUT_BUF_SZ];
pgoff_t off_start = 0, off_end = 0;
size_t input_len = strlen(input_buf);
- tok = strsep(&buf, ",");
- if (tok && buf) {
+ tok = strsep(&tok_buf, ",");
+ if (tok && tok_buf) {
strscpy(file_path, tok);
} else {
ret = -EINVAL;
goto out;
}
- ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order);
- if (ret != 2 && ret != 3) {
+ ret = sscanf(tok_buf, "0x%lx,0x%lx,%d,%ld", &off_start, &off_end,
+ &new_order, &in_folio_offset);
+ if (ret != 2 && ret != 3 && ret != 4) {
ret = -EINVAL;
goto out;
}
- ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order);
+ ret = split_huge_pages_in_file(file_path, off_start, off_end,
+ new_order, in_folio_offset);
if (!ret)
ret = input_len;
goto out;
}
- ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order);
+ ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d,%ld", &pid, &vaddr_start,
+ &vaddr_end, &new_order, &in_folio_offset);
if (ret == 1 && pid == 1) {
split_huge_pages_all();
ret = strlen(input_buf);
goto out;
- } else if (ret != 3 && ret != 4) {
+ } else if (ret != 3 && ret != 4 && ret != 5) {
ret = -EINVAL;
goto out;
}
- ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order);
+ ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order,
+ in_folio_offset);
if (!ret)
ret = strlen(input_buf);
out:
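/*
 * Illustrative userspace sketch (not part of this patch): exercising the
 * debugfs interface parsed above. Assuming debugfs is mounted at
 * /sys/kernel/debug and the caller has sufficient privileges, the pid form
 * is "<pid>,0x<vaddr_start>,0x<vaddr_end>[,<new_order>[,<in_folio_offset>]]"
 * and the file form is
 * "<path>,0x<off_start>,0x<off_end>[,<new_order>[,<in_folio_offset>]]".
 * A non-negative in_folio_offset selects folio_split() instead of a
 * uniform split. The pid and addresses below are hypothetical.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/split_huge_pages", "w");

	if (!f) {
		perror("split_huge_pages");
		return 1;
	}
	/* pid 1234, a 16 MiB range, split to order 3 at in-folio offset 16 */
	fprintf(f, "1234,0x700000000000,0x700001000000,3,16");
	return fclose(f) ? 1 : 0;
}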
@@ -4299,7 +4656,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
entry = pmd_to_swp_entry(*pvmw->pmd);
folio_get(folio);
- pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
+ pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
if (is_writable_migration_entry(entry))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 15ae955c7cbc..ef8b841286aa 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -14,15 +14,18 @@
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
+#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
+#include <linux/minmax.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
+#include <linux/string_choices.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -40,6 +43,7 @@
#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
+#include <asm/setup.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
@@ -48,18 +52,34 @@
#include <linux/page_owner.h>
#include "internal.h"
#include "hugetlb_vmemmap.h"
+#include "hugetlb_cma.h"
+#include <linux/page-isolation.h>
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
-#ifdef CONFIG_CMA
-static struct cma *hugetlb_cma[MAX_NUMNODES];
-static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
-#endif
-static unsigned long hugetlb_cma_size __initdata;
-
+__initdata nodemask_t hugetlb_bootmem_nodes;
__initdata struct list_head huge_boot_pages[MAX_NUMNODES];
+static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata;
+
+/*
+ * Due to ordering constraints across the init code for various
+ * architectures, hugetlb hstate cmdline parameters can't simply
+ * be early_param. early_param might call the setup function
+ * before valid hugetlb page sizes are determined, leading to
+ * incorrect rejection of valid hugepagesz= options.
+ *
+ * So, record the parameters early and consume them whenever the
+ * init code is ready for them, by calling hugetlb_parse_params().
+ */
+
+/* one (hugepagesz=,hugepages=) pair per hstate, one default_hugepagesz */
+#define HUGE_MAX_CMDLINE_ARGS (2 * HUGE_MAX_HSTATE + 1)
+struct hugetlb_cmdline {
+ char *val;
+ int (*setup)(char *val);
+};
/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
@@ -67,6 +87,21 @@ static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;
static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
+static unsigned long hugepage_allocation_threads __initdata;
+
+static char hstate_cmdline_buf[COMMAND_LINE_SIZE] __initdata;
+static int hstate_cmdline_index __initdata;
+static struct hugetlb_cmdline hugetlb_params[HUGE_MAX_CMDLINE_ARGS] __initdata;
+static int hugetlb_param_index __initdata;
+static __init int hugetlb_add_param(char *s, int (*setup)(char *val));
+static __init void hugetlb_parse_params(void);
+
+#define hugetlb_early_param(str, func) \
+static __init int func##args(char *s) \
+{ \
+ return hugetlb_add_param(s, func); \
+} \
+early_param(str, func##args)
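/*
 * Illustrative userspace sketch (not kernel code) of the record-now,
 * parse-later pattern set up above: parameter strings are stashed together
 * with their handler at "early param" time and only consumed once the rest
 * of the init code is ready for them. All names below are hypothetical.
 */
#include <stdio.h>

#define MAX_PARAMS 8

struct cmdline_param {
	const char *val;
	int (*setup)(const char *val);
};

static struct cmdline_param params[MAX_PARAMS];
static int nr_params;

static int add_param(const char *val, int (*setup)(const char *val))
{
	if (nr_params >= MAX_PARAMS)
		return -1;
	params[nr_params].val = val;
	params[nr_params].setup = setup;
	nr_params++;
	return 0;
}

static void parse_params(void)
{
	int i;

	for (i = 0; i < nr_params; i++)
		params[i].setup(params[i].val);
}

static int setup_pagesize(const char *val)
{
	printf("pagesize = %s\n", val);	/* validated here, not at record time */
	return 0;
}

int main(void)
{
	add_param("2M", setup_pagesize);	/* "early param" time */
	parse_params();				/* once init is ready */
	return 0;
}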
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -87,17 +122,16 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end, bool take_locks);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
static void hugetlb_free_folio(struct folio *folio)
{
-#ifdef CONFIG_CMA
- int nid = folio_nid(folio);
-
- if (cma_free_folio(hugetlb_cma[nid], folio))
+ if (folio_test_hugetlb_cma(folio)) {
+ hugetlb_cma_free_folio(folio);
return;
-#endif
+ }
+
folio_put(folio);
}
@@ -251,11 +285,6 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
return ret;
}
-static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
-{
- return HUGETLBFS_SB(inode->i_sb)->spool;
-}
-
static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
return subpool_inode(file_inode(vma->vm_file));
@@ -1218,7 +1247,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
/*
* Reset and decrement one ref on hugepage private reservation.
* Called with mm->mmap_lock writer semaphore held.
- * This function should be only used by move_vma() and operate on
+ * This function should be only used by mremap and operate on
* same sized vma. It should never come here with last ref on the
* reservation.
*/
@@ -1246,69 +1275,6 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
hugetlb_dup_vma_private(vma);
}
-/* Returns true if the VMA has associated reserve pages */
-static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
-{
- if (vma->vm_flags & VM_NORESERVE) {
- /*
- * This address is already reserved by other process(chg == 0),
- * so, we should decrement reserved count. Without decrementing,
- * reserve count remains after releasing inode, because this
- * allocated page will go into page cache and is regarded as
- * coming from reserved pool in releasing step. Currently, we
- * don't have any other solution to deal with this situation
- * properly, so add work-around here.
- */
- if (vma->vm_flags & VM_MAYSHARE && chg == 0)
- return true;
- else
- return false;
- }
-
- /* Shared mappings always use reserves */
- if (vma->vm_flags & VM_MAYSHARE) {
- /*
- * We know VM_NORESERVE is not set. Therefore, there SHOULD
- * be a region map for all pages. The only situation where
- * there is no region map is if a hole was punched via
- * fallocate. In this case, there really are no reserves to
- * use. This situation is indicated if chg != 0.
- */
- if (chg)
- return false;
- else
- return true;
- }
-
- /*
- * Only the process that called mmap() has reserves for
- * private mappings.
- */
- if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
- /*
- * Like the shared case above, a hole punch or truncate
- * could have been performed on the private mapping.
- * Examine the value of chg to determine if reserves
- * actually exist or were previously consumed.
- * Very Subtle - The value of chg comes from a previous
- * call to vma_needs_reserves(). The reserve map for
- * private mappings has different (opposite) semantics
- * than that of shared mappings. vma_needs_reserves()
- * has already taken this difference in semantics into
- * account. Therefore, the meaning of chg is the same
- * as in the shared case above. Code could easily be
- * combined, but keeping it separate draws attention to
- * subtle differences.
- */
- if (chg)
- return false;
- else
- return true;
- }
-
- return false;
-}
-
static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
{
int nid = folio_nid(folio);
@@ -1336,6 +1302,9 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
if (folio_test_hwpoison(folio))
continue;
+ if (is_migrate_isolate_page(&folio->page))
+ continue;
+
list_move(&folio->lru, &h->hugepage_activelist);
folio_ref_unfreeze(folio, 1);
folio_clear_hugetlb_freed(folio);
@@ -1394,7 +1363,7 @@ static unsigned long available_huge_pages(struct hstate *h)
static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
struct vm_area_struct *vma,
- unsigned long address, long chg)
+ unsigned long address, long gbl_chg)
{
struct folio *folio = NULL;
struct mempolicy *mpol;
@@ -1403,11 +1372,10 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
int nid;
/*
- * A child process with MAP_PRIVATE mappings created by their parent
- * have no page reserves. This check ensures that reservations are
- * not "stolen". The child may still get SIGKILLed
+ * gbl_chg==1 means the allocation requires a new page that was not
+	 * reserved before. Make sure there's at least one free page.
*/
- if (!vma_has_reserves(vma, chg) && !available_huge_pages(h))
+ if (gbl_chg && !available_huge_pages(h))
goto err;
gfp_mask = htlb_alloc_mask(h);
@@ -1425,11 +1393,6 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
nid, nodemask);
- if (folio && vma_has_reserves(vma, chg)) {
- folio_set_hugetlb_restore_reserve(folio);
- h->resv_huge_pages--;
- }
-
mpol_cond_put(mpol);
return folio;
@@ -1520,27 +1483,11 @@ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
retry:
- folio = NULL;
-#ifdef CONFIG_CMA
- {
- int node;
-
- if (hugetlb_cma[nid])
- folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask);
-
- if (!folio && !(gfp_mask & __GFP_THISNODE)) {
- for_each_node_mask(node, *nodemask) {
- if (node == nid || !hugetlb_cma[node])
- continue;
-
- folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask);
- if (folio)
- break;
- }
- }
- }
-#endif
+ folio = hugetlb_cma_alloc_folio(h, gfp_mask, nid, nodemask);
if (!folio) {
+ if (hugetlb_cma_exclusive_alloc())
+ return NULL;
+
folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask);
if (!folio)
return NULL;
@@ -1699,7 +1646,6 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
folio_ref_unfreeze(folio, 1);
- INIT_LIST_HEAD(&folio->_deferred_list);
hugetlb_free_folio(folio);
}
@@ -2001,7 +1947,6 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
int order = huge_page_order(h);
struct folio *folio;
bool alloc_try_hard = true;
- bool retry = true;
/*
* By default we always try hard to allocate the folio with
@@ -2016,22 +1961,8 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
-retry:
- folio = __folio_alloc(gfp_mask, order, nid, nmask);
- /* Ensure hugetlb folio won't have large_rmappable flag set. */
- if (folio)
- folio_clear_large_rmappable(folio);
- if (folio && !folio_ref_freeze(folio, 1)) {
- folio_put(folio);
- if (retry) { /* retry once */
- retry = false;
- goto retry;
- }
- /* WOW! twice in a row. */
- pr_warn("HugeTLB unexpected inflated folio ref count\n");
- folio = NULL;
- }
+ folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask);
/*
* If we did not specify __GFP_RETRY_MAYFAIL, but still got a
@@ -2200,6 +2131,8 @@ retry:
if (!folio_ref_count(folio)) {
struct hstate *h = folio_hstate(folio);
+ bool adjust_surplus = false;
+
if (!available_huge_pages(h))
goto out;
@@ -2222,7 +2155,9 @@ retry:
goto retry;
}
- remove_hugetlb_folio(h, folio, false);
+ if (h->surplus_huge_pages_node[folio_nid(folio)])
+ adjust_surplus = true;
+ remove_hugetlb_folio(h, folio, adjust_surplus);
h->max_huge_pages--;
spin_unlock_irq(&hugetlb_lock);
@@ -2242,7 +2177,7 @@ retry:
rc = hugetlb_vmemmap_restore_folio(h, folio);
if (rc) {
spin_lock_irq(&hugetlb_lock);
- add_hugetlb_folio(h, folio, false);
+ add_hugetlb_folio(h, folio, adjust_surplus);
h->max_huge_pages++;
goto out;
}
@@ -2306,12 +2241,21 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
goto out_unlock;
spin_unlock_irq(&hugetlb_lock);
- folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);
+ folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
if (!folio)
return NULL;
+ hugetlb_vmemmap_optimize_folio(h, folio);
+
spin_lock_irq(&hugetlb_lock);
/*
+ * nr_huge_pages needs to be adjusted within the same lock cycle
+ * as surplus_pages, otherwise it might confuse
+ * persistent_huge_pages() momentarily.
+ */
+ __prep_account_new_huge_page(h, folio_nid(folio));
+
+ /*
* We could have raced with the pool size change.
* Double check that and simply deallocate the new page
* if we would end up overcommiting the surpluses. Abuse
@@ -2392,12 +2336,15 @@ struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid,
struct folio *folio;
spin_lock_irq(&hugetlb_lock);
+ if (!h->resv_huge_pages) {
+ spin_unlock_irq(&hugetlb_lock);
+ return NULL;
+ }
+
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid,
nmask);
- if (folio) {
- VM_BUG_ON(!h->resv_huge_pages);
+ if (folio)
h->resv_huge_pages--;
- }
spin_unlock_irq(&hugetlb_lock);
return folio;
@@ -2457,8 +2404,13 @@ static int gather_surplus_pages(struct hstate *h, long delta)
long i;
long needed, allocated;
bool alloc_ok = true;
- int node;
- nodemask_t *mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
+ nodemask_t *mbind_nodemask, alloc_nodemask;
+
+ mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
+ if (mbind_nodemask)
+ nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed);
+ else
+ alloc_nodemask = cpuset_current_mems_allowed;
lockdep_assert_held(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
@@ -2474,14 +2426,13 @@ retry:
spin_unlock_irq(&hugetlb_lock);
for (i = 0; i < needed; i++) {
folio = NULL;
- for_each_node_mask(node, cpuset_current_mems_allowed) {
- if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) {
- folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
- node, NULL);
- if (folio)
- break;
- }
- }
+
+ /*
+ * It is okay to use NUMA_NO_NODE because we use numa_mem_id()
+ * down the road to pick the current node if that is the case.
+ */
+ folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
+ NUMA_NO_NODE, &alloc_nodemask);
if (!folio) {
alloc_ok = false;
break;
@@ -2835,20 +2786,24 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
/*
* alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
* the old one
- * @h: struct hstate old page belongs to
* @old_folio: Old folio to dissolve
* @list: List to isolate the page in case we need to
* Returns 0 on success, otherwise negated error.
*/
-static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
- struct folio *old_folio, struct list_head *list)
+static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio,
+ struct list_head *list)
{
- gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ gfp_t gfp_mask;
+ struct hstate *h;
int nid = folio_nid(old_folio);
struct folio *new_folio = NULL;
int ret = 0;
retry:
+ /*
+ * The old_folio might have been dissolved from under our feet, so make sure
+ * to carefully check the state under the lock.
+ */
spin_lock_irq(&hugetlb_lock);
if (!folio_test_hugetlb(old_folio)) {
/*
@@ -2863,7 +2818,7 @@ retry:
* Fail with -EBUSY if not possible.
*/
spin_unlock_irq(&hugetlb_lock);
- isolated = isolate_hugetlb(old_folio, list);
+ isolated = folio_isolate_hugetlb(old_folio, list);
ret = isolated ? 0 : -EBUSY;
spin_lock_irq(&hugetlb_lock);
goto free_new;
@@ -2877,8 +2832,10 @@ retry:
cond_resched();
goto retry;
} else {
+ h = folio_hstate(old_folio);
if (!new_folio) {
spin_unlock_irq(&hugetlb_lock);
+ gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid,
NULL, NULL);
if (!new_folio)
@@ -2920,94 +2877,154 @@ free_new:
return ret;
}
-int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
+int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list)
{
- struct hstate *h;
- struct folio *folio = page_folio(page);
int ret = -EBUSY;
- /*
- * The page might have been dissolved from under our feet, so make sure
- * to carefully check the state under the lock.
- * Return success when racing as if we dissolved the page ourselves.
- */
- spin_lock_irq(&hugetlb_lock);
- if (folio_test_hugetlb(folio)) {
- h = folio_hstate(folio);
- } else {
- spin_unlock_irq(&hugetlb_lock);
+	/* Avoid disrupting the normal path by needlessly holding hugetlb_lock */
+ if (!folio_test_hugetlb(folio))
return 0;
- }
- spin_unlock_irq(&hugetlb_lock);
/*
* Fence off gigantic pages as there is a cyclic dependency between
* alloc_contig_range and them. Return -ENOMEM as this has the effect
* of bailing out right away without further retrying.
*/
- if (hstate_is_gigantic(h))
+ if (folio_order(folio) > MAX_PAGE_ORDER)
return -ENOMEM;
- if (folio_ref_count(folio) && isolate_hugetlb(folio, list))
+ if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list))
ret = 0;
else if (!folio_ref_count(folio))
- ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
+ ret = alloc_and_dissolve_hugetlb_folio(folio, list);
return ret;
}
+/*
+ * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn
+ * range with new folios.
+ * @start_pfn: start pfn of the given pfn range
+ * @end_pfn: end pfn of the given pfn range
+ * Returns 0 on success, otherwise negated error.
+ */
+int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
+{
+ struct folio *folio;
+ int ret = 0;
+
+ LIST_HEAD(isolate_list);
+
+ while (start_pfn < end_pfn) {
+ folio = pfn_folio(start_pfn);
+
+		/* Avoid disrupting the normal path by needlessly holding hugetlb_lock */
+ if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) {
+ ret = alloc_and_dissolve_hugetlb_folio(folio, &isolate_list);
+ if (ret)
+ break;
+
+ putback_movable_pages(&isolate_list);
+ }
+ start_pfn++;
+ }
+
+ return ret;
+}
+
+void wait_for_freed_hugetlb_folios(void)
+{
+ if (llist_empty(&hpage_freelist))
+ return;
+
+ flush_work(&free_hpage_work);
+}
+
+typedef enum {
+ /*
+	 * For either 0/1: we checked the per-vma resv map, and one resv
+	 * count can either be reused (0), or an extra one is needed (1).
+ */
+ MAP_CHG_REUSE = 0,
+ MAP_CHG_NEEDED = 1,
+ /*
+	 * The per-vma resv count cannot be used, hence a new resv
+	 * count is enforced.
+ *
+ * NOTE: This is mostly identical to MAP_CHG_NEEDED, except
+	 * that currently vma_needs_reservation() has the unwanted side
+	 * effect of requiring either end() or commit() to complete the
+	 * transaction. Hence it needs to be differentiated from NEEDED.
+ */
+ MAP_CHG_ENFORCED = 2,
+} map_chg_state;
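/*
 * Illustrative sketch (not part of this patch) of how alloc_hugetlb_folio()
 * below chooses between these states: 'needs_resv' stands in for the result
 * of vma_needs_reservation(), and the enum is duplicated so the sketch
 * compiles on its own.
 */
#include <stdbool.h>
#include <stdio.h>

enum sketch_map_chg { SKETCH_CHG_REUSE, SKETCH_CHG_NEEDED, SKETCH_CHG_ENFORCED };

static enum sketch_map_chg pick_map_chg(bool cow_from_owner, long needs_resv)
{
	if (cow_from_owner)
		return SKETCH_CHG_ENFORCED;	/* bypass the vma resv map */
	return needs_resv ? SKETCH_CHG_NEEDED : SKETCH_CHG_REUSE;
}

int main(void)
{
	printf("%d %d %d\n",
	       pick_map_chg(false, 0),	/* reuse an existing reservation */
	       pick_map_chg(false, 1),	/* a new reservation is needed */
	       pick_map_chg(true, 0));	/* CoW from owner: enforced */
	return 0;
}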
+
+/*
+ * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW
+ * faults of hugetlb private mappings on top of a non-page-cache folio (in
+ * which case even if there's a private vma resv map it won't cover such
+ * allocation). New call sites should (probably) never set it to true!!
+ * When it's set, the allocation will bypass all vma level reservations.
+ */
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
- unsigned long addr, int avoid_reserve)
+ unsigned long addr, bool cow_from_owner)
{
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct folio *folio;
- long map_chg, map_commit, nr_pages = pages_per_huge_page(h);
- long gbl_chg;
- int memcg_charge_ret, ret, idx;
+ long retval, gbl_chg, gbl_reserve;
+ map_chg_state map_chg;
+ int ret, idx;
struct hugetlb_cgroup *h_cg = NULL;
- struct mem_cgroup *memcg;
- bool deferred_reserve;
gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
- memcg = get_mem_cgroup_from_current();
- memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages);
- if (memcg_charge_ret == -ENOMEM) {
- mem_cgroup_put(memcg);
- return ERR_PTR(-ENOMEM);
- }
-
idx = hstate_index(h);
- /*
- * Examine the region/reserve map to determine if the process
- * has a reservation for the page to be allocated. A return
- * code of zero indicates a reservation exists (no change).
- */
- map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
- if (map_chg < 0) {
- if (!memcg_charge_ret)
- mem_cgroup_cancel_charge(memcg, nr_pages);
- mem_cgroup_put(memcg);
- return ERR_PTR(-ENOMEM);
+
+	/* Do we need a separate per-vma reservation? */
+ if (cow_from_owner) {
+ /*
+ * Special case! Since it's a CoW on top of a reserved
+ * page, the private resv map doesn't count. So it cannot
+ * consume the per-vma resv map even if it's reserved.
+ */
+ map_chg = MAP_CHG_ENFORCED;
+ } else {
+ /*
+ * Examine the region/reserve map to determine if the process
+ * has a reservation for the page to be allocated. A return
+ * code of zero indicates a reservation exists (no change).
+ */
+ retval = vma_needs_reservation(h, vma, addr);
+ if (retval < 0)
+ return ERR_PTR(-ENOMEM);
+ map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE;
}
/*
+ * Do we need a separate global reservation?
+ *
* Processes that did not create the mapping will have no
* reserves as indicated by the region/reserve map. Check
* that the allocation will not exceed the subpool limit.
- * Allocations for MAP_NORESERVE mappings also need to be
- * checked against any subpool limit.
+ * Or if it can get one from the pool reservation directly.
*/
- if (map_chg || avoid_reserve) {
+ if (map_chg) {
gbl_chg = hugepage_subpool_get_pages(spool, 1);
if (gbl_chg < 0)
goto out_end_reservation;
+ } else {
+ /*
+ * If we have the vma reservation ready, no need for extra
+ * global reservation.
+ */
+ gbl_chg = 0;
}
- /* If this allocation is not consuming a reservation, charge it now.
+ /*
+ * If this allocation is not consuming a per-vma reservation,
+ * charge the hugetlb cgroup now.
*/
- deferred_reserve = map_chg || avoid_reserve;
- if (deferred_reserve) {
+ if (map_chg) {
ret = hugetlb_cgroup_charge_cgroup_rsvd(
idx, pages_per_huge_page(h), &h_cg);
if (ret)
@@ -3031,20 +3048,25 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
if (!folio)
goto out_uncharge_cgroup;
spin_lock_irq(&hugetlb_lock);
- if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
- folio_set_hugetlb_restore_reserve(folio);
- h->resv_huge_pages--;
- }
list_add(&folio->lru, &h->hugepage_activelist);
folio_ref_unfreeze(folio, 1);
/* Fall through */
}
+ /*
+ * Whether dequeued or buddy-allocated, the folio needs a special
+ * mark when it consumes a global reservation.
+ */
+ if (!gbl_chg) {
+ folio_set_hugetlb_restore_reserve(folio);
+ h->resv_huge_pages--;
+ }
+
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio);
/* If allocation is not consuming a reservation, also store the
* hugetlb_cgroup pointer on the page.
*/
- if (deferred_reserve) {
+ if (map_chg) {
hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
h_cg, folio);
}
@@ -3053,53 +3075,122 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
hugetlb_set_folio_subpool(folio, spool);
- map_commit = vma_commit_reservation(h, vma, addr);
- if (unlikely(map_chg > map_commit)) {
+ if (map_chg != MAP_CHG_ENFORCED) {
+ /* commit() is only needed if the map_chg is not enforced */
+ retval = vma_commit_reservation(h, vma, addr);
/*
+ * Check for possible race conditions:
* The page was added to the reservation map between
* vma_needs_reservation and vma_commit_reservation.
* This indicates a race with hugetlb_reserve_pages.
* Adjust for the subpool count incremented above AND
- * in hugetlb_reserve_pages for the same page. Also,
+ * in hugetlb_reserve_pages for the same page. Also,
* the reservation count added in hugetlb_reserve_pages
* no longer applies.
*/
- long rsv_adjust;
+ if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) {
+ long rsv_adjust;
- rsv_adjust = hugepage_subpool_put_pages(spool, 1);
- hugetlb_acct_memory(h, -rsv_adjust);
- if (deferred_reserve) {
- spin_lock_irq(&hugetlb_lock);
- hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
- pages_per_huge_page(h), folio);
- spin_unlock_irq(&hugetlb_lock);
+ rsv_adjust = hugepage_subpool_put_pages(spool, 1);
+ hugetlb_acct_memory(h, -rsv_adjust);
+ if (map_chg) {
+ spin_lock_irq(&hugetlb_lock);
+ hugetlb_cgroup_uncharge_folio_rsvd(
+ hstate_index(h), pages_per_huge_page(h),
+ folio);
+ spin_unlock_irq(&hugetlb_lock);
+ }
}
}
- if (!memcg_charge_ret)
- mem_cgroup_commit_charge(folio, memcg);
+ ret = mem_cgroup_charge_hugetlb(folio, gfp);
+ /*
+ * Unconditionally increment NR_HUGETLB here. If it turns out that
+ * mem_cgroup_charge_hugetlb failed, then immediately free the page and
+ * decrement NR_HUGETLB.
+ */
lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h));
- mem_cgroup_put(memcg);
+
+ if (ret == -ENOMEM) {
+ free_huge_folio(folio);
+ return ERR_PTR(-ENOMEM);
+ }
return folio;
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
out_uncharge_cgroup_reservation:
- if (deferred_reserve)
+ if (map_chg)
hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
h_cg);
out_subpool_put:
- if (map_chg || avoid_reserve)
- hugepage_subpool_put_pages(spool, 1);
+ /*
+ * Put the page back to the subpool only if the subpool's rsv_hpages
+ * quota was consumed during hugepage_subpool_get_pages().
+ */
+ if (map_chg && !gbl_chg) {
+ gbl_reserve = hugepage_subpool_put_pages(spool, 1);
+ hugetlb_acct_memory(h, -gbl_reserve);
+ }
+
out_end_reservation:
- vma_end_reservation(h, vma, addr);
- if (!memcg_charge_ret)
- mem_cgroup_cancel_charge(memcg, nr_pages);
- mem_cgroup_put(memcg);
+ if (map_chg != MAP_CHG_ENFORCED)
+ vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
}
+static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact)
+{
+ struct huge_bootmem_page *m;
+ int listnode = nid;
+
+ if (hugetlb_early_cma(h))
+ m = hugetlb_cma_alloc_bootmem(h, &listnode, node_exact);
+ else {
+ if (node_exact)
+ m = memblock_alloc_exact_nid_raw(huge_page_size(h),
+ huge_page_size(h), 0,
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ else {
+ m = memblock_alloc_try_nid_raw(huge_page_size(h),
+ huge_page_size(h), 0,
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ /*
+ * For pre-HVO to work correctly, pages need to be on
+ * the list for the node they were actually allocated
+ * from. That node may be different in the case of
+ * fallback by memblock_alloc_try_nid_raw. So,
+ * extract the actual node first.
+ */
+ if (m)
+ listnode = early_pfn_to_nid(PHYS_PFN(virt_to_phys(m)));
+ }
+
+ if (m) {
+ m->flags = 0;
+ m->cma = NULL;
+ }
+ }
+
+ if (m) {
+ /*
+ * Use the beginning of the huge page to store the
+ * huge_bootmem_page struct (until gather_bootmem
+ * puts them into the mem_map).
+ *
+ * Put them into a private list first because mem_map
+ * is not up yet.
+ */
+ INIT_LIST_HEAD(&m->list);
+ list_add(&m->list, &huge_boot_pages[listnode]);
+ m->hstate = h;
+ }
+
+ return m;
+}
+
int alloc_bootmem_huge_page(struct hstate *h, int nid)
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
int __alloc_bootmem_huge_page(struct hstate *h, int nid)
@@ -3109,22 +3200,16 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
/* do node specific alloc */
if (nid != NUMA_NO_NODE) {
- m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
- 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ m = alloc_bootmem(h, node, true);
if (!m)
return 0;
goto found;
}
+
/* allocate from next node when distributing huge pages */
- for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_MEMORY]) {
- m = memblock_alloc_try_nid_raw(
- huge_page_size(h), huge_page_size(h),
- 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
- /*
- * Use the beginning of the huge page to store the
- * huge_bootmem_page struct (until gather_bootmem
- * puts them into the mem_map).
- */
+ for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node,
+ &hugetlb_bootmem_nodes) {
+ m = alloc_bootmem(h, node, false);
if (!m)
return 0;
goto found;
@@ -3141,10 +3226,7 @@ found:
*/
memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE),
huge_page_size(h) - PAGE_SIZE);
- /* Put them into a private list first because mem_map is not up yet */
- INIT_LIST_HEAD(&m->list);
- list_add(&m->list, &huge_boot_pages[node]);
- m->hstate = h;
+
return 1;
}
@@ -3162,7 +3244,6 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) {
struct page *page = pfn_to_page(pfn);
- __ClearPageReserved(folio_page(folio, pfn - head_pfn));
__init_single_page(page, pfn, zone, nid);
prep_compound_tail((struct page *)folio, pfn - head_pfn);
ret = page_ref_freeze(page, 1);
@@ -3186,6 +3267,42 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
prep_compound_head((struct page *)folio, huge_page_order(h));
}
+static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m)
+{
+ return m->flags & HUGE_BOOTMEM_HVO;
+}
+
+static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m)
+{
+ return m->flags & HUGE_BOOTMEM_CMA;
+}
+
+/*
+ * memblock-allocated pageblocks might not have the migrate type set
+ * if marked with the 'noinit' flag. Set it to the default (MIGRATE_MOVABLE)
+ * here, or MIGRATE_CMA if this was a page allocated through an early CMA
+ * reservation.
+ *
+ * In case of vmemmap optimized folios, the tail vmemmap pages are mapped
+ * read-only, but that's ok - for sparse vmemmap this does not write to
+ * the page structure.
+ */
+static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
+ struct hstate *h)
+{
+ unsigned long nr_pages = pages_per_huge_page(h), i;
+
+ WARN_ON_ONCE(!pageblock_aligned(folio_pfn(folio)));
+
+ for (i = 0; i < nr_pages; i += pageblock_nr_pages) {
+ if (folio_test_hugetlb_cma(folio))
+ init_cma_pageblock(folio_page(folio, i));
+ else
+ init_pageblock_migratetype(folio_page(folio, i),
+ MIGRATE_MOVABLE, false);
+ }
+}
+
static void __init prep_and_add_bootmem_folios(struct hstate *h,
struct list_head *folio_list)
{
@@ -3193,7 +3310,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
struct folio *folio, *tmp_f;
/* Send list for bulk vmemmap optimization processing */
- hugetlb_vmemmap_optimize_folios(h, folio_list);
+ hugetlb_vmemmap_optimize_bootmem_folios(h, folio_list);
list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
@@ -3207,6 +3324,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
HUGETLB_VMEMMAP_RESERVE_PAGES,
pages_per_huge_page(h));
}
+ hugetlb_bootmem_init_migratetype(folio, h);
/* Subdivide locks to achieve better parallel performance */
spin_lock_irqsave(&hugetlb_lock, flags);
__prep_account_new_huge_page(h, folio_nid(folio));
@@ -3215,6 +3333,57 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
}
}
+bool __init hugetlb_bootmem_page_zones_valid(int nid,
+ struct huge_bootmem_page *m)
+{
+ unsigned long start_pfn;
+ bool valid;
+
+ if (m->flags & HUGE_BOOTMEM_ZONES_VALID) {
+ /*
+ * Already validated, skip check.
+ */
+ return true;
+ }
+
+ if (hugetlb_bootmem_page_earlycma(m)) {
+ valid = cma_validate_zones(m->cma);
+ goto out;
+ }
+
+ start_pfn = virt_to_phys(m) >> PAGE_SHIFT;
+
+ valid = !pfn_range_intersects_zones(nid, start_pfn,
+ pages_per_huge_page(m->hstate));
+out:
+ if (!valid)
+ hstate_boot_nrinvalid[hstate_index(m->hstate)]++;
+
+ return valid;
+}
+
+/*
+ * Free a bootmem page that was found to be invalid (intersecting with
+ * multiple zones).
+ *
+ * Since it intersects with multiple zones, we can't just do a free
+ * operation on all pages at once, but instead have to walk all
+ * pages, freeing them one by one.
+ */
+static void __init hugetlb_bootmem_free_invalid_page(int nid, struct page *page,
+ struct hstate *h)
+{
+ unsigned long npages = pages_per_huge_page(h);
+ unsigned long pfn;
+
+ while (npages--) {
+ pfn = page_to_pfn(page);
+ __init_page_from_nid(pfn, nid);
+ free_reserved_page(page);
+ page++;
+ }
+}
+
/*
* Put bootmem huge pages into the standard lists after mem_map is up.
* Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages.
@@ -3222,14 +3391,25 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
static void __init gather_bootmem_prealloc_node(unsigned long nid)
{
LIST_HEAD(folio_list);
- struct huge_bootmem_page *m;
+ struct huge_bootmem_page *m, *tm;
struct hstate *h = NULL, *prev_h = NULL;
- list_for_each_entry(m, &huge_boot_pages[nid], list) {
+ list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
struct page *page = virt_to_page(m);
struct folio *folio = (void *)page;
h = m->hstate;
+ if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
+ /*
+ * Can't use this page. Initialize the
+ * page structures if that hasn't already
+ * been done, and give them to the page
+ * allocator.
+ */
+ hugetlb_bootmem_free_invalid_page(nid, page, h);
+ continue;
+ }
+
/*
* It is possible to have multiple huge page sizes (hstates)
* in this list. If so, process each size separately.
@@ -3244,14 +3424,30 @@ static void __init gather_bootmem_prealloc_node(unsigned long nid)
hugetlb_folio_init_vmemmap(folio, h,
HUGETLB_VMEMMAP_RESERVE_PAGES);
init_new_hugetlb_folio(h, folio);
+
+ if (hugetlb_bootmem_page_prehvo(m))
+ /*
+ * If pre-HVO was done, just set the
+ * flag, the HVO code will then skip
+ * this folio.
+ */
+ folio_set_hugetlb_vmemmap_optimized(folio);
+
+ if (hugetlb_bootmem_page_earlycma(m))
+ folio_set_hugetlb_cma(folio);
+
list_add(&folio->lru, &folio_list);
/*
* We need to restore the 'stolen' pages to totalram_pages
* in order to fix confusing memory reports from free(1) and
* other side-effects, like CommitLimit going negative.
+ *
+ * For CMA pages, this is done in init_cma_pageblock
+ * (via hugetlb_bootmem_init_migratetype), so skip it here.
*/
- adjust_managed_page_count(page, pages_per_huge_page(h));
+ if (!folio_test_hugetlb_cma(folio))
+ adjust_managed_page_count(page, pages_per_huge_page(h));
cond_resched();
}
@@ -3391,32 +3587,44 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
.numa_aware = true
};
+ unsigned long jiffies_start;
+ unsigned long jiffies_end;
+
job.thread_fn = hugetlb_pages_alloc_boot_node;
job.start = 0;
job.size = h->max_huge_pages;
/*
- * job.max_threads is twice the num_node_state(N_MEMORY),
+ * job.max_threads is 25% of the available cpu threads by default.
*
- * Tests below indicate that a multiplier of 2 significantly improves
- * performance, and although larger values also provide improvements,
- * the gains are marginal.
+ * On large servers with terabytes of memory, huge page allocation
+ * can consume a considerable amount of time.
*
- * Therefore, choosing 2 as the multiplier strikes a good balance between
- * enhancing parallel processing capabilities and maintaining efficient
- * resource management.
+ * Tests below show how long it takes to allocate 1 TiB of memory with
+ * 2MiB huge pages. Using more threads can significantly improve allocation time.
*
- * +------------+-------+-------+-------+-------+-------+
- * | multiplier | 1 | 2 | 3 | 4 | 5 |
- * +------------+-------+-------+-------+-------+-------+
- * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms |
- * | 2T 4node | 979ms | 679ms | 543ms | 489ms | 481ms |
- * | 50G 2node | 71ms | 44ms | 37ms | 30ms | 31ms |
- * +------------+-------+-------+-------+-------+-------+
+ * +-----------------------+-------+-------+-------+-------+-------+
+ * | threads | 8 | 16 | 32 | 64 | 128 |
+ * +-----------------------+-------+-------+-------+-------+-------+
+ * | skylake 144 cpus | 44s | 22s | 16s | 19s | 20s |
+ * | cascade lake 192 cpus | 39s | 20s | 11s | 10s | 9s |
+ * +-----------------------+-------+-------+-------+-------+-------+
*/
- job.max_threads = num_node_state(N_MEMORY) * 2;
- job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2;
+ if (hugepage_allocation_threads == 0) {
+ hugepage_allocation_threads = num_online_cpus() / 4;
+ hugepage_allocation_threads = max(hugepage_allocation_threads, 1);
+ }
+
+ job.max_threads = hugepage_allocation_threads;
+ job.min_chunk = h->max_huge_pages / hugepage_allocation_threads;
+
+ jiffies_start = jiffies;
padata_do_multithreaded(&job);
+ jiffies_end = jiffies;
+
+ pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n",
+ jiffies_to_msecs(jiffies_end - jiffies_start),
+ hugepage_allocation_threads);
return h->nr_huge_pages;
}
@@ -3435,22 +3643,19 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long allocated;
- static bool initialized __initdata;
- /* skip gigantic hugepages allocation if hugetlb_cma enabled */
- if (hstate_is_gigantic(h) && hugetlb_cma_size) {
+ /*
+ * Skip boot-time allocation of gigantic hugepages if hugetlb CMA
+ * is configured but early CMA reservations are not available.
+ */
+ if (hstate_is_gigantic(h) && hugetlb_cma_total_size() &&
+ !hugetlb_early_cma(h)) {
pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
return;
}
- /* hugetlb_hstate_alloc_pages will be called many times, initialize huge_boot_pages once */
- if (!initialized) {
- int i = 0;
-
- for (i = 0; i < MAX_NUMNODES; i++)
- INIT_LIST_HEAD(&huge_boot_pages[i]);
- initialized = true;
- }
+ if (!h->max_huge_pages)
+ return;
/* do node specific alloc */
if (hugetlb_hstate_alloc_pages_specific_nodes(h))
@@ -3470,6 +3675,15 @@ static void __init hugetlb_init_hstates(void)
struct hstate *h, *h2;
for_each_hstate(h) {
+ /*
+ * Always reset to first_memory_node here, even if
+ * next_nid_to_alloc was set before - we can't
+ * reference hugetlb_bootmem_nodes after init, and
+ * first_memory_node is right for all further allocations.
+ */
+ h->next_nid_to_alloc = first_memory_node;
+ h->next_nid_to_free = first_memory_node;
+
/* oversize hugepages were init'ed in early boot */
if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
@@ -3484,7 +3698,7 @@ static void __init hugetlb_init_hstates(void)
*/
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
continue;
- if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
+ if (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER)
continue;
for_each_hstate(h2) {
if (h2 == h)
@@ -3499,13 +3713,20 @@ static void __init hugetlb_init_hstates(void)
static void __init report_hugepages(void)
{
struct hstate *h;
+ unsigned long nrinvalid;
for_each_hstate(h) {
char buf[32];
+ nrinvalid = hstate_boot_nrinvalid[hstate_index(h)];
+ h->max_huge_pages -= nrinvalid;
+
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
- buf, h->free_huge_pages);
+ buf, h->nr_huge_pages);
+ if (nrinvalid)
+ pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n",
+ buf, nrinvalid, str_plural(nrinvalid));
pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
}
@@ -3587,6 +3808,7 @@ found:
static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
nodemask_t *nodes_allowed)
{
+ unsigned long persistent_free_count;
unsigned long min_count;
unsigned long allocated;
struct folio *folio;
@@ -3721,8 +3943,24 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
+ *
+ * min_count is the expected number of persistent pages, we
+ * shouldn't calculate min_count by using
+ * resv_huge_pages + persistent_huge_pages() - free_huge_pages,
+ * because there may exist free surplus huge pages, and this will
+ * lead to subtracting twice. Free surplus huge pages come from HVO
+ * failing to restore vmemmap, see comments in the callers of
+ * hugetlb_vmemmap_restore_folio(). Thus, we should calculate
+ * persistent free count first.
*/
- min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
+ persistent_free_count = h->free_huge_pages;
+ if (h->free_huge_pages > persistent_huge_pages(h)) {
+ if (h->free_huge_pages > h->surplus_huge_pages)
+ persistent_free_count -= h->surplus_huge_pages;
+ else
+ persistent_free_count = 0;
+ }
+ min_count = h->resv_huge_pages + persistent_huge_pages(h) - persistent_free_count;
min_count = max(count, min_count);
try_to_free_low(h, min_count, nodes_allowed);
@@ -3779,10 +4017,13 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
list_for_each_entry_safe(folio, next, src_list, lru) {
int i;
+ bool cma;
if (folio_test_hugetlb_vmemmap_optimized(folio))
continue;
+ cma = folio_test_hugetlb_cma(folio);
+
list_del(&folio->lru);
split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst));
@@ -3790,13 +4031,18 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) {
struct page *page = folio_page(folio, i);
+ /* Careful: see __split_huge_page_tail() */
+ struct folio *new_folio = (struct folio *)page;
- page->mapping = NULL;
clear_compound_head(page);
prep_compound_page(page, dst->order);
- init_new_hugetlb_folio(dst, page_folio(page));
- list_add(&page->lru, &dst_list);
+ new_folio->mapping = NULL;
+ init_new_hugetlb_folio(dst, new_folio);
+ /* Copy the CMA flag so that it is freed correctly */
+ if (cma)
+ folio_set_hugetlb_cma(new_folio);
+ list_add(&new_folio->lru, &dst_list);
}
}
@@ -4377,14 +4623,6 @@ static void hugetlb_register_all_nodes(void) { }
#endif
-#ifdef CONFIG_CMA
-static void __init hugetlb_cma_check(void);
-#else
-static inline __init void hugetlb_cma_check(void)
-{
-}
-#endif
-
static void __init hugetlb_sysfs_init(void)
{
struct hstate *h;
@@ -4398,7 +4636,7 @@ static void __init hugetlb_sysfs_init(void)
err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
hstate_kobjs, &hstate_attr_group);
if (err)
- pr_err("HugeTLB: Unable to add hstate %s", h->name);
+ pr_err("HugeTLB: Unable to add hstate %s\n", h->name);
}
#ifdef CONFIG_NUMA
@@ -4509,8 +4747,6 @@ void __init hugetlb_add_hstate(unsigned int order)
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
- h->next_nid_to_alloc = first_memory_node;
- h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/SZ_1K);
@@ -4535,6 +4771,44 @@ static void __init hugepages_clear_pages_in_node(void)
}
}
+static __init int hugetlb_add_param(char *s, int (*setup)(char *))
+{
+ size_t len;
+ char *p;
+
+ if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS)
+ return -EINVAL;
+
+ len = strlen(s) + 1;
+ if (len + hstate_cmdline_index > sizeof(hstate_cmdline_buf))
+ return -EINVAL;
+
+ p = &hstate_cmdline_buf[hstate_cmdline_index];
+ memcpy(p, s, len);
+ hstate_cmdline_index += len;
+
+ hugetlb_params[hugetlb_param_index].val = p;
+ hugetlb_params[hugetlb_param_index].setup = setup;
+
+ hugetlb_param_index++;
+
+ return 0;
+}
+
+static __init void hugetlb_parse_params(void)
+{
+ int i;
+ struct hugetlb_cmdline *hcp;
+
+ for (i = 0; i < hugetlb_param_index; i++) {
+ hcp = &hugetlb_params[i];
+
+ hcp->setup(hcp->val);
+ }
+
+ hugetlb_cma_validate_params();
+}
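+
+/*
+ * Flow sketch (with an illustrative command line; exact values are made up):
+ *
+ *   hugepagesz=1G hugepages=4 hugepagesz=2M hugepages=512
+ *
+ * Each occurrence registered through hugetlb_early_param() is presumably
+ * buffered via hugetlb_add_param() at early-param time; hugetlb_parse_params()
+ * then replays the buffered strings through their setup callbacks in
+ * command-line order, so size/count pairs keep their association, and the
+ * CMA parameters are validated last.
+ */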
+
/*
* hugepages command line processing
 * hugepages normally follows a valid hugepagesz or default_hugepagesz
@@ -4554,7 +4828,7 @@ static int __init hugepages_setup(char *s)
if (!parsed_valid_hugepagesz) {
pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
parsed_valid_hugepagesz = true;
- return 1;
+ return -EINVAL;
}
/*
@@ -4608,24 +4882,16 @@ static int __init hugepages_setup(char *s)
}
}
- /*
- * Global state is always initialized later in hugetlb_init.
- * But we need to allocate gigantic hstates here early to still
- * use the bootmem allocator.
- */
- if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
- hugetlb_hstate_alloc_pages(parsed_hstate);
-
last_mhp = mhp;
- return 1;
+ return 0;
invalid:
pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
hugepages_clear_pages_in_node();
- return 1;
+ return -EINVAL;
}
-__setup("hugepages=", hugepages_setup);
+hugetlb_early_param("hugepages", hugepages_setup);
/*
* hugepagesz command line processing
@@ -4644,7 +4910,7 @@ static int __init hugepagesz_setup(char *s)
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
- return 1;
+ return -EINVAL;
}
h = size_to_hstate(size);
@@ -4659,7 +4925,7 @@ static int __init hugepagesz_setup(char *s)
if (!parsed_default_hugepagesz || h != &default_hstate ||
default_hstate.max_huge_pages) {
pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
- return 1;
+ return -EINVAL;
}
/*
@@ -4669,14 +4935,14 @@ static int __init hugepagesz_setup(char *s)
*/
parsed_hstate = h;
parsed_valid_hugepagesz = true;
- return 1;
+ return 0;
}
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
parsed_valid_hugepagesz = true;
- return 1;
+ return 0;
}
-__setup("hugepagesz=", hugepagesz_setup);
+hugetlb_early_param("hugepagesz", hugepagesz_setup);
/*
* default_hugepagesz command line input
@@ -4690,14 +4956,14 @@ static int __init default_hugepagesz_setup(char *s)
parsed_valid_hugepagesz = false;
if (parsed_default_hugepagesz) {
pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
- return 1;
+ return -EINVAL;
}
size = (unsigned long)memparse(s, NULL);
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
- return 1;
+ return -EINVAL;
}
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
@@ -4714,17 +4980,89 @@ static int __init default_hugepagesz_setup(char *s)
*/
if (default_hstate_max_huge_pages) {
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
- for_each_online_node(i)
- default_hstate.max_huge_pages_node[i] =
- default_hugepages_in_node[i];
- if (hstate_is_gigantic(&default_hstate))
- hugetlb_hstate_alloc_pages(&default_hstate);
+ /*
+ * Since this is an early parameter, we can't check
+ * NUMA node state yet, so loop through MAX_NUMNODES.
+ */
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (default_hugepages_in_node[i] != 0)
+ default_hstate.max_huge_pages_node[i] =
+ default_hugepages_in_node[i];
+ }
default_hstate_max_huge_pages = 0;
}
+ return 0;
+}
+hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup);
+
+void __init hugetlb_bootmem_set_nodes(void)
+{
+ int i, nid;
+ unsigned long start_pfn, end_pfn;
+
+ if (!nodes_empty(hugetlb_bootmem_nodes))
+ return;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ if (end_pfn > start_pfn)
+ node_set(nid, hugetlb_bootmem_nodes);
+ }
+}
+
+static bool __hugetlb_bootmem_allocated __initdata;
+
+bool __init hugetlb_bootmem_allocated(void)
+{
+ return __hugetlb_bootmem_allocated;
+}
+
+void __init hugetlb_bootmem_alloc(void)
+{
+ struct hstate *h;
+ int i;
+
+ if (__hugetlb_bootmem_allocated)
+ return;
+
+ hugetlb_bootmem_set_nodes();
+
+ for (i = 0; i < MAX_NUMNODES; i++)
+ INIT_LIST_HEAD(&huge_boot_pages[i]);
+
+ hugetlb_parse_params();
+
+ for_each_hstate(h) {
+ h->next_nid_to_alloc = first_online_node;
+
+ if (hstate_is_gigantic(h))
+ hugetlb_hstate_alloc_pages(h);
+ }
+
+ __hugetlb_bootmem_allocated = true;
+}
+
+/*
+ * hugepage_alloc_threads command line parsing.
+ *
+ * When set, use this specific number of threads for the boot
+ * allocation of hugepages.
+ */
+static int __init hugepage_alloc_threads_setup(char *s)
+{
+ unsigned long allocation_threads;
+
+ if (kstrtoul(s, 0, &allocation_threads) != 0)
+ return 1;
+
+ if (allocation_threads == 0)
+ return 1;
+
+ hugepage_allocation_threads = allocation_threads;
+
return 1;
}
-__setup("default_hugepagesz=", default_hugepagesz_setup);
+__setup("hugepage_alloc_threads=", hugepage_alloc_threads_setup);
static unsigned int allowed_mems_nr(struct hstate *h)
{
@@ -4829,7 +5167,7 @@ out:
return ret;
}
-static struct ctl_table hugetlb_table[] = {
+static const struct ctl_table hugetlb_table[] = {
{
.procname = "nr_hugepages",
.data = NULL,
@@ -4862,7 +5200,7 @@ static struct ctl_table hugetlb_table[] = {
},
};
-static void hugetlb_sysctl_init(void)
+static void __init hugetlb_sysctl_init(void)
{
register_sysctl_init("vm", hugetlb_table);
}
@@ -5070,26 +5408,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
+ return 0;
+}
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
/*
* PMD sharing is only possible for PUD_SIZE-aligned address ranges
* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+ * This function is called in the middle of a VMA split operation, with
+ * MM, VMA and rmap all write-locked to prevent concurrent page table
+ * walks (except hardware and gup_fast()).
*/
+ vma_assert_write_locked(vma);
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
if (addr & ~PUD_MASK) {
- /*
- * hugetlb_vm_op_split is called right before we attempt to
- * split the VMA. We will need to unshare PMDs in the old and
- * new VMAs, so let's unshare before we split.
- */
unsigned long floor = addr & PUD_MASK;
unsigned long ceil = floor + PUD_SIZE;
- if (floor >= vma->vm_start && ceil <= vma->vm_end)
- hugetlb_unshare_pmds(vma, floor, ceil);
+ if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+ /*
+ * Locking:
+ * Use take_locks=false here.
+ * The file rmap lock is already held.
+ * The hugetlb VMA lock can't be taken when we already
+ * hold the file rmap lock, and we don't need it because
+ * its purpose is to synchronize against concurrent page
+ * table walks, which are not possible thanks to the
+ * locks held by our caller.
+ */
+ hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+ }
}
-
- return 0;
}
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -5124,18 +5476,16 @@ const struct vm_operations_struct hugetlb_vm_ops = {
.pagesize = hugetlb_vm_op_pagesize,
};
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
- int writable)
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio,
+ bool try_mkwrite)
{
- pte_t entry;
+ pte_t entry = folio_mk_pte(folio, vma->vm_page_prot);
unsigned int shift = huge_page_shift(hstate_vma(vma));
- if (writable) {
- entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
- vma->vm_page_prot)));
+ if (try_mkwrite && (vma->vm_flags & VM_WRITE)) {
+ entry = pte_mkwrite_novma(pte_mkdirty(entry));
} else {
- entry = huge_pte_wrprotect(mk_huge_pte(page,
- vma->vm_page_prot));
+ entry = pte_wrprotect(entry);
}
entry = pte_mkyoung(entry);
entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
@@ -5153,6 +5503,13 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
update_mmu_cache(vma, address, ptep);
}
+static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ if (vma->vm_flags & VM_WRITE)
+ set_huge_ptep_writable(vma, address, ptep);
+}
+
bool is_hugetlb_entry_migration(pte_t pte)
{
swp_entry_t swp;
@@ -5183,7 +5540,7 @@ static void
hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
struct folio *new_folio, pte_t old, unsigned long sz)
{
- pte_t newpte = make_huge_pte(vma, &new_folio->page, 1);
+ pte_t newpte = make_huge_pte(vma, new_folio, true);
__folio_mark_uptodate(new_folio);
hugetlb_add_new_anon_rmap(new_folio, vma, addr);
@@ -5240,18 +5597,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
break;
}
- /*
- * If the pagetables are shared don't copy or take references.
- *
- * dst_pte == src_pte is the common case of src/dest sharing.
- * However, src could have 'unshared' and dst shares with
- * another vma. So page_count of ptep page is checked instead
- * to reliably determine whether pte is shared.
- */
- if (page_count(virt_to_page(dst_pte)) > 1) {
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+ /* If the pagetables are shared, there is nothing to do */
+ if (ptdesc_pmd_is_shared(virt_to_ptdesc(dst_pte))) {
addr |= last_addr_mask;
continue;
}
+#endif
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
@@ -5317,7 +5669,7 @@ again:
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
/* Do not use reserve as it's private owned */
- new_folio = alloc_hugetlb_folio(dst_vma, addr, 1);
+ new_folio = alloc_hugetlb_folio(dst_vma, addr, false);
if (IS_ERR(new_folio)) {
folio_put(pte_folio);
ret = PTR_ERR(new_folio);
@@ -5487,17 +5839,17 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- struct page *ref_page, zap_flags_t zap_flags)
+ struct folio *folio, zap_flags_t zap_flags)
{
struct mm_struct *mm = vma->vm_mm;
+ const bool folio_provided = !!folio;
unsigned long address;
pte_t *ptep;
pte_t pte;
spinlock_t *ptl;
- struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- bool adjust_reservation = false;
+ bool adjust_reservation;
unsigned long last_addr_mask;
bool force_flush = false;
@@ -5558,14 +5910,13 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
continue;
}
- page = pte_page(pte);
/*
- * If a reference page is supplied, it is because a specific
- * page is being unmapped, not a range. Ensure the page we
- * are about to unmap is the actual page of interest.
+ * If a folio is supplied, it is because a specific
+ * folio is being unmapped, not a range. Ensure the folio we
+ * are about to unmap is the actual folio of interest.
*/
- if (ref_page) {
- if (page != ref_page) {
+ if (folio_provided) {
+ if (folio != page_folio(pte_page(pte))) {
spin_unlock(ptl);
continue;
}
@@ -5575,12 +5926,14 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* looking like data was lost
*/
set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+ } else {
+ folio = page_folio(pte_page(pte));
}
pte = huge_ptep_get_and_clear(mm, address, ptep, sz);
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
- set_page_dirty(page);
+ folio_mark_dirty(folio);
/* Leave a uffd-wp pte marker if needed */
if (huge_pte_uffd_wp(pte) &&
!(zap_flags & ZAP_FLAG_DROP_MARKER))
@@ -5588,7 +5941,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
make_pte_marker(PTE_MARKER_UFFD_WP),
sz);
hugetlb_count_sub(pages_per_huge_page(h), mm);
- hugetlb_remove_rmap(page_folio(page));
+ hugetlb_remove_rmap(folio);
+ spin_unlock(ptl);
/*
* Restore the reservation for an anonymous page, otherwise the
* backing page could be stolen by someone.
* If we are freeing a surplus, do not set the restore
* reservation bit.
* reservation bit.
*/
+ adjust_reservation = false;
+
+ spin_lock_irq(&hugetlb_lock);
if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
- folio_test_anon(page_folio(page))) {
- folio_set_hugetlb_restore_reserve(page_folio(page));
+ folio_test_anon(folio)) {
+ folio_set_hugetlb_restore_reserve(folio);
/* Reservation to be adjusted after the spin lock */
adjust_reservation = true;
}
-
- spin_unlock(ptl);
+ spin_unlock_irq(&hugetlb_lock);
/*
* Adjust the reservation for the region that will have the
@@ -5622,16 +5978,17 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* count will not be incremented by free_huge_folio.
* Act as if we consumed the reservation.
*/
- folio_clear_hugetlb_restore_reserve(page_folio(page));
+ folio_clear_hugetlb_restore_reserve(folio);
else if (rc)
vma_add_reservation(h, vma, address);
}
- tlb_remove_page_size(tlb, page, huge_page_size(h));
+ tlb_remove_page_size(tlb, folio_page(folio, 0),
+ folio_size(folio));
/*
- * Bail out after unmapping reference page if supplied
+ * If we were instructed to unmap a specific folio, we're done.
*/
- if (ref_page)
+ if (folio_provided)
break;
}
tlb_end_vma(tlb, vma);
@@ -5693,7 +6050,7 @@ void __hugetlb_zap_end(struct vm_area_struct *vma,
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page,
+ unsigned long end, struct folio *folio,
zap_flags_t zap_flags)
{
struct mmu_notifier_range range;
@@ -5705,7 +6062,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
mmu_notifier_invalidate_range_start(&range);
tlb_gather_mmu(&tlb, vma->vm_mm);
- __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
+ __unmap_hugepage_range(&tlb, vma, start, end,
+ folio, zap_flags);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
@@ -5718,7 +6076,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
* same region.
*/
static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page *page, unsigned long address)
+ struct folio *folio, unsigned long address)
{
struct hstate *h = hstate_vma(vma);
struct vm_area_struct *iter_vma;
@@ -5762,7 +6120,8 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
unmap_hugepage_range(iter_vma, address,
- address + huge_page_size(h), page, 0);
+ address + huge_page_size(h),
+ folio, 0);
}
i_mmap_unlock_write(mapping);
}
@@ -5773,8 +6132,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* cannot race with other handlers or page migration.
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
-static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
- struct vm_fault *vmf)
+static vm_fault_t hugetlb_wp(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
@@ -5783,7 +6141,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
struct hstate *h = hstate_vma(vma);
struct folio *old_folio;
struct folio *new_folio;
- int outside_reserve = 0;
+ bool cow_from_owner = false;
vm_fault_t ret = 0;
struct mmu_notifier_range range;
@@ -5798,13 +6156,6 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
if (!unshare && huge_pte_uffd_wp(pte))
return 0;
- /*
- * hugetlb does not support FOLL_FORCE-style write faults that keep the
- * PTE mapped R/O such as maybe_mkwrite() would do.
- */
- if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
- return VM_FAULT_SIGSEGV;
-
/* Let's take out MAP_SHARED mappings first. */
if (vma->vm_flags & VM_MAYSHARE) {
set_huge_ptep_writable(vma, vmf->address, vmf->pte);
@@ -5833,7 +6184,8 @@ retry_avoidcopy:
SetPageAnonExclusive(&old_folio->page);
}
if (likely(!unshare))
- set_huge_ptep_writable(vma, vmf->address, vmf->pte);
+ set_huge_ptep_maybe_writable(vma, vmf->address,
+ vmf->pte);
delayacct_wpcopy_end();
return 0;
@@ -5842,17 +6194,18 @@ retry_avoidcopy:
PageAnonExclusive(&old_folio->page), &old_folio->page);
/*
- * If the process that created a MAP_PRIVATE mapping is about to
- * perform a COW due to a shared page count, attempt to satisfy
- * the allocation without using the existing reserves. The pagecache
- * page is used to determine if the reserve at this address was
- * consumed or not. If reserves were used, a partial faulted mapping
- * at the time of fork() could consume its reserves on COW instead
- * of the full address range.
+ * If the process that created a MAP_PRIVATE mapping is about to perform
+ * a COW due to a shared page count, attempt to satisfy the allocation
+ * without using the existing reserves.
+ * In order to determine whether this is a COW on a MAP_PRIVATE mapping it
+ * is enough to check whether the old_folio is anonymous. This means that
+ * the reserve for this address was consumed. If reserves were used, a
+ * partially faulted mapping at the time of fork() could consume its reserves
+ * on COW instead of the full address range.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
- old_folio != pagecache_folio)
- outside_reserve = 1;
+ folio_test_anon(old_folio))
+ cow_from_owner = true;
folio_get(old_folio);
@@ -5861,7 +6214,7 @@ retry_avoidcopy:
* be acquired again before returning to the caller, as expected.
*/
spin_unlock(vmf->ptl);
- new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve);
+ new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner);
if (IS_ERR(new_folio)) {
/*
@@ -5871,7 +6224,7 @@ retry_avoidcopy:
* reliability, unmap the page from child processes. The child
* may get SIGKILLed if it later faults.
*/
- if (outside_reserve) {
+ if (cow_from_owner) {
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx;
u32 hash;
@@ -5891,8 +6244,7 @@ retry_avoidcopy:
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- unmap_ref_private(mm, vma, &old_folio->page,
- vmf->address);
+ unmap_ref_private(mm, vma, old_folio, vmf->address);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vma_lock_read(vma);
@@ -5939,7 +6291,7 @@ retry_avoidcopy:
spin_lock(vmf->ptl);
vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) {
- pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
+ pte_t newpte = make_huge_pte(vma, new_folio, !unshare);
/* Break COW or unshare */
huge_ptep_clear_flush(vma, vmf->address, vmf->pte);
@@ -6055,16 +6407,16 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned
static vm_fault_t hugetlb_no_page(struct address_space *mapping,
struct vm_fault *vmf)
{
+ u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
+ bool new_folio, new_anon_folio = false;
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
struct hstate *h = hstate_vma(vma);
vm_fault_t ret = VM_FAULT_SIGBUS;
- int anon_rmap = 0;
- unsigned long size;
+ bool folio_locked = true;
struct folio *folio;
+ unsigned long size;
pte_t new_pte;
- bool new_folio, new_pagecache_folio = false;
- u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
/*
* Currently, we are forced to kill the process in the event the
@@ -6122,7 +6474,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
goto out;
}
- folio = alloc_hugetlb_folio(vma, vmf->address, 0);
+ folio = alloc_hugetlb_folio(vma, vmf->address, false);
if (IS_ERR(folio)) {
/*
* Returning error will result in faulting task being
@@ -6163,10 +6515,9 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
ret = VM_FAULT_SIGBUS;
goto out;
}
- new_pagecache_folio = true;
} else {
+ new_anon_folio = true;
folio_lock(folio);
- anon_rmap = 1;
}
} else {
/*
@@ -6215,12 +6566,11 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte))
goto backout;
- if (anon_rmap)
+ if (new_anon_folio)
hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
else
hugetlb_add_file_rmap(folio);
- new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE)
- && (vma->vm_flags & VM_SHARED)));
+ new_pte = make_huge_pte(vma, folio, vma->vm_flags & VM_SHARED);
/*
* If this pte was previously wr-protected, keep it wr-protected even
* if populated.
@@ -6231,8 +6581,16 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+ /*
+ * No need to keep file folios locked. See comment in
+ * hugetlb_fault().
+ */
+ if (!new_anon_folio) {
+ folio_locked = false;
+ folio_unlock(folio);
+ }
/* Optimization, do the COW without a second fault */
- ret = hugetlb_wp(folio, vmf);
+ ret = hugetlb_wp(vmf);
}
spin_unlock(vmf->ptl);
@@ -6245,7 +6603,8 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
if (new_folio)
folio_set_hugetlb_migratable(folio);
- folio_unlock(folio);
+ if (folio_locked)
+ folio_unlock(folio);
out:
hugetlb_vma_unlock_read(vma);
@@ -6262,7 +6621,8 @@ out:
backout:
spin_unlock(vmf->ptl);
backout_unlocked:
- if (new_folio && !new_pagecache_folio)
+ /* We only need to restore reservations for private mappings */
+ if (new_anon_folio)
restore_reserve_on_error(h, vma, vmf->address, folio);
folio_unlock(folio);
@@ -6300,10 +6660,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vm_fault_t ret;
u32 hash;
struct folio *folio = NULL;
- struct folio *pagecache_folio = NULL;
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
- int need_wait_lock = 0;
+ bool need_wait_lock = false;
struct vm_fault vmf = {
.vma = vma,
.address = address & huge_page_mask(h),
@@ -6369,15 +6728,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ret = 0;
- /*
- * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this
- * point, so this check prevents the kernel from going below assuming
- * that we have an active hugepage in pagecache. This goto expects
- * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned)
- * check will properly handle it.
- */
+ /* Not present, either a migration or a hwpoisoned entry */
if (!pte_present(vmf.orig_pte)) {
- if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) {
+ if (is_hugetlb_entry_migration(vmf.orig_pte)) {
/*
* Release the hugetlb fault lock now, but retain
* the vma lock, because it is needed to guard the
@@ -6388,7 +6741,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
migration_entry_wait_huge(vma, vmf.address, vmf.pte);
return 0;
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte)))
+ } else if (is_hugetlb_entry_hwpoisoned(vmf.orig_pte))
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
@@ -6398,8 +6751,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* If we are going to COW/unshare the mapping later, we examine the
* pending reservations for this page now. This will ensure that any
* allocations necessary to record that reservation occur outside the
- * spinlock. Also lookup the pagecache page now as it is used to
- * determine if a reservation has been consumed.
+ * spinlock.
*/
if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
!(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
@@ -6409,11 +6761,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, vmf.address);
-
- pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,
- vmf.pgoff);
- if (IS_ERR(pagecache_folio))
- pagecache_folio = NULL;
}
vmf.ptl = huge_pte_lock(h, mm, vmf.pte);
@@ -6427,10 +6774,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
(flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
if (!userfaultfd_wp_async(vma)) {
spin_unlock(vmf.ptl);
- if (pagecache_folio) {
- folio_unlock(pagecache_folio);
- folio_put(pagecache_folio);
- }
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return handle_userfault(&vmf, VM_UFFD_WP);
@@ -6442,24 +6785,24 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* Fallthrough to CoW */
}
- /*
- * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and
- * pagecache_folio, so here we need take the former one
- * when folio != pagecache_folio or !pagecache_folio.
- */
- folio = page_folio(pte_page(vmf.orig_pte));
- if (folio != pagecache_folio)
- if (!folio_trylock(folio)) {
- need_wait_lock = 1;
- goto out_ptl;
- }
-
- folio_get(folio);
-
if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!huge_pte_write(vmf.orig_pte)) {
- ret = hugetlb_wp(pagecache_folio, &vmf);
- goto out_put_page;
+ /*
+ * Anonymous folios need to be locked since hugetlb_wp()
+ * checks whether the folio can be re-used exclusively
+ * for us, i.e. whether we are its only user.
+ */
+ folio = page_folio(pte_page(vmf.orig_pte));
+ if (folio_test_anon(folio) && !folio_trylock(folio)) {
+ need_wait_lock = true;
+ goto out_ptl;
+ }
+ folio_get(folio);
+ ret = hugetlb_wp(&vmf);
+ if (folio_test_anon(folio))
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out_ptl;
} else if (likely(flags & FAULT_FLAG_WRITE)) {
vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
}
@@ -6468,17 +6811,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
flags & FAULT_FLAG_WRITE))
update_mmu_cache(vma, vmf.address, vmf.pte);
-out_put_page:
- if (folio != pagecache_folio)
- folio_unlock(folio);
- folio_put(folio);
out_ptl:
spin_unlock(vmf.ptl);
-
- if (pagecache_folio) {
- folio_unlock(pagecache_folio);
- folio_put(pagecache_folio);
- }
out_mutex:
hugetlb_vma_unlock_read(vma);
@@ -6491,11 +6825,16 @@ out_mutex:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
/*
- * Generally it's safe to hold refcount during waiting page lock. But
- * here we just wait to defer the next page fault to avoid busy loop and
- * the page is not used after unlocked before returning from the current
- * page fault. So we are safe from accessing freed page, even if we wait
- * here without taking refcount.
+ * hugetlb_wp drops all locks except the folio lock before trying to
+ * unmap the folio from other processes. During that window, if another
+ * process mapping that folio faults in, it will take the mutex and then
+ * wait on folio_lock, causing an ABBA deadlock.
+ * Use trylock instead and bail out if we fail.
+ *
+ * Ideally, we should hold a refcount on the folio we wait for, but we do
+ * not want to use the folio after it becomes unlocked; we just
+ * wait for it to become unlocked, so hopefully the next fault succeeds on
+ * the trylock.
*/
if (need_wait_lock)
folio_wait_locked(folio);
@@ -6552,7 +6891,6 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
spinlock_t *ptl;
int ret = -ENOMEM;
struct folio *folio;
- int writable;
bool folio_in_pagecache = false;
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
@@ -6590,7 +6928,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
goto out;
}
- folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
+ folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
if (IS_ERR(folio)) {
ret = -ENOMEM;
goto out;
@@ -6632,7 +6970,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
goto out;
}
- folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
+ folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
if (IS_ERR(folio)) {
folio_put(*foliop);
ret = -ENOMEM;
@@ -6706,12 +7044,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
* with wp flag set, don't set pte write bit.
*/
- if (wp_enabled || (is_continue && !vm_shared))
- writable = 0;
- else
- writable = dst_vma->vm_flags & VM_WRITE;
-
- _dst_pte = make_huge_pte(dst_vma, &folio->page, writable);
+ _dst_pte = make_huge_pte(dst_vma, folio,
+ !wp_enabled && !(is_continue && !vm_shared));
/*
* Always mark UFFDIO_COPY page dirty; note that this may not be
* extremely important for hugetlbfs for now since swapping is not
@@ -6820,11 +7154,11 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
/* Nothing to do. */
} else if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
- struct page *page = pfn_swap_entry_to_page(entry);
+ struct folio *folio = pfn_swap_entry_folio(entry);
pte_t newpte = pte;
if (is_writable_migration_entry(entry)) {
- if (PageAnon(page))
+ if (folio_test_anon(folio))
entry = make_readable_exclusive_migration_entry(
swp_offset(entry));
else
@@ -6872,6 +7206,8 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
psize);
}
spin_unlock(ptl);
+
+ cond_resched();
}
/*
* Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
@@ -6898,13 +7234,20 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
return pages > 0 ? (pages << h->order) : pages;
}
-/* Return true if reservation was successful, false otherwise. */
-bool hugetlb_reserve_pages(struct inode *inode,
+/*
+ * Update the reservation map for the range [from, to].
+ *
+ * Returns the number of entries that would be added to the reservation map
+ * associated with the range [from, to]. This number is greater than or
+ * equal to zero. -EINVAL or -ENOMEM is returned in case of any errors.
+ */
+
+long hugetlb_reserve_pages(struct inode *inode,
long from, long to,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long chg = -1, add = -1;
+ long chg = -1, add = -1, spool_resv, gbl_resv;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
@@ -6914,7 +7257,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
/* This should never happen */
if (from > to) {
VM_WARN(1, "%s called with a negative range\n", __func__);
- return false;
+ return -EINVAL;
}
/*
@@ -6929,7 +7272,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
* without using reserves
*/
if (vm_flags & VM_NORESERVE)
- return true;
+ return 0;
/*
* Shared mappings base their reservation on the number of pages that
@@ -7036,11 +7379,19 @@ bool hugetlb_reserve_pages(struct inode *inode,
hugetlb_cgroup_put_rsvd_cgroup(h_cg);
}
}
- return true;
+ return chg;
out_put_pages:
- /* put back original number of pages, chg */
- (void)hugepage_subpool_put_pages(spool, chg);
+ spool_resv = chg - gbl_reserve;
+ if (spool_resv) {
+ /* Put the subpool's reservation back: chg - gbl_reserve pages */
+ gbl_resv = hugepage_subpool_put_pages(spool, spool_resv);
+ /*
+ * Reserved pages that cannot be put back into the subpool due to a
+ * race are returned to the hstate instead.
+ */
+ hugetlb_acct_memory(h, -gbl_resv);
+ }
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
@@ -7056,7 +7407,7 @@ out_err:
kref_put(&resv_map->refs, resv_map_release);
set_vma_resv_map(vma, NULL);
}
- return false;
+ return chg < 0 ? chg : add < 0 ? add : -EINVAL;
}
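+
+/*
+ * Caller sketch (hypothetical, illustrating the new return convention):
+ *
+ *   chg = hugetlb_reserve_pages(inode, from, to, vma, vm_flags);
+ *   if (chg < 0)
+ *       return chg;
+ *
+ * On success, chg is the number of reserve map entries for the range,
+ * which a caller such as hugetlbfs could use for accounting.
+ */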
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
@@ -7111,8 +7462,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
unsigned long s_end = sbase + PUD_SIZE;
/* Allow segments to share if only one is marked locked */
- unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
- unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
+ vm_flags_t vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
+ vm_flags_t svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
/*
* match the virtual addresses, permission and the alignment of the
@@ -7251,10 +7602,17 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
hugetlb_vma_assert_locked(vma);
if (sz != PMD_SIZE)
return 0;
- if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep)))
+ if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep)))
return 0;
pud_clear(pud);
+ /*
+ * Once our caller drops the rmap lock, some other process might be
+ * using this page table as a normal, non-hugetlb page table.
+ * Wait for pending gup_fast() in other threads to finish before letting
+ * that happen.
+ */
+ tlb_remove_table_sync_one();
ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
mm_dec_nr_pmds(mm);
return 1;
@@ -7390,7 +7748,24 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
-bool isolate_hugetlb(struct folio *folio, struct list_head *list)
+/**
+ * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio
+ * @folio: the folio to isolate
+ * @list: the list to add the folio to on success
+ *
+ * Isolate an allocated (refcount > 0) hugetlb folio, marking it as
+ * isolated/non-migratable, and moving it from the active list to the
+ * given list.
+ *
+ * Isolation will fail if @folio is not an allocated hugetlb folio, or if
+ * it is already isolated/non-migratable.
+ *
+ * On success, an additional folio reference is taken that must be dropped
+ * using folio_putback_hugetlb() to undo the isolation.
+ *
+ * Return: True if isolation worked, otherwise False.
+ */
+bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list)
{
bool ret = true;
@@ -7438,7 +7813,18 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
return ret;
}
-void folio_putback_active_hugetlb(struct folio *folio)
+/**
+ * folio_putback_hugetlb - unisolate a hugetlb folio
+ * @folio: the isolated hugetlb folio
+ *
+ * Putback/un-isolate the hugetlb folio that was previously isolated using
+ * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it
+ * back onto the active list.
+ *
+ * Will drop the additional folio reference obtained through
+ * folio_isolate_hugetlb().
+ */
+void folio_putback_hugetlb(struct folio *folio)
{
spin_lock_irq(&hugetlb_lock);
folio_set_hugetlb_migratable(folio);
@@ -7452,7 +7838,7 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
struct hstate *h = folio_hstate(old_folio);
hugetlb_cgroup_migrate(old_folio, new_folio);
- set_page_owner_migrate_reason(&new_folio->page, reason);
+ folio_set_owner_migrate_reason(new_folio, reason);
/*
* transfer temporary state of the new hugetlb folio. This is
@@ -7485,11 +7871,28 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
}
spin_unlock_irq(&hugetlb_lock);
}
+
+ /*
+ * Our old folio is isolated and has "migratable" cleared until it
+ * is putback. As migration succeeded, set the new folio "migratable"
+ * and add it to the active list.
+ */
+ spin_lock_irq(&hugetlb_lock);
+ folio_set_hugetlb_migratable(new_folio);
+ list_move_tail(&new_folio->lru, &(folio_hstate(new_folio))->hugepage_activelist);
+ spin_unlock_irq(&hugetlb_lock);
}
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool take_locks)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
@@ -7513,8 +7916,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
- hugetlb_vma_lock_write(vma);
- i_mmap_lock_write(vma->vm_file->f_mapping);
+ if (take_locks) {
+ hugetlb_vma_lock_write(vma);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ }
for (address = start; address < end; address += PUD_SIZE) {
ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
@@ -7524,8 +7931,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
- hugetlb_vma_unlock_write(vma);
+ if (take_locks) {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
* Documentation/mm/mmu_notifier.rst.
@@ -7540,165 +7949,20 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
- ALIGN_DOWN(vma->vm_end, PUD_SIZE));
-}
-
-#ifdef CONFIG_CMA
-static bool cma_reserve_called __initdata;
-
-static int __init cmdline_parse_hugetlb_cma(char *p)
-{
- int nid, count = 0;
- unsigned long tmp;
- char *s = p;
-
- while (*s) {
- if (sscanf(s, "%lu%n", &tmp, &count) != 1)
- break;
-
- if (s[count] == ':') {
- if (tmp >= MAX_NUMNODES)
- break;
- nid = array_index_nospec(tmp, MAX_NUMNODES);
-
- s += count + 1;
- tmp = memparse(s, &s);
- hugetlb_cma_size_in_node[nid] = tmp;
- hugetlb_cma_size += tmp;
-
- /*
- * Skip the separator if have one, otherwise
- * break the parsing.
- */
- if (*s == ',')
- s++;
- else
- break;
- } else {
- hugetlb_cma_size = memparse(p, &p);
- break;
- }
- }
-
- return 0;
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+ /* take_locks = */ true);
}
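A minimal sketch of the other side of the take_locks split above: a caller that already holds the file rmap lock passes take_locks = false and only asserts the lock. This is illustrative only (the wrapper name is invented and, since hugetlb_unshare_pmds() is static, such a caller would have to live in mm/hugetlb.c):

static void unshare_pmds_already_locked(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	/* The caller already holds the file rmap lock for write. */
	i_mmap_assert_write_locked(vma->vm_file->f_mapping);

	/* Mirror the PUD alignment done by hugetlb_unshare_all_pmds() above. */
	hugetlb_unshare_pmds(vma, ALIGN(start, PUD_SIZE),
			     ALIGN_DOWN(end, PUD_SIZE),
			     /* take_locks = */ false);
}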
-early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
-
-void __init hugetlb_cma_reserve(int order)
-{
- unsigned long size, reserved, per_node;
- bool node_specific_cma_alloc = false;
- int nid;
-
- /*
- * HugeTLB CMA reservation is required for gigantic
- * huge pages which could not be allocated via the
- * page allocator. Just warn if there is any change
- * breaking this assumption.
- */
- VM_WARN_ON(order <= MAX_PAGE_ORDER);
- cma_reserve_called = true;
-
- if (!hugetlb_cma_size)
- return;
-
- for (nid = 0; nid < MAX_NUMNODES; nid++) {
- if (hugetlb_cma_size_in_node[nid] == 0)
- continue;
-
- if (!node_online(nid)) {
- pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
- hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
- hugetlb_cma_size_in_node[nid] = 0;
- continue;
- }
-
- if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
- pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
- nid, (PAGE_SIZE << order) / SZ_1M);
- hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
- hugetlb_cma_size_in_node[nid] = 0;
- } else {
- node_specific_cma_alloc = true;
- }
- }
-
- /* Validate the CMA size again in case some invalid nodes specified. */
- if (!hugetlb_cma_size)
- return;
-
- if (hugetlb_cma_size < (PAGE_SIZE << order)) {
- pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
- (PAGE_SIZE << order) / SZ_1M);
- hugetlb_cma_size = 0;
- return;
- }
-
- if (!node_specific_cma_alloc) {
- /*
- * If 3 GB area is requested on a machine with 4 numa nodes,
- * let's allocate 1 GB on first three nodes and ignore the last one.
- */
- per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
- pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
- hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
- }
-
- reserved = 0;
- for_each_online_node(nid) {
- int res;
- char name[CMA_MAX_NAME];
-
- if (node_specific_cma_alloc) {
- if (hugetlb_cma_size_in_node[nid] == 0)
- continue;
-
- size = hugetlb_cma_size_in_node[nid];
- } else {
- size = min(per_node, hugetlb_cma_size - reserved);
- }
-
- size = round_up(size, PAGE_SIZE << order);
-
- snprintf(name, sizeof(name), "hugetlb%d", nid);
- /*
- * Note that 'order per bit' is based on smallest size that
- * may be returned to CMA allocator in the case of
- * huge page demotion.
- */
- res = cma_declare_contiguous_nid(0, size, 0,
- PAGE_SIZE << order,
- HUGETLB_PAGE_ORDER, false, name,
- &hugetlb_cma[nid], nid);
- if (res) {
- pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
- res, nid);
- continue;
- }
-
- reserved += size;
- pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
- size / SZ_1M, nid);
-
- if (reserved >= hugetlb_cma_size)
- break;
- }
-
- if (!reserved)
- /*
- * hugetlb_cma_size is used to determine if allocations from
- * cma are possible. Set to zero if no cma regions are set up.
- */
- hugetlb_cma_size = 0;
-}
-
-static void __init hugetlb_cma_check(void)
+/*
+ * For hugetlb, mremap() is an odd edge case - while the VMA copying is
+ * performed, we permit both the old and new VMAs to reference the same
+ * reservation.
+ *
+ * We fix this up after the operation succeeds, or if a newly allocated VMA
+ * is closed as a result of a failure to allocate memory.
+ */
+void fixup_hugetlb_reservations(struct vm_area_struct *vma)
{
- if (!hugetlb_cma_size || cma_reserve_called)
- return;
-
- pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
+ if (is_vm_hugetlb_page(vma))
+ clear_vma_resv_huge_pages(vma);
}
-
-#endif /* CONFIG_CMA */
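The kernel-doc added above pairs folio_isolate_hugetlb() with folio_putback_hugetlb(). A minimal sketch of that pairing, assuming the folio comes from elsewhere and omitting the actual migration work (the function name is illustrative, not part of the patch):

static void isolate_then_putback(struct folio *folio)
{
	LIST_HEAD(isolated);

	/*
	 * On success this takes an extra folio reference and moves the folio
	 * off the active list, marking it non-migratable.
	 */
	if (!folio_isolate_hugetlb(folio, &isolated))
		return;

	/* ... attempt to migrate everything on @isolated here ... */

	/*
	 * Undo the isolation: re-mark the folio migratable, put it back on
	 * the active list and drop the extra reference.
	 */
	folio_putback_hugetlb(folio);
}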
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index e716c4671a15..58e895f3899a 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -101,10 +101,9 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
int idx;
for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
- struct page_counter *fault_parent = NULL;
- struct page_counter *rsvd_parent = NULL;
+ struct page_counter *fault, *fault_parent = NULL;
+ struct page_counter *rsvd, *rsvd_parent = NULL;
unsigned long limit;
- int ret;
if (parent_h_cgroup) {
fault_parent = hugetlb_cgroup_counter_from_cgroup(
@@ -112,24 +111,22 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
parent_h_cgroup, idx);
}
- page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup,
- idx),
- fault_parent, false);
- page_counter_init(
- hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
- rsvd_parent, false);
+ fault = hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx);
+ rsvd = hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx);
+
+ page_counter_init(fault, fault_parent, false);
+ page_counter_init(rsvd, rsvd_parent, false);
+
+ if (!cgroup_subsys_on_dfl(hugetlb_cgrp_subsys)) {
+ fault->track_failcnt = true;
+ rsvd->track_failcnt = true;
+ }
limit = round_down(PAGE_COUNTER_MAX,
pages_per_huge_page(&hstates[idx]));
- ret = page_counter_set_max(
- hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx),
- limit);
- VM_BUG_ON(ret);
- ret = page_counter_set_max(
- hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
- limit);
- VM_BUG_ON(ret);
+ VM_BUG_ON(page_counter_set_max(fault, limit));
+ VM_BUG_ON(page_counter_set_max(rsvd, limit));
}
}
@@ -195,24 +192,23 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
* cannot fail.
*/
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
- struct page *page)
+ struct folio *folio)
{
unsigned int nr_pages;
struct page_counter *counter;
- struct hugetlb_cgroup *page_hcg;
+ struct hugetlb_cgroup *hcg;
struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
- struct folio *folio = page_folio(page);
- page_hcg = hugetlb_cgroup_from_folio(folio);
+ hcg = hugetlb_cgroup_from_folio(folio);
/*
* We can have pages in active list without any cgroup
* ie, hugepage with less than 3 pages. We can safely
* ignore those pages.
*/
- if (!page_hcg || page_hcg != h_cg)
+ if (!hcg || hcg != h_cg)
goto out;
- nr_pages = compound_nr(page);
+ nr_pages = folio_nr_pages(folio);
if (!parent) {
parent = root_h_cgroup;
/* root has no limit */
@@ -235,13 +231,13 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
struct hstate *h;
- struct page *page;
+ struct folio *folio;
do {
for_each_hstate(h) {
spin_lock_irq(&hugetlb_lock);
- list_for_each_entry(page, &h->hugepage_activelist, lru)
- hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page);
+ list_for_each_entry(folio, &h->hugepage_activelist, lru)
+ hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio);
spin_unlock_irq(&hugetlb_lock);
}
@@ -917,7 +913,6 @@ void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio)
set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd);
list_move(&new_folio->lru, &h->hugepage_activelist);
spin_unlock_irq(&hugetlb_lock);
- return;
}
static struct cftype hugetlb_files[] = {
diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c
new file mode 100644
index 000000000000..f58ef4969e7a
--- /dev/null
+++ b/mm/hugetlb_cma.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/mm.h>
+#include <linux/cma.h>
+#include <linux/compiler.h>
+#include <linux/mm_inline.h>
+
+#include <asm/page.h>
+#include <asm/setup.h>
+
+#include <linux/hugetlb.h>
+#include "internal.h"
+#include "hugetlb_cma.h"
+
+
+static struct cma *hugetlb_cma[MAX_NUMNODES];
+static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
+static bool hugetlb_cma_only;
+static unsigned long hugetlb_cma_size __initdata;
+
+void hugetlb_cma_free_folio(struct folio *folio)
+{
+ int nid = folio_nid(folio);
+
+ WARN_ON_ONCE(!cma_free_folio(hugetlb_cma[nid], folio));
+}
+
+
+struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
+{
+ int node;
+ int order = huge_page_order(h);
+ struct folio *folio = NULL;
+
+ if (hugetlb_cma[nid])
+ folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask);
+
+ if (!folio && !(gfp_mask & __GFP_THISNODE)) {
+ for_each_node_mask(node, *nodemask) {
+ if (node == nid || !hugetlb_cma[node])
+ continue;
+
+ folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask);
+ if (folio)
+ break;
+ }
+ }
+
+ if (folio)
+ folio_set_hugetlb_cma(folio);
+
+ return folio;
+}
+
+struct huge_bootmem_page * __init
+hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact)
+{
+ struct cma *cma;
+ struct huge_bootmem_page *m;
+ int node = *nid;
+
+ cma = hugetlb_cma[*nid];
+ m = cma_reserve_early(cma, huge_page_size(h));
+ if (!m) {
+ if (node_exact)
+ return NULL;
+
+ for_each_node_mask(node, hugetlb_bootmem_nodes) {
+ cma = hugetlb_cma[node];
+ if (!cma || node == *nid)
+ continue;
+ m = cma_reserve_early(cma, huge_page_size(h));
+ if (m) {
+ *nid = node;
+ break;
+ }
+ }
+ }
+
+ if (m) {
+ m->flags = HUGE_BOOTMEM_CMA;
+ m->cma = cma;
+ }
+
+ return m;
+}
+
+
+static bool cma_reserve_called __initdata;
+
+static int __init cmdline_parse_hugetlb_cma(char *p)
+{
+ int nid, count = 0;
+ unsigned long tmp;
+ char *s = p;
+
+ while (*s) {
+ if (sscanf(s, "%lu%n", &tmp, &count) != 1)
+ break;
+
+ if (s[count] == ':') {
+ if (tmp >= MAX_NUMNODES)
+ break;
+ nid = array_index_nospec(tmp, MAX_NUMNODES);
+
+ s += count + 1;
+ tmp = memparse(s, &s);
+ hugetlb_cma_size_in_node[nid] = tmp;
+ hugetlb_cma_size += tmp;
+
+ /*
+ * Skip the separator if we have one, otherwise
+ * stop parsing.
+ */
+ if (*s == ',')
+ s++;
+ else
+ break;
+ } else {
+ hugetlb_cma_size = memparse(p, &p);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
+
+static int __init cmdline_parse_hugetlb_cma_only(char *p)
+{
+ return kstrtobool(p, &hugetlb_cma_only);
+}
+
+early_param("hugetlb_cma_only", cmdline_parse_hugetlb_cma_only);
+
+void __init hugetlb_cma_reserve(int order)
+{
+ unsigned long size, reserved, per_node;
+ bool node_specific_cma_alloc = false;
+ int nid;
+
+ /*
+ * HugeTLB CMA reservation is required for gigantic
+ * huge pages which could not be allocated via the
+ * page allocator. Just warn if there is any change
+ * breaking this assumption.
+ */
+ VM_WARN_ON(order <= MAX_PAGE_ORDER);
+ cma_reserve_called = true;
+
+ if (!hugetlb_cma_size)
+ return;
+
+ hugetlb_bootmem_set_nodes();
+
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ if (hugetlb_cma_size_in_node[nid] == 0)
+ continue;
+
+ if (!node_isset(nid, hugetlb_bootmem_nodes)) {
+ pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
+ hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
+ hugetlb_cma_size_in_node[nid] = 0;
+ continue;
+ }
+
+ if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
+ pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
+ nid, (PAGE_SIZE << order) / SZ_1M);
+ hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
+ hugetlb_cma_size_in_node[nid] = 0;
+ } else {
+ node_specific_cma_alloc = true;
+ }
+ }
+
+ /* Validate the CMA size again in case some invalid nodes specified. */
+ if (!hugetlb_cma_size)
+ return;
+
+ if (hugetlb_cma_size < (PAGE_SIZE << order)) {
+ pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
+ (PAGE_SIZE << order) / SZ_1M);
+ hugetlb_cma_size = 0;
+ return;
+ }
+
+ if (!node_specific_cma_alloc) {
+ /*
+ * If a 3 GB area is requested on a machine with 4 NUMA nodes,
+ * allocate 1 GB on the first three nodes and ignore the last one.
+ */
+ per_node = DIV_ROUND_UP(hugetlb_cma_size,
+ nodes_weight(hugetlb_bootmem_nodes));
+ pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
+ hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
+ }
+
+ reserved = 0;
+ for_each_node_mask(nid, hugetlb_bootmem_nodes) {
+ int res;
+ char name[CMA_MAX_NAME];
+
+ if (node_specific_cma_alloc) {
+ if (hugetlb_cma_size_in_node[nid] == 0)
+ continue;
+
+ size = hugetlb_cma_size_in_node[nid];
+ } else {
+ size = min(per_node, hugetlb_cma_size - reserved);
+ }
+
+ size = round_up(size, PAGE_SIZE << order);
+
+ snprintf(name, sizeof(name), "hugetlb%d", nid);
+ /*
+ * Note that 'order per bit' is based on smallest size that
+ * may be returned to CMA allocator in the case of
+ * huge page demotion.
+ */
+ res = cma_declare_contiguous_multi(size, PAGE_SIZE << order,
+ HUGETLB_PAGE_ORDER, name,
+ &hugetlb_cma[nid], nid);
+ if (res) {
+ pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
+ res, nid);
+ continue;
+ }
+
+ reserved += size;
+ pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
+ size / SZ_1M, nid);
+
+ if (reserved >= hugetlb_cma_size)
+ break;
+ }
+
+ if (!reserved)
+ /*
+ * hugetlb_cma_size is used to determine if allocations from
+ * cma are possible. Set to zero if no cma regions are set up.
+ */
+ hugetlb_cma_size = 0;
+}
+
+void __init hugetlb_cma_check(void)
+{
+ if (!hugetlb_cma_size || cma_reserve_called)
+ return;
+
+ pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
+}
+
+bool hugetlb_cma_exclusive_alloc(void)
+{
+ return hugetlb_cma_only;
+}
+
+unsigned long __init hugetlb_cma_total_size(void)
+{
+ return hugetlb_cma_size;
+}
+
+void __init hugetlb_cma_validate_params(void)
+{
+ if (!hugetlb_cma_size)
+ hugetlb_cma_only = false;
+}
+
+bool __init hugetlb_early_cma(struct hstate *h)
+{
+ if (arch_has_huge_bootmem_alloc())
+ return false;
+
+ return hstate_is_gigantic(h) && hugetlb_cma_only;
+}
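As the parser moved into this file shows, hugetlb_cma= accepts either a single global size or a comma-separated list of node:size pairs, and hugetlb_cma_only= is parsed with kstrtobool(). Illustrative boot parameters only (the sizes are examples, not recommendations):

	hugetlb_cma=4G              one global reservation, spread over the bootmem nodes
	hugetlb_cma=0:2G,1:2G       per-node reservations on nodes 0 and 1
	hugetlb_cma_only=on         restrict gigantic page allocation to the CMA areas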
diff --git a/mm/hugetlb_cma.h b/mm/hugetlb_cma.h
new file mode 100644
index 000000000000..f7d7fb9880a2
--- /dev/null
+++ b/mm/hugetlb_cma.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_HUGETLB_CMA_H
+#define _LINUX_HUGETLB_CMA_H
+
+#ifdef CONFIG_CMA
+void hugetlb_cma_free_folio(struct folio *folio);
+struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask);
+struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid,
+ bool node_exact);
+void hugetlb_cma_check(void);
+bool hugetlb_cma_exclusive_alloc(void);
+unsigned long hugetlb_cma_total_size(void);
+void hugetlb_cma_validate_params(void);
+bool hugetlb_early_cma(struct hstate *h);
+#else
+static inline void hugetlb_cma_free_folio(struct folio *folio)
+{
+}
+
+static inline struct folio *hugetlb_cma_alloc_folio(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nodemask)
+{
+ return NULL;
+}
+
+static inline
+struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid,
+ bool node_exact)
+{
+ return NULL;
+}
+
+static inline void hugetlb_cma_check(void)
+{
+}
+
+static inline bool hugetlb_cma_exclusive_alloc(void)
+{
+ return false;
+}
+
+static inline unsigned long hugetlb_cma_total_size(void)
+{
+ return 0;
+}
+
+static inline void hugetlb_cma_validate_params(void)
+{
+}
+
+static inline bool hugetlb_early_cma(struct hstate *h)
+{
+ return false;
+}
+#endif
+#endif
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 57b7f591eee8..ba0fb1b6a5a8 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -166,7 +166,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
VM_BUG_ON(!PAGE_ALIGNED(start | end));
mmap_read_lock(&init_mm);
- ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
+ ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops,
NULL, walk);
mmap_read_unlock(&init_mm);
if (ret)
@@ -238,11 +238,11 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
* struct page, the special metadata (e.g. page->flags or page->mapping)
* cannot copy to the tail struct page structs. The invalid value will be
* checked in the free_tail_page_prepare(). In order to avoid the message
- * of "corrupted mapping in tail page". We need to reset at least 3 (one
- * head struct page struct and two tail struct page structs) struct page
+ * of "corrupted mapping in tail page". We need to reset at least 4 (one
+ * head struct page struct and three tail struct page structs) struct page
* structs.
*/
-#define NR_RESET_STRUCT_PAGE 3
+#define NR_RESET_STRUCT_PAGE 4
static inline void reset_struct_pages(struct page *start)
{
@@ -444,7 +444,11 @@ DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
-core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
+static int __init hugetlb_vmemmap_optimize_param(char *buf)
+{
+ return kstrtobool(buf, &vmemmap_optimize_enabled);
+}
+early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param);
static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
struct folio *folio, unsigned long flags)
@@ -645,14 +649,39 @@ static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *fol
return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}
-void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
+static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
+ struct list_head *folio_list,
+ bool boot)
{
struct folio *folio;
+ int nr_to_optimize;
LIST_HEAD(vmemmap_pages);
unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;
+ nr_to_optimize = 0;
list_for_each_entry(folio, folio_list, lru) {
- int ret = hugetlb_vmemmap_split_folio(h, folio);
+ int ret;
+ unsigned long spfn, epfn;
+
+ if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) {
+ /*
+ * Already optimized by pre-HVO, just map the
+ * mirrored tail page structs RO.
+ */
+ spfn = (unsigned long)&folio->page;
+ epfn = spfn + pages_per_huge_page(h);
+ vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio),
+ HUGETLB_VMEMMAP_RESERVE_SIZE);
+ register_page_bootmem_memmap(pfn_to_section_nr(spfn),
+ &folio->page,
+ HUGETLB_VMEMMAP_RESERVE_SIZE);
+ static_branch_inc(&hugetlb_optimize_vmemmap_key);
+ continue;
+ }
+
+ nr_to_optimize++;
+
+ ret = hugetlb_vmemmap_split_folio(h, folio);
/*
* Splitting the PMD requires allocating a page, thus let's fail
@@ -664,6 +693,16 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
break;
}
+ if (!nr_to_optimize)
+ /*
+ * All pre-HVO folios, nothing left to do. It's ok if
+ * there is a mix of pre-HVO and not yet HVO-ed folios
+ * here, as __hugetlb_vmemmap_optimize_folio() will
+ * skip any folios that already have the optimized flag
+ * set, see vmemmap_should_optimize_folio().
+ */
+ goto out;
+
flush_tlb_all();
list_for_each_entry(folio, folio_list, lru) {
@@ -689,11 +728,165 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
}
}
+out:
flush_tlb_all();
free_vmemmap_page_list(&vmemmap_pages);
}
-static struct ctl_table hugetlb_vmemmap_sysctls[] = {
+void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
+{
+ __hugetlb_vmemmap_optimize_folios(h, folio_list, false);
+}
+
+void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list)
+{
+ __hugetlb_vmemmap_optimize_folios(h, folio_list, true);
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
+
+/* Return true if a bootmem-allocated HugeTLB page should be pre-HVO-ed */
+static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m)
+{
+ unsigned long section_size, psize, pmd_vmemmap_size;
+ phys_addr_t paddr;
+
+ if (!READ_ONCE(vmemmap_optimize_enabled))
+ return false;
+
+ if (!hugetlb_vmemmap_optimizable(m->hstate))
+ return false;
+
+ psize = huge_page_size(m->hstate);
+ paddr = virt_to_phys(m);
+
+ /*
+ * Pre-HVO only works if the bootmem huge page
+ * is aligned to the section size.
+ */
+ section_size = (1UL << PA_SECTION_SHIFT);
+ if (!IS_ALIGNED(paddr, section_size) ||
+ !IS_ALIGNED(psize, section_size))
+ return false;
+
+ /*
+ * The pre-HVO code does not deal with splitting PMDS,
+ * so the bootmem page must be aligned to the number
+ * of base pages that can be mapped with one vmemmap PMD.
+ */
+ pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT;
+ if (!IS_ALIGNED(paddr, pmd_vmemmap_size) ||
+ !IS_ALIGNED(psize, pmd_vmemmap_size))
+ return false;
+
+ return true;
+}
+
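To make the two alignment checks above concrete with assumed (not patch-provided) constants: on a configuration with 4 KiB base pages, a 64-byte struct page and a 2 MiB PMD, one vmemmap PMD covers 2 MiB / 64 = 32768 struct pages, i.e. 32768 << 12 = 128 MiB of memory, and with PA_SECTION_SHIFT = 27 the section size is also 128 MiB. A 1 GiB gigantic page placed at its natural alignment therefore passes both IS_ALIGNED() checks, whereas a 2 MiB page cannot and is never pre-HVO-ed.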
+/*
+ * Initialize memmap section for a gigantic page, HVO-style.
+ */
+void __init hugetlb_vmemmap_init_early(int nid)
+{
+ unsigned long psize, paddr, section_size;
+ unsigned long ns, i, pnum, pfn, nr_pages;
+ unsigned long start, end;
+ struct huge_bootmem_page *m = NULL;
+ void *map;
+
+ /*
+ * Nothing to do if bootmem pages were not allocated
+ * early in boot, or if HVO wasn't enabled in the
+ * first place.
+ */
+ if (!hugetlb_bootmem_allocated())
+ return;
+
+ if (!READ_ONCE(vmemmap_optimize_enabled))
+ return;
+
+ section_size = (1UL << PA_SECTION_SHIFT);
+
+ list_for_each_entry(m, &huge_boot_pages[nid], list) {
+ if (!vmemmap_should_optimize_bootmem_page(m))
+ continue;
+
+ nr_pages = pages_per_huge_page(m->hstate);
+ psize = nr_pages << PAGE_SHIFT;
+ paddr = virt_to_phys(m);
+ pfn = PHYS_PFN(paddr);
+ map = pfn_to_page(pfn);
+ start = (unsigned long)map;
+ end = start + nr_pages * sizeof(struct page);
+
+ if (vmemmap_populate_hvo(start, end, nid,
+ HUGETLB_VMEMMAP_RESERVE_SIZE) < 0)
+ continue;
+
+ memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE);
+
+ pnum = pfn_to_section_nr(pfn);
+ ns = psize / section_size;
+
+ for (i = 0; i < ns; i++) {
+ sparse_init_early_section(nid, map, pnum,
+ SECTION_IS_VMEMMAP_PREINIT);
+ map += section_map_size();
+ pnum++;
+ }
+
+ m->flags |= HUGE_BOOTMEM_HVO;
+ }
+}
+
+void __init hugetlb_vmemmap_init_late(int nid)
+{
+ struct huge_bootmem_page *m, *tm;
+ unsigned long phys, nr_pages, start, end;
+ unsigned long pfn, nr_mmap;
+ struct hstate *h;
+ void *map;
+
+ if (!hugetlb_bootmem_allocated())
+ return;
+
+ if (!READ_ONCE(vmemmap_optimize_enabled))
+ return;
+
+ list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
+ if (!(m->flags & HUGE_BOOTMEM_HVO))
+ continue;
+
+ phys = virt_to_phys(m);
+ h = m->hstate;
+ pfn = PHYS_PFN(phys);
+ nr_pages = pages_per_huge_page(h);
+
+ if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
+ /*
+ * Oops, the hugetlb page spans multiple zones.
+ * Remove it from the list, and undo HVO.
+ */
+ list_del(&m->list);
+
+ map = pfn_to_page(pfn);
+
+ start = (unsigned long)map;
+ end = start + nr_pages * sizeof(struct page);
+
+ vmemmap_undo_hvo(start, end, nid,
+ HUGETLB_VMEMMAP_RESERVE_SIZE);
+ nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE;
+ memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE));
+
+ memblock_phys_free(phys, huge_page_size(h));
+ continue;
+ } else
+ m->flags |= HUGE_BOOTMEM_ZONES_VALID;
+ }
+}
+#endif
+
+static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
{
.procname = "hugetlb_optimize_vmemmap",
.data = &vmemmap_optimize_enabled,
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 2fcae92d3359..18b490825215 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -9,6 +9,8 @@
#ifndef _LINUX_HUGETLB_VMEMMAP_H
#define _LINUX_HUGETLB_VMEMMAP_H
#include <linux/hugetlb.h>
+#include <linux/io.h>
+#include <linux/memblock.h>
/*
* Reserve one vmemmap page, all vmemmap addresses are mapped to it. See
@@ -24,6 +26,12 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h,
struct list_head *non_hvo_folios);
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio);
void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);
+void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list);
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
+void hugetlb_vmemmap_init_early(int nid);
+void hugetlb_vmemmap_init_late(int nid);
+#endif
+
static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
{
@@ -48,7 +56,7 @@ static inline int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct f
return 0;
}
-static long hugetlb_vmemmap_restore_folios(const struct hstate *h,
+static inline long hugetlb_vmemmap_restore_folios(const struct hstate *h,
struct list_head *folio_list,
struct list_head *non_hvo_folios)
{
@@ -64,6 +72,19 @@ static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list
{
}
+static inline void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h,
+ struct list_head *folio_list)
+{
+}
+
+static inline void hugetlb_vmemmap_init_early(int nid)
+{
+}
+
+static inline void hugetlb_vmemmap_init_late(int nid)
+{
+}
+
static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
{
return 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 24c809379274..4600e7605cab 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -40,7 +40,8 @@ struct mm_struct init_mm = {
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
#ifdef CONFIG_PER_VMA_LOCK
- .mm_lock_seq = 0,
+ .vma_writer_wait = __RCUWAIT_INITIALIZER(init_mm.vma_writer_wait),
+ .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq),
#endif
.user_ns = &init_user_ns,
.cpu_bitmap = CPU_BITS_NONE,
diff --git a/mm/internal.h b/mm/internal.h
index c9186ca8d7c2..45b725c3dc03 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -25,6 +25,47 @@
struct folio_batch;
/*
+ * Maintains state across a page table move. The operation assumes both source
+ * and destination VMAs already exist and are specified by the user.
+ *
+ * Partial moves are permitted, but the old and new ranges must both reside
+ * within a VMA.
+ *
+ * The mmap lock must be held for write, and VMA write locks must be held on any VMA
+ * that is visible.
+ *
+ * Use the PAGETABLE_MOVE() macro to initialise this struct.
+ *
+ * The old_addr and new_addr fields are updated as the page table move is
+ * executed.
+ *
+ * NOTE: The page table move is affected by reading from [old_addr, old_end),
+ * and old_addr may be updated for better page table alignment, so len_in
+ * represents the length of the range being copied as specified by the user.
+ */
+struct pagetable_move_control {
+ struct vm_area_struct *old; /* Source VMA. */
+ struct vm_area_struct *new; /* Destination VMA. */
+ unsigned long old_addr; /* Address from which the move begins. */
+ unsigned long old_end; /* Exclusive address at which old range ends. */
+ unsigned long new_addr; /* Address to move page tables to. */
+ unsigned long len_in; /* Bytes to remap specified by user. */
+
+ bool need_rmap_locks; /* Do rmap locks need to be taken? */
+ bool for_stack; /* Is this an early temp stack being moved? */
+};
+
+#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \
+ struct pagetable_move_control name = { \
+ .old = old_, \
+ .new = new_, \
+ .old_addr = old_addr_, \
+ .old_end = (old_addr_) + (len_), \
+ .new_addr = new_addr_, \
+ .len_in = len_, \
+ }
+
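A minimal sketch of how the macro above is meant to be used together with move_page_tables(), which is declared further down in this header. The wrapper and its comments are illustrative only; the locking prerequisites described above are assumed to be satisfied by the caller:

static unsigned long move_tables_example(struct vm_area_struct *old_vma,
					 struct vm_area_struct *new_vma,
					 unsigned long old_addr,
					 unsigned long new_addr,
					 unsigned long len)
{
	/* Caller holds the mmap lock for write and the relevant VMA write locks. */
	PAGETABLE_MOVE(pmc, old_vma, new_vma, old_addr, new_addr, len);

	pmc.need_rmap_locks = true;	/* e.g. when the move must be rmap-safe */
	return move_page_tables(&pmc);	/* how much of the range was moved */
}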
+/*
* The set of flags that only affect watermark checking and reclaim
* behaviour. This is used by the MM to obey the caller constraints
* about IO, FS and watermark checking while ignoring placement
@@ -84,6 +125,8 @@ void page_writeback_init(void);
*/
static inline int folio_nr_pages_mapped(const struct folio *folio)
{
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
+ return -1;
return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED;
}
@@ -106,7 +149,7 @@ static inline void *folio_raw_mapping(const struct folio *folio)
{
unsigned long mapping = (unsigned long)folio->mapping;
- return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
+ return (void *)(mapping & ~FOLIO_MAPPING_FLAGS);
}
/*
@@ -121,7 +164,7 @@ static inline void *folio_raw_mapping(const struct folio *folio)
*/
static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
{
- int err = call_mmap(file, vma);
+ int err = vfs_mmap(file, vma);
if (likely(!err))
return 0;
@@ -159,109 +202,126 @@ static inline void vma_close(struct vm_area_struct *vma)
/* Flags for folio_pte_batch(). */
typedef int __bitwise fpb_t;
-/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */
-#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0))
+/* Compare PTEs respecting the dirty bit. */
+#define FPB_RESPECT_DIRTY ((__force fpb_t)BIT(0))
+
+/* Compare PTEs respecting the soft-dirty bit. */
+#define FPB_RESPECT_SOFT_DIRTY ((__force fpb_t)BIT(1))
-/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */
-#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1))
+/* Compare PTEs respecting the writable bit. */
+#define FPB_RESPECT_WRITE ((__force fpb_t)BIT(2))
+
+/*
+ * Merge PTE write bits: if any PTE in the batch is writable, modify the
+ * PTE at @ptentp to be writable.
+ */
+#define FPB_MERGE_WRITE ((__force fpb_t)BIT(3))
+
+/*
+ * Merge PTE young and dirty bits: if any PTE in the batch is young or dirty,
+ * modify the PTE at @ptentp to be young or dirty, respectively.
+ */
+#define FPB_MERGE_YOUNG_DIRTY ((__force fpb_t)BIT(4))
static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
{
- if (flags & FPB_IGNORE_DIRTY)
+ if (!(flags & FPB_RESPECT_DIRTY))
pte = pte_mkclean(pte);
- if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
+ if (likely(!(flags & FPB_RESPECT_SOFT_DIRTY)))
pte = pte_clear_soft_dirty(pte);
- return pte_wrprotect(pte_mkold(pte));
+ if (likely(!(flags & FPB_RESPECT_WRITE)))
+ pte = pte_wrprotect(pte);
+ return pte_mkold(pte);
}
/**
- * folio_pte_batch - detect a PTE batch for a large folio
+ * folio_pte_batch_flags - detect a PTE batch for a large folio
* @folio: The large folio to detect a PTE batch for.
- * @addr: The user virtual address the first page is mapped at.
- * @start_ptep: Page table pointer for the first entry.
- * @pte: Page table entry for the first page.
+ * @vma: The VMA. Only relevant with FPB_MERGE_WRITE, otherwise can be NULL.
+ * @ptep: Page table pointer for the first entry.
+ * @ptentp: Pointer to a COPY of the first page table entry whose flags this
+ * function updates based on @flags if appropriate.
* @max_nr: The maximum number of table entries to consider.
* @flags: Flags to modify the PTE batch semantics.
- * @any_writable: Optional pointer to indicate whether any entry except the
- * first one is writable.
- * @any_young: Optional pointer to indicate whether any entry except the
- * first one is young.
- * @any_dirty: Optional pointer to indicate whether any entry except the
- * first one is dirty.
*
* Detect a PTE batch: consecutive (present) PTEs that map consecutive
- * pages of the same large folio.
+ * pages of the same large folio in a single VMA and a single page table.
*
* All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
- * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and
- * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
+ * the accessed bit, writable bit, dirty bit (unless FPB_RESPECT_DIRTY is set)
+ * and soft-dirty bit (unless FPB_RESPECT_SOFT_DIRTY is set).
+ *
+ * @ptep must map any page of the folio. max_nr must be at least one and
+ * must be limited by the caller so scanning cannot exceed a single VMA and
+ * a single page table.
+ *
+ * Depending on the FPB_MERGE_* flags, the pte stored at @ptentp will
+ * be updated: it's crucial that a pointer to a COPY of the first
+ * page table entry, obtained through ptep_get(), is provided as @ptentp.
*
- * start_ptep must map any page of the folio. max_nr must be at least one and
- * must be limited by the caller so scanning cannot exceed a single page table.
+ * This function will be inlined to optimize based on the input parameters;
+ * consider using folio_pte_batch() instead if applicable.
*
* Return: the number of table entries in the batch.
*/
-static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
- pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
- bool *any_writable, bool *any_young, bool *any_dirty)
+static inline unsigned int folio_pte_batch_flags(struct folio *folio,
+ struct vm_area_struct *vma, pte_t *ptep, pte_t *ptentp,
+ unsigned int max_nr, fpb_t flags)
{
- unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
- const pte_t *end_ptep = start_ptep + max_nr;
- pte_t expected_pte, *ptep;
- bool writable, young, dirty;
- int nr;
-
- if (any_writable)
- *any_writable = false;
- if (any_young)
- *any_young = false;
- if (any_dirty)
- *any_dirty = false;
+ bool any_writable = false, any_young = false, any_dirty = false;
+ pte_t expected_pte, pte = *ptentp;
+ unsigned int nr, cur_nr;
VM_WARN_ON_FOLIO(!pte_present(pte), folio);
VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
+ /*
+ * Ensure this is a pointer to a copy not a pointer into a page table.
+ * If this is a stack value, it won't be a valid virtual address, but
+ * that's fine because it also cannot be pointing into the page table.
+ */
+ VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp)));
+
+ /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
+ max_nr = min_t(unsigned long, max_nr,
+ folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));
- nr = pte_batch_hint(start_ptep, pte);
+ nr = pte_batch_hint(ptep, pte);
expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
- ptep = start_ptep + nr;
+ ptep = ptep + nr;
- while (ptep < end_ptep) {
+ while (nr < max_nr) {
pte = ptep_get(ptep);
- if (any_writable)
- writable = !!pte_write(pte);
- if (any_young)
- young = !!pte_young(pte);
- if (any_dirty)
- dirty = !!pte_dirty(pte);
- pte = __pte_batch_clear_ignored(pte, flags);
- if (!pte_same(pte, expected_pte))
- break;
-
- /*
- * Stop immediately once we reached the end of the folio. In
- * corner cases the next PFN might fall into a different
- * folio.
- */
- if (pte_pfn(pte) >= folio_end_pfn)
+ if (!pte_same(__pte_batch_clear_ignored(pte, flags), expected_pte))
break;
- if (any_writable)
- *any_writable |= writable;
- if (any_young)
- *any_young |= young;
- if (any_dirty)
- *any_dirty |= dirty;
-
- nr = pte_batch_hint(ptep, pte);
- expected_pte = pte_advance_pfn(expected_pte, nr);
- ptep += nr;
+ if (flags & FPB_MERGE_WRITE)
+ any_writable |= pte_write(pte);
+ if (flags & FPB_MERGE_YOUNG_DIRTY) {
+ any_young |= pte_young(pte);
+ any_dirty |= pte_dirty(pte);
+ }
+
+ cur_nr = pte_batch_hint(ptep, pte);
+ expected_pte = pte_advance_pfn(expected_pte, cur_nr);
+ ptep += cur_nr;
+ nr += cur_nr;
}
- return min(ptep - start_ptep, max_nr);
+ if (any_writable)
+ *ptentp = pte_mkwrite(*ptentp, vma);
+ if (any_young)
+ *ptentp = pte_mkyoung(*ptentp);
+ if (any_dirty)
+ *ptentp = pte_mkdirty(*ptentp);
+
+ return min(nr, max_nr);
}
+unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
+ unsigned int max_nr);
+
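A minimal caller sketch for the reworked interface, not part of the patch: the caller passes a copy of the first PTE obtained with ptep_get(), and the merge flags fold the batch's young/dirty state back into that copy. The max_nr limit and locking required by the kernel-doc above are assumed to be handled by the caller.

static unsigned int batch_example(struct folio *folio, struct vm_area_struct *vma,
				  pte_t *ptep, unsigned int max_nr)
{
	/* A copy, never a pointer into the page table itself. */
	pte_t pte = ptep_get(ptep);
	unsigned int nr;

	nr = folio_pte_batch_flags(folio, vma, ptep, &pte, max_nr,
				   FPB_MERGE_YOUNG_DIRTY);
	/* @pte now reflects the young/dirty bits of the whole batch of @nr entries. */
	return nr;
}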
/**
* pte_move_swp_offset - Move the swap entry offset field of a swap pte
* forward or backward by delta
@@ -392,9 +452,13 @@ void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details);
+void zap_page_range_single_batched(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr,
+ unsigned long size, struct zap_details *details);
+int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
+ gfp_t gfp);
-void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
- unsigned int order);
+void page_cache_ra_order(struct readahead_control *, struct file_ra_state *);
void force_page_cache_ra(struct readahead_control *, unsigned long nr);
static inline void force_page_cache_readahead(struct address_space *mapping,
struct file *file, pgoff_t index, unsigned long nr_to_read)
@@ -474,6 +538,16 @@ extern unsigned long highest_memmap_pfn;
bool folio_isolate_lru(struct folio *folio);
void folio_putback_lru(struct folio *folio);
extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
+#ifdef CONFIG_NUMA
+int user_proactive_reclaim(char *buf,
+ struct mem_cgroup *memcg, pg_data_t *pgdat);
+#else
+static inline int user_proactive_reclaim(char *buf,
+ struct mem_cgroup *memcg, pg_data_t *pgdat)
+{
+ return 0;
+}
+#endif
/*
* in mm/rmap.c:
@@ -491,6 +565,7 @@ extern char * const zone_names[MAX_NR_ZONES];
DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
extern int min_free_kbytes;
+extern int defrag_mode;
void setup_per_zone_wmarks(void);
void calculate_min_free_kbytes(void);
@@ -656,6 +731,8 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
}
void set_zone_contiguous(struct zone *zone);
+bool pfn_range_intersects_zones(int nid, unsigned long start_pfn,
+ unsigned long nr_pages);
static inline void clear_zone_contiguous(struct zone *zone)
{
@@ -680,8 +757,8 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
return;
folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order;
-#ifdef CONFIG_64BIT
- folio->_folio_nr_pages = 1U << order;
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+ folio->_nr_pages = 1U << order;
#endif
}
@@ -717,9 +794,17 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
folio_set_order(folio, order);
atomic_set(&folio->_large_mapcount, -1);
- atomic_set(&folio->_entire_mapcount, -1);
- atomic_set(&folio->_nr_pages_mapped, 0);
- atomic_set(&folio->_pincount, 0);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ atomic_set(&folio->_nr_pages_mapped, 0);
+ if (IS_ENABLED(CONFIG_MM_ID)) {
+ folio->_mm_ids = 0;
+ folio->_mm_id_mapcount[0] = -1;
+ folio->_mm_id_mapcount[1] = -1;
+ }
+ if (IS_ENABLED(CONFIG_64BIT) || order > 1) {
+ atomic_set(&folio->_pincount, 0);
+ atomic_set(&folio->_entire_mapcount, -1);
+ }
if (order > 1)
INIT_LIST_HEAD(&folio->_deferred_list);
}
@@ -733,17 +818,30 @@ static inline void prep_compound_tail(struct page *head, int tail_idx)
set_page_private(p, 0);
}
-extern void prep_compound_page(struct page *page, unsigned int order);
-
-extern void post_alloc_hook(struct page *page, unsigned int order,
- gfp_t gfp_flags);
+void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags);
extern bool free_pages_prepare(struct page *page, unsigned int order);
extern int user_min_free_kbytes;
-void free_unref_page(struct page *page, unsigned int order);
+struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid,
+ nodemask_t *);
+#define __alloc_frozen_pages(...) \
+ alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__))
+void free_frozen_pages(struct page *page, unsigned int order);
void free_unref_folios(struct folio_batch *fbatch);
+#ifdef CONFIG_NUMA
+struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order);
+#else
+static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order)
+{
+ return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL);
+}
+#endif
+
+#define alloc_frozen_pages(...) \
+ alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__))
+
extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
extern void zone_pcp_enable(struct zone *zone);
@@ -754,7 +852,8 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
int nid, bool exact_nid);
void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
- unsigned long, enum meminit_context, struct vmem_altmap *, int);
+ unsigned long, enum meminit_context, struct vmem_altmap *, int,
+ bool);
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -824,17 +923,29 @@ int
isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
-int __alloc_contig_migrate_range(struct compact_control *cc,
- unsigned long start, unsigned long end,
- int migratetype);
-
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void init_cma_reserved_pageblock(struct page *page);
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
+struct cma;
+
+#ifdef CONFIG_CMA
+void *cma_reserve_early(struct cma *cma, unsigned long size);
+void init_cma_pageblock(struct page *page);
+#else
+static inline void *cma_reserve_early(struct cma *cma, unsigned long size)
+{
+ return NULL;
+}
+static inline void init_cma_pageblock(struct page *page)
+{
+}
+#endif
+
+
int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool only_stealable, bool *can_steal);
+ int migratetype, bool claimable);
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
@@ -850,7 +961,7 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *locked);
extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
unsigned long end, bool write, int *locked);
-extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
+extern bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags,
unsigned long bytes);
/*
@@ -1040,6 +1151,8 @@ DECLARE_STATIC_KEY_TRUE(deferred_pages);
bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+void init_deferred_page(unsigned long pfn, int nid);
+
enum mminit_level {
MMINIT_WARNING,
MMINIT_VERIFY,
@@ -1084,9 +1197,13 @@ static inline void mminit_verify_zonelist(void)
#define NODE_RECLAIM_SUCCESS 1
#ifdef CONFIG_NUMA
+extern int node_reclaim_mode;
+
extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
extern int find_next_best_node(int node, nodemask_t *used_node_mask);
#else
+#define node_reclaim_mode 0
+
static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
unsigned int order)
{
@@ -1098,6 +1215,12 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
}
#endif
+static inline bool node_reclaim_enabled(void)
+{
+ /* Is any node_reclaim_mode bit set? */
+ return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
+}
+
/*
* mm/memory-failure.c
*/
@@ -1136,7 +1259,6 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long);
extern void set_pageblock_order(void);
-struct folio *alloc_migrate_folio(struct folio *src, unsigned long private);
unsigned long reclaim_pages(struct list_head *folio_list);
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
struct list_head *folio_list);
@@ -1175,6 +1297,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_NOFRAGMENT 0x0
#endif
#define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */
+#define ALLOC_TRYLOCK 0x400 /* Only use spin_trylock in allocation path */
#define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
/* Flags that allow allocations below the min watermark. */
@@ -1268,7 +1391,7 @@ int migrate_device_coherent_folio(struct folio *folio);
struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long shift,
- unsigned long flags, unsigned long start,
+ unsigned long vm_flags, unsigned long start,
unsigned long end, int node, gfp_t gfp_mask,
const void *caller);
@@ -1395,7 +1518,8 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
}
extern bool mirrored_kernelcore;
-extern bool memblock_has_mirror(void);
+bool memblock_has_mirror(void);
+void memblock_free_all(void);
static __always_inline void vma_set_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
@@ -1436,27 +1560,12 @@ static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte
void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid);
+void __meminit __init_page_from_nid(unsigned long pfn, int nid);
/* shrinker related functions */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
int priority);
-#ifdef CONFIG_64BIT
-static inline int can_do_mseal(unsigned long flags)
-{
- if (flags)
- return -EINVAL;
-
- return 0;
-}
-
-#else
-static inline int can_do_mseal(unsigned long flags)
-{
- return -EPERM;
-}
-#endif
-
#ifdef CONFIG_SHRINKER_DEBUG
static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
struct shrinker *shrinker, const char *fmt, va_list ap)
@@ -1513,10 +1622,7 @@ extern struct list_lru shadow_nodes;
} while (0)
/* mremap.c */
-unsigned long move_page_tables(struct vm_area_struct *vma,
- unsigned long old_addr, struct vm_area_struct *new_vma,
- unsigned long new_addr, unsigned long len,
- bool need_rmap_locks, bool for_stack);
+unsigned long move_page_tables(struct pagetable_move_control *pmc);
#ifdef CONFIG_UNACCEPTED_MEMORY
void accept_page(struct page *page);
@@ -1530,5 +1636,29 @@ static inline void accept_page(struct page *page)
int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private);
+int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ pgd_t *pgd, void *private);
+
+/* pt_reclaim.c */
+bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval);
+void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb,
+ pmd_t pmdval);
+void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+ struct mmu_gather *tlb);
+
+#ifdef CONFIG_PT_RECLAIM
+bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
+ struct zap_details *details);
+#else
+static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
+ struct zap_details *details)
+{
+ return false;
+}
+#endif /* CONFIG_PT_RECLAIM */
+
+void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
+int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);
#endif /* __MM_INTERNAL_H */
diff --git a/mm/io-mapping.c b/mm/io-mapping.c
deleted file mode 100644
index 01b362799930..000000000000
--- a/mm/io-mapping.c
+++ /dev/null
@@ -1,29 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#include <linux/mm.h>
-#include <linux/io-mapping.h>
-
-/**
- * io_mapping_map_user - remap an I/O mapping to userspace
- * @iomap: the source io_mapping
- * @vma: user vma to map to
- * @addr: target user address to start at
- * @pfn: physical address of kernel memory
- * @size: size of map area
- *
- * Note: this is only safe if the mm semaphore is held when called.
- */
-int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
- unsigned long addr, unsigned long pfn, unsigned long size)
-{
- vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
-
- if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags))
- return -EINVAL;
-
- /* We rely on prevalidation of the io-mapping to skip track_pfn(). */
- return remap_pfn_range_notrack(vma, addr, pfn, size,
- __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
- (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)));
-}
-EXPORT_SYMBOL_GPL(io_mapping_map_user);
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 3e049dfb28bd..c36dd9f62fd5 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -50,9 +50,9 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size,
#ifndef ioremap_prot
void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
- unsigned long prot)
+ pgprot_t prot)
{
- return generic_ioremap_prot(phys_addr, size, __pgprot(prot));
+ return generic_ioremap_prot(phys_addr, size, prot);
}
EXPORT_SYMBOL(ioremap_prot);
#endif
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 1a958e7c8a46..dd93ae8a6beb 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -35,7 +35,7 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) $(call cc-disable-warning, vla)
+CFLAGS_KASAN_TEST := $(CFLAGS_KASAN)
ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
# If compiler instruments memintrinsics by prefixing them with __asan/__hwasan,
# we need to treat them normally (as builtins), otherwise the compiler won't
@@ -44,6 +44,7 @@ ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
CFLAGS_KASAN_TEST += -fno-builtin
endif
+CFLAGS_REMOVE_kasan_test_c.o += $(call cc-option, -Wvla-larger-than=1)
CFLAGS_kasan_test_c.o := $(CFLAGS_KASAN_TEST)
RUSTFLAGS_kasan_test_rust.o := $(RUSTFLAGS_KASAN)
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index ed4873e18c75..9142964ab9c9 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -230,16 +230,12 @@ static bool check_slab_allocation(struct kmem_cache *cache, void *object,
}
static inline void poison_slab_object(struct kmem_cache *cache, void *object,
- bool init, bool still_accessible)
+ bool init)
{
void *tagged_object = object;
object = kasan_reset_tag(object);
- /* RCU slabs could be legally used after free within the RCU period. */
- if (unlikely(still_accessible))
- return;
-
kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
KASAN_SLAB_FREE, init);
@@ -261,7 +257,22 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
if (!kasan_arch_is_ready() || is_kfence_address(object))
return false;
- poison_slab_object(cache, object, init, still_accessible);
+ /*
+ * If this point is reached with an object that must still be
+ * accessible under RCU, we can't poison it; in that case, also skip the
+ * quarantine. This should mostly only happen when CONFIG_SLUB_RCU_DEBUG
+ * has been disabled manually.
+ *
+ * Putting the object on the quarantine wouldn't help catch UAFs (since
+ * we can't poison it here), and it would mask bugs caused by
+ * SLAB_TYPESAFE_BY_RCU users not being careful enough about object
+ * reuse; so overall, putting the object into the quarantine here would
+ * be counterproductive.
+ */
+ if (still_accessible)
+ return false;
+
+ poison_slab_object(cache, object, init);
/*
* If the object is put into quarantine, do not let slab put the object
@@ -519,7 +530,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
if (check_slab_allocation(slab->slab_cache, ptr, ip))
return false;
- poison_slab_object(slab->slab_cache, ptr, false, false);
+ poison_slab_object(slab->slab_cache, ptr, false);
return true;
}
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 8b9e348113b1..d54e89f8c3e7 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -524,7 +524,11 @@ size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object)
sizeof(struct kasan_free_meta) : 0);
}
-static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags)
+/*
+ * This function avoids dynamic memory allocations and thus can be called from
+ * contexts that do not allow allocating memory.
+ */
+void kasan_record_aux_stack(void *addr)
{
struct slab *slab = kasan_addr_to_slab(addr);
struct kmem_cache *cache;
@@ -541,17 +545,7 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags)
return;
alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
- alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags);
-}
-
-void kasan_record_aux_stack(void *addr)
-{
- return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC);
-}
-
-void kasan_record_aux_stack_noalloc(void *addr)
-{
- return __kasan_record_aux_stack(addr, 0);
+ alloc_meta->aux_stack[0] = kasan_save_stack(0, 0);
}
void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index ccd66c7a4081..9a6927394b54 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -16,6 +16,7 @@
#include <linux/mm.h>
#include <linux/static_key.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/types.h>
#include <linux/vmalloc.h>
@@ -263,8 +264,8 @@ void __init kasan_init_hw_tags(void)
pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n",
kasan_mode_info(),
- kasan_vmalloc_enabled() ? "on" : "off",
- kasan_stack_collection_enabled() ? "on" : "off");
+ str_on_off(kasan_vmalloc_enabled()),
+ str_on_off(kasan_stack_collection_enabled()));
}
#ifdef CONFIG_KASAN_VMALLOC
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index ced6b29fcf76..8fce3370c84e 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -13,9 +13,9 @@
#include <linux/mm.h>
#include <linux/pfn.h>
#include <linux/slab.h>
+#include <linux/pgalloc.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include "kasan.h"
@@ -191,7 +191,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
pud_t *pud;
pmd_t *pmd;
- p4d_populate(&init_mm, p4d,
+ p4d_populate_kernel(addr, p4d,
lm_alias(kasan_early_shadow_pud));
pud = pud_offset(p4d, addr);
pud_populate(&init_mm, pud,
@@ -212,7 +212,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
} else {
p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
pud_init(p);
- p4d_populate(&init_mm, p4d, p);
+ p4d_populate_kernel(addr, p4d, p);
}
}
zero_pud_populate(p4d, addr, next);
@@ -251,10 +251,10 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
* puds,pmds, so pgd_populate(), pud_populate()
* is noops.
*/
- pgd_populate(&init_mm, pgd,
+ pgd_populate_kernel(addr, pgd,
lm_alias(kasan_early_shadow_p4d));
p4d = p4d_offset(pgd, addr);
- p4d_populate(&init_mm, p4d,
+ p4d_populate_kernel(addr, p4d,
lm_alias(kasan_early_shadow_pud));
pud = pud_offset(p4d, addr);
pud_populate(&init_mm, pud,
@@ -273,7 +273,7 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
if (!p)
return -ENOMEM;
} else {
- pgd_populate(&init_mm, pgd,
+ pgd_populate_kernel(addr, pgd,
early_alloc(PAGE_SIZE, NUMA_NO_NODE));
}
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index b7e4b81421b3..129178be5e64 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -501,18 +501,18 @@ static inline bool kasan_byte_accessible(const void *addr)
/**
* kasan_poison - mark the memory range as inaccessible
- * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
- * @size - range size, must be aligned to KASAN_GRANULE_SIZE
- * @value - value that's written to metadata for the range
- * @init - whether to initialize the memory range (only for hardware tag-based)
+ * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE
+ * @size: range size, must be aligned to KASAN_GRANULE_SIZE
+ * @value: value that's written to metadata for the range
+ * @init: whether to initialize the memory range (only for hardware tag-based)
*/
void kasan_poison(const void *addr, size_t size, u8 value, bool init);
/**
* kasan_unpoison - mark the memory range as accessible
- * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
- * @size - range size, can be unaligned
- * @init - whether to initialize the memory range (only for hardware tag-based)
+ * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE
+ * @size: range size, can be unaligned
+ * @init: whether to initialize the memory range (only for hardware tag-based)
*
* For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before
* marking the range.
@@ -530,8 +530,8 @@ bool kasan_byte_accessible(const void *addr);
/**
* kasan_poison_last_granule - mark the last granule of the memory range as
* inaccessible
- * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
- * @size - range size
+ * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE
+ * @size: range size
*
* This function is only available for the generic mode, as it's the only mode
* that has partially poisoned memory granules.
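The hunks above only touch presentation: kernel-doc expects each parameter to be introduced as '@name:' (matching the actual parameter name) rather than '@name -'. A minimal, hypothetical comment in the expected layout:

/**
 * frob_buffer - poison a buffer for testing
 * @buf: start of the buffer, must not be NULL
 * @len: buffer length in bytes
 *
 * Return: 0 on success, a negative errno on failure.
 */
int frob_buffer(void *buf, size_t len);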
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index 99d4ff0ed57a..f4b17984b627 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -47,8 +47,8 @@ static struct {
* Some tests use these global variables to store return values from function
* calls that could otherwise be eliminated by the compiler as dead code.
*/
-void *kasan_ptr_result;
-int kasan_int_result;
+static void *volatile kasan_ptr_result;
+static volatile int kasan_int_result;
/* Probe for console output: obtains test_status lines of interest. */
static void probe_console(void *ignore, const char *buf, size_t len)
@@ -1073,14 +1073,11 @@ static void kmem_cache_rcu_uaf(struct kunit *test)
kmem_cache_destroy(cache);
}
-static void empty_cache_ctor(void *object) { }
-
static void kmem_cache_double_destroy(struct kunit *test)
{
struct kmem_cache *cache;
- /* Provide a constructor to prevent cache merging. */
- cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor);
+ cache = kmem_cache_create("test_cache", 200, 0, SLAB_NO_MERGE, NULL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
kmem_cache_destroy(cache);
KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache));
@@ -1570,6 +1567,7 @@ static void kasan_memcmp(struct kunit *test)
static void kasan_strings(struct kunit *test)
{
char *ptr;
+ char *src;
size_t size = 24;
/*
@@ -1580,7 +1578,28 @@ static void kasan_strings(struct kunit *test)
ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ OPTIMIZER_HIDE_VAR(ptr);
+
+ src = kmalloc(KASAN_GRANULE_SIZE, GFP_KERNEL | __GFP_ZERO);
+ strscpy(src, "f0cacc1a0000000", KASAN_GRANULE_SIZE);
+ OPTIMIZER_HIDE_VAR(src);
+ /*
+ * Make sure that strscpy() does not trigger KASAN if it overreads into
+ * poisoned memory.
+ *
+ * The expected return value does not include the terminating '\0',
+ * so it is (KASAN_GRANULE_SIZE - 2): one byte for the skipped
+ * leading source character and one for the '\0'.
+ */
+ KUNIT_EXPECT_EQ(test, KASAN_GRANULE_SIZE - 2,
+ strscpy(ptr, src + 1, KASAN_GRANULE_SIZE));
+
+ /* strscpy should fail if the first byte is unreadable. */
+ KUNIT_EXPECT_KASAN_FAIL(test, strscpy(ptr, src + KASAN_GRANULE_SIZE,
+ KASAN_GRANULE_SIZE));
+
+ kfree(src);
kfree(ptr);
/*
@@ -1960,6 +1979,11 @@ static void rust_uaf(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, kasan_test_rust_uaf());
}
+/*
+ * copy_to_kernel_nofault() is an unexported internal helper, so this
+ * test is only available when the KASAN test suite is built-in.
+ */
+#ifndef MODULE
static void copy_to_kernel_nofault_oob(struct kunit *test)
{
char *ptr;
@@ -1994,6 +2018,7 @@ static void copy_to_kernel_nofault_oob(struct kunit *test)
kfree(ptr);
}
+#endif /* !MODULE */
static void copy_user_test_oob(struct kunit *test)
{
@@ -2114,7 +2139,9 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(match_all_not_assigned),
KUNIT_CASE(match_all_ptr_tag),
KUNIT_CASE(match_all_mem_tag),
+#ifndef MODULE
KUNIT_CASE(copy_to_kernel_nofault_oob),
+#endif
KUNIT_CASE(rust_uaf),
KUNIT_CASE(copy_user_test_oob),
{}
@@ -2130,4 +2157,5 @@ static struct kunit_suite kasan_kunit_test_suite = {
kunit_test_suite(kasan_kunit_test_suite);
+MODULE_DESCRIPTION("KUnit tests for checking KASAN bug-detection capabilities");
MODULE_LICENSE("GPL");
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 3fe77a360f1c..62c01b4527eb 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -399,17 +399,10 @@ static void print_address_description(void *addr, u8 tag,
}
if (is_vmalloc_addr(addr)) {
- struct vm_struct *va = find_vm_area(addr);
-
- if (va) {
- pr_err("The buggy address belongs to the virtual mapping at\n"
- " [%px, %px) created by:\n"
- " %pS\n",
- va->addr, va->addr + va->size, va->caller);
- pr_err("\n");
-
- page = vmalloc_to_page(addr);
- }
+ pr_err("The buggy address belongs to a");
+ if (!vmalloc_dump_obj(addr))
+ pr_cont(" vmalloc virtual mapping\n");
+ page = vmalloc_to_page(addr);
}
if (page) {
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 88d1c9dcb507..11d472a5c4e8 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -292,34 +292,118 @@ void __init __weak kasan_populate_early_vm_area_shadow(void *start,
{
}
+struct vmalloc_populate_data {
+ unsigned long start;
+ struct page **pages;
+};
+
static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
- void *unused)
+ void *_data)
{
- unsigned long page;
+ struct vmalloc_populate_data *data = _data;
+ struct page *page;
pte_t pte;
+ int index;
- if (likely(!pte_none(ptep_get(ptep))))
- return 0;
-
- page = __get_free_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
+ arch_leave_lazy_mmu_mode();
- __memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
- pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
+ index = PFN_DOWN(addr - data->start);
+ page = data->pages[index];
+ __memset(page_to_virt(page), KASAN_VMALLOC_INVALID, PAGE_SIZE);
+ pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL);
spin_lock(&init_mm.page_table_lock);
if (likely(pte_none(ptep_get(ptep)))) {
set_pte_at(&init_mm, addr, ptep, pte);
- page = 0;
+ data->pages[index] = NULL;
}
spin_unlock(&init_mm.page_table_lock);
- if (page)
- free_page(page);
+
+ arch_enter_lazy_mmu_mode();
+
return 0;
}
-int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
+static void ___free_pages_bulk(struct page **pages, int nr_pages)
+{
+ int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ __free_pages(pages[i], 0);
+ pages[i] = NULL;
+ }
+ }
+}
+
+static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask)
+{
+ unsigned long nr_populated, nr_total = nr_pages;
+ struct page **page_array = pages;
+
+ while (nr_pages) {
+ nr_populated = alloc_pages_bulk(gfp_mask, nr_pages, pages);
+ if (!nr_populated) {
+ ___free_pages_bulk(page_array, nr_total - nr_pages);
+ return -ENOMEM;
+ }
+ pages += nr_populated;
+ nr_pages -= nr_populated;
+ }
+
+ return 0;
+}
+
+static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask)
+{
+ unsigned long nr_pages, nr_total = PFN_UP(end - start);
+ struct vmalloc_populate_data data;
+ unsigned int flags;
+ int ret = 0;
+
+ data.pages = (struct page **)__get_free_page(gfp_mask | __GFP_ZERO);
+ if (!data.pages)
+ return -ENOMEM;
+
+ while (nr_total) {
+ nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0]));
+ ret = ___alloc_pages_bulk(data.pages, nr_pages, gfp_mask);
+ if (ret)
+ break;
+
+ data.start = start;
+
+ /*
+ * Page table allocations ignore the external gfp mask; enforce it
+ * via the scope API.
+ */
+ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+ flags = memalloc_nofs_save();
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+ flags = memalloc_noio_save();
+
+ ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE,
+ kasan_populate_vmalloc_pte, &data);
+
+ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+ memalloc_nofs_restore(flags);
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+ memalloc_noio_restore(flags);
+
+ ___free_pages_bulk(data.pages, nr_pages);
+ if (ret)
+ break;
+
+ start += nr_pages * PAGE_SIZE;
+ nr_total -= nr_pages;
+ }
+
+ free_page((unsigned long)data.pages);
+
+ return ret;
+}
+
+int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask)
{
unsigned long shadow_start, shadow_end;
int ret;
@@ -348,9 +432,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
shadow_start = PAGE_ALIGN_DOWN(shadow_start);
shadow_end = PAGE_ALIGN(shadow_end);
- ret = apply_to_page_range(&init_mm, shadow_start,
- shadow_end - shadow_start,
- kasan_populate_vmalloc_pte, NULL);
+ ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask);
if (ret)
return ret;
@@ -397,18 +479,23 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
void *unused)
{
- unsigned long page;
+ pte_t pte;
+ int none;
- page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT);
+ arch_leave_lazy_mmu_mode();
spin_lock(&init_mm.page_table_lock);
-
- if (likely(!pte_none(ptep_get(ptep)))) {
+ pte = ptep_get(ptep);
+ none = pte_none(pte);
+ if (likely(!none))
pte_clear(&init_mm, addr, ptep);
- free_page(page);
- }
spin_unlock(&init_mm.page_table_lock);
+ if (likely(!none))
+ __free_page(pfn_to_page(pte_pfn(pte)));
+
+ arch_enter_lazy_mmu_mode();
+
return 0;
}
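The memalloc_nofs_save()/memalloc_noio_save() calls above use the allocation scope API: a flag on current restricts every nested allocation, including page-table allocations that never see the caller's gfp_mask. A hedged sketch of the general pattern (illustrative caller, not taken from this patch):

#include <linux/sched/mm.h>
#include <linux/slab.h>

static void *alloc_under_nofs_scope(size_t size)
{
	unsigned int flags;
	void *p;

	/*
	 * Everything allocated inside this section behaves as if
	 * __GFP_FS were cleared, even allocations made by helpers
	 * that ignore our gfp argument.
	 */
	flags = memalloc_nofs_save();
	p = kmalloc(size, GFP_KERNEL);
	memalloc_nofs_restore(flags);

	return p;
}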
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index 220b5d4c6876..b9382b5b6a37 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -26,6 +26,7 @@
#include <linux/slab.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/types.h>
#include <linux/vmalloc.h>
#include <linux/bug.h>
@@ -45,7 +46,7 @@ void __init kasan_init_sw_tags(void)
kasan_init_tags();
pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n",
- kasan_stack_collection_enabled() ? "on" : "off");
+ str_on_off(kasan_stack_collection_enabled()));
}
/*
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 102048821c22..0ed3be100963 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -605,8 +605,8 @@ static unsigned long kfence_init_pool(void)
pages = virt_to_page(__kfence_pool);
/*
- * Set up object pages: they must have PG_slab set, to avoid freeing
- * these as real pages.
+ * Set up object pages: they must have PGTY_slab set to avoid freeing
+ * them as real pages.
*
* We also want to avoid inserting kfence_free() in the kfree()
* fast-path in SLUB, and therefore need to ensure kfree() correctly
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index f65fb182466d..00034e37bc9f 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -20,6 +20,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/tracepoint.h>
#include <trace/events/printk.h>
@@ -88,7 +89,7 @@ struct expect_report {
static const char *get_access_type(const struct expect_report *r)
{
- return r->is_write ? "write" : "read";
+ return str_write_read(r->is_write);
}
/* Check observed report matches information in @r. */
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index 6370c5207d1a..10e6802a2edf 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -16,6 +16,7 @@
#include <linux/sprintf.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/sched/clock.h>
#include <trace/events/error_report.h>
@@ -184,7 +185,7 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show,
static const char *get_access_type(bool is_write)
{
- return is_write ? "write" : "read";
+ return str_write_read(is_write);
}
void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index bad1e130eda8..b486c1d19b2d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -347,7 +347,7 @@ struct attribute_group khugepaged_attr_group = {
#endif /* CONFIG_SYSFS */
int hugepage_madvise(struct vm_area_struct *vma,
- unsigned long *vm_flags, int advice)
+ vm_flags_t *vm_flags, int advice)
{
switch (advice) {
case MADV_HUGEPAGE:
@@ -470,7 +470,7 @@ void __khugepaged_enter(struct mm_struct *mm)
}
void khugepaged_enter_vma(struct vm_area_struct *vma,
- unsigned long vm_flags)
+ vm_flags_t vm_flags)
{
if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
hugepage_pmd_enabled()) {
@@ -548,19 +548,6 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
}
}
-static bool is_refcount_suitable(struct folio *folio)
-{
- int expected_refcount = folio_mapcount(folio);
-
- if (!folio_test_anon(folio) || folio_test_swapcache(folio))
- expected_refcount += folio_nr_pages(folio);
-
- if (folio_test_private(folio))
- expected_refcount++;
-
- return folio_ref_count(folio) == expected_refcount;
-}
-
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte,
@@ -607,7 +594,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
/* See hpage_collapse_scan_pmd(). */
- if (folio_likely_mapped_shared(folio)) {
+ if (folio_maybe_mapped_shared(folio)) {
++shared;
if (cc->is_khugepaged &&
shared > khugepaged_max_ptes_shared) {
@@ -652,7 +639,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* but not from this process. The other process cannot write to
* the page, only trigger CoW.
*/
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
folio_unlock(folio);
result = SCAN_PAGE_COUNT;
goto out;
@@ -696,13 +683,13 @@ next:
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
+ trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
referenced, writable, result);
return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
- trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
+ trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
referenced, writable, result);
return result;
}
@@ -713,12 +700,15 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
spinlock_t *ptl,
struct list_head *compound_pagelist)
{
+ unsigned long end = address + HPAGE_PMD_SIZE;
struct folio *src, *tmp;
- pte_t *_pte;
pte_t pteval;
+ pte_t *_pte;
+ unsigned int nr_ptes;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, address += PAGE_SIZE) {
+ for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
+ address += nr_ptes * PAGE_SIZE) {
+ nr_ptes = 1;
pteval = ptep_get(_pte);
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
@@ -735,18 +725,26 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
struct page *src_page = pte_page(pteval);
src = page_folio(src_page);
- if (!folio_test_large(src))
+
+ if (folio_test_large(src)) {
+ unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT;
+
+ nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes);
+ } else {
release_pte_folio(src);
+ }
+
/*
* ptl mostly unnecessary, but preempt has to
* be disabled to update the per-cpu stats
* inside folio_remove_rmap_pte().
*/
spin_lock(ptl);
- ptep_clear(vma->vm_mm, address, _pte);
- folio_remove_rmap_pte(src, src_page, vma);
+ clear_ptes(vma->vm_mm, address, _pte, nr_ptes);
+ folio_remove_rmap_ptes(src, src_page, nr_ptes, vma);
spin_unlock(ptl);
- free_page_and_swap_cache(src_page);
+ free_swap_cache(src);
+ folio_put_refs(src, nr_ptes);
}
}
@@ -948,30 +946,40 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
return SCAN_SUCCEED;
}
-static int find_pmd_or_thp_or_none(struct mm_struct *mm,
- unsigned long address,
- pmd_t **pmd)
+static inline int check_pmd_state(pmd_t *pmd)
{
- pmd_t pmde;
-
- *pmd = mm_find_pmd(mm, address);
- if (!*pmd)
- return SCAN_PMD_NULL;
+ pmd_t pmde = pmdp_get_lockless(pmd);
- pmde = pmdp_get_lockless(*pmd);
if (pmd_none(pmde))
return SCAN_PMD_NONE;
+
+ /*
+ * The folio may be under migration when khugepaged is trying to
+ * collapse it. Migration success or failure will eventually end
+ * up with a present PMD mapping a folio again.
+ */
+ if (is_pmd_migration_entry(pmde))
+ return SCAN_PMD_MAPPED;
if (!pmd_present(pmde))
return SCAN_PMD_NULL;
if (pmd_trans_huge(pmde))
return SCAN_PMD_MAPPED;
- if (pmd_devmap(pmde))
- return SCAN_PMD_NULL;
if (pmd_bad(pmde))
return SCAN_PMD_NULL;
return SCAN_SUCCEED;
}
+static int find_pmd_or_thp_or_none(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t **pmd)
+{
+ *pmd = mm_find_pmd(mm, address);
+ if (!*pmd)
+ return SCAN_PMD_NULL;
+
+ return check_pmd_state(*pmd);
+}
+
static int check_pmd_still_valid(struct mm_struct *mm,
unsigned long address,
pmd_t *pmd)
@@ -1164,11 +1172,11 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
+ vma_start_write(vma);
result = check_pmd_still_valid(mm, address, pmd);
if (result != SCAN_SUCCEED)
goto out_up_write;
- vma_start_write(vma);
anon_vma_lock_write(vma->anon_vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
@@ -1235,7 +1243,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
__folio_mark_uptodate(folio);
pgtable = pmd_pgtable(_pmd);
- _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+ _pmd = folio_mk_pmd(folio, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
spin_lock(pmd_ptl);
@@ -1355,11 +1363,9 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
/*
* We treat a single page as shared if any part of the THP
- * is shared. "False negatives" from
- * folio_likely_mapped_shared() are not expected to matter
- * much in practice.
+ * is shared.
*/
- if (folio_likely_mapped_shared(folio)) {
+ if (folio_maybe_mapped_shared(folio)) {
++shared;
if (cc->is_khugepaged &&
shared > khugepaged_max_ptes_shared) {
@@ -1400,7 +1406,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* has excessive GUP pins (i.e. 512). Anyway the same check
* will be done again later the risk seems low.
*/
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
@@ -1411,8 +1417,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
*/
if (cc->is_khugepaged &&
(pte_young(pteval) || folio_test_young(folio) ||
- folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
- address)))
+ folio_test_referenced(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, _address)))
referenced++;
}
if (!writable) {
@@ -1433,7 +1439,7 @@ out_unmap:
*mmap_locked = false;
}
out:
- trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
+ trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced,
none_or_zero, result, unmapped);
return result;
}
@@ -1462,10 +1468,9 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
}
}
-#ifdef CONFIG_SHMEM
-/* hpage must be locked, and mmap_lock must be held */
+/* folio must be locked, and mmap_lock must be held */
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmdp, struct page *hpage)
+ pmd_t *pmdp, struct folio *folio, struct page *page)
{
struct vm_fault vmf = {
.vma = vma,
@@ -1474,13 +1479,12 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
.pmd = pmdp,
};
- VM_BUG_ON(!PageTransHuge(hpage));
mmap_assert_locked(vma->vm_mm);
- if (do_set_pmd(&vmf, hpage))
+ if (do_set_pmd(&vmf, folio, page))
return SCAN_FAIL;
- get_page(hpage);
+ folio_get(folio);
return SCAN_SUCCEED;
}
@@ -1499,15 +1503,17 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
bool install_pmd)
{
+ int nr_mapped_ptes = 0, result = SCAN_FAIL;
+ unsigned int nr_batch_ptes;
struct mmu_notifier_range range;
bool notified = false;
unsigned long haddr = addr & HPAGE_PMD_MASK;
+ unsigned long end = haddr + HPAGE_PMD_SIZE;
struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct folio *folio;
pte_t *start_pte, *pte;
pmd_t *pmd, pgt_pmd;
spinlock_t *pml = NULL, *ptl;
- int nr_ptes = 0, result = SCAN_FAIL;
int i;
mmap_assert_locked(mm);
@@ -1621,11 +1627,15 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort;
/* step 2: clear page table and adjust rmap */
- for (i = 0, addr = haddr, pte = start_pte;
- i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR;
+ i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE,
+ pte += nr_batch_ptes) {
+ unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT;
struct page *page;
pte_t ptent = ptep_get(pte);
+ nr_batch_ptes = 1;
+
if (pte_none(ptent))
continue;
/*
@@ -1639,26 +1649,29 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort;
}
page = vm_normal_page(vma, addr, ptent);
+
if (folio_page(folio, i) != page)
goto abort;
+ nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes);
+
/*
* Must clear entry, or a racing truncate may re-remove it.
* TLB flush can be left until pmdp_collapse_flush() does it.
* PTE dirty? Shmem page is already dirty; file is read-only.
*/
- ptep_clear(mm, addr, pte);
- folio_remove_rmap_pte(folio, page, vma);
- nr_ptes++;
+ clear_ptes(mm, addr, pte, nr_batch_ptes);
+ folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma);
+ nr_mapped_ptes += nr_batch_ptes;
}
if (!pml)
spin_unlock(ptl);
/* step 3: set proper refcount and mm_counters. */
- if (nr_ptes) {
- folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
+ if (nr_mapped_ptes) {
+ folio_ref_sub(folio, nr_mapped_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
}
/* step 4: remove empty page table */
@@ -1687,14 +1700,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
maybe_install_pmd:
/* step 5: install pmd entry */
result = install_pmd
- ? set_huge_pmd(vma, haddr, pmd, &folio->page)
+ ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page)
: SCAN_SUCCEED;
goto drop_folio;
abort:
- if (nr_ptes) {
+ if (nr_mapped_ptes) {
flush_tlb_mm(mm);
- folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
+ folio_ref_sub(folio, nr_mapped_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
}
unlock:
if (start_pte)
@@ -1721,7 +1734,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
pmd_t *pmd, pgt_pmd;
spinlock_t *pml;
spinlock_t *ptl;
- bool skipped_uffd = false;
+ bool success = false;
/*
* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
@@ -1758,6 +1771,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
mmu_notifier_invalidate_range_start(&range);
pml = pmd_lock(mm, pmd);
+ /*
+ * The lock of new_folio is still held, so we will be blocked in
+ * the page fault path, which prevents the pte entries from
+ * being set again. So even though the old empty PTE page may be
+ * concurrently freed and a new PTE page is filled into the pmd
+ * entry, it is still empty and can be removed.
+ *
+ * So here we only need to recheck if the state of pmd entry
+ * still meets our requirements, rather than checking pmd_same()
+ * like elsewhere.
+ */
+ if (check_pmd_state(pmd) != SCAN_SUCCEED)
+ goto drop_pml;
ptl = pte_lockptr(mm, pmd);
if (ptl != pml)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
@@ -1771,20 +1797,20 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* repeating the anon_vma check protects from one category,
* and repeating the userfaultfd_wp() check from another.
*/
- if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
- skipped_uffd = true;
- } else {
+ if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
pmdp_get_lockless_sync();
+ success = true;
}
if (ptl != pml)
spin_unlock(ptl);
+drop_pml:
spin_unlock(pml);
mmu_notifier_invalidate_range_end(&range);
- if (!skipped_uffd) {
+ if (success) {
mm_dec_nr_ptes(mm);
page_table_check_pte_clear_range(mm, addr, pgt_pmd);
pte_free_defer(mm, pmd_pgtable(pgt_pmd));
@@ -2280,6 +2306,17 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
continue;
}
+ if (!folio_try_get(folio)) {
+ xas_reset(&xas);
+ continue;
+ }
+
+ if (unlikely(folio != xas_reload(&xas))) {
+ folio_put(folio);
+ xas_reset(&xas);
+ continue;
+ }
+
if (folio_order(folio) == HPAGE_PMD_ORDER &&
folio->index == start) {
/* Maybe PMD-mapped */
@@ -2290,23 +2327,27 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
* it's safe to skip LRU and refcount checks before
* returning.
*/
+ folio_put(folio);
break;
}
node = folio_nid(folio);
if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
+ folio_put(folio);
break;
}
cc->node_load[node]++;
if (!folio_test_lru(folio)) {
result = SCAN_PAGE_LRU;
+ folio_put(folio);
break;
}
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
+ folio_put(folio);
break;
}
@@ -2318,6 +2359,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
*/
present += folio_nr_pages(folio);
+ folio_put(folio);
if (need_resched()) {
xas_pause(&xas);
@@ -2339,14 +2381,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
return result;
}
-#else
-static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
- struct file *file, pgoff_t start,
- struct collapse_control *cc)
-{
- BUILD_BUG();
-}
-#endif
static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
struct collapse_control *cc)
@@ -2422,7 +2456,7 @@ skip:
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
+ if (!vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
@@ -2721,8 +2755,8 @@ static int madvise_collapse_errno(enum scan_result r)
}
}
-int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, bool *lock_dropped)
{
struct collapse_control *cc;
struct mm_struct *mm = vma->vm_mm;
@@ -2733,8 +2767,6 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
BUG_ON(vma->vm_start > start);
BUG_ON(vma->vm_end < end);
- *prev = vma;
-
if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
return -EINVAL;
@@ -2768,7 +2800,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
- if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
+ if (!vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma, addr);
@@ -2782,7 +2814,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
&mmap_locked, cc);
}
if (!mmap_locked)
- *prev = NULL; /* Tell caller we dropped mmap_lock */
+ *lock_dropped = true;
handle_result:
switch (result) {
@@ -2792,7 +2824,6 @@ handle_result:
break;
case SCAN_PTE_MAPPED_HUGEPAGE:
BUG_ON(mmap_locked);
- BUG_ON(*prev);
mmap_read_lock(mm);
result = collapse_pte_mapped_thp(mm, addr, true);
mmap_read_unlock(mm);
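The khugepaged changes above repeat one pattern: determine how many consecutive PTEs still map the same large folio, then clear them and adjust rmap and reference counts once per batch rather than once per page. A condensed, hedged sketch of that loop shape, using only the helpers as they appear in this patch (folio_pte_batch(), clear_ptes(), folio_remove_rmap_ptes()):

/* Illustrative only; the loop shape is abstracted from the hunks above. */
for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR;
     i += nr, addr += nr * PAGE_SIZE, pte += nr) {
	pte_t ptent = ptep_get(pte);
	unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
	struct page *page;

	nr = 1;
	if (pte_none(ptent))
		continue;

	/* First subpage of the batch. */
	page = vm_normal_page(vma, addr, ptent);

	/* How many following PTEs map consecutive pages of this folio? */
	nr = folio_pte_batch(folio, pte, ptent, max_nr);

	/* One clear and one rmap update cover the whole batch. */
	clear_ptes(mm, addr, pte, nr);
	folio_remove_rmap_ptes(folio, page, nr, vma);
	nr_mapped += nr;
}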
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index bb7d61fc4da3..1ac56ceb29b6 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -210,13 +210,11 @@ static struct kmem_cache *object_cache;
static struct kmem_cache *scan_area_cache;
/* set if tracing memory operations is enabled */
-static int kmemleak_enabled = 1;
+static int kmemleak_enabled __read_mostly = 1;
/* same as above but only for the kmemleak_free() callback */
-static int kmemleak_free_enabled = 1;
+static int kmemleak_free_enabled __read_mostly = 1;
/* set in the late_initcall if there were no errors */
static int kmemleak_late_initialized;
-/* set if a kmemleak warning was issued */
-static int kmemleak_warning;
/* set if a fatal kmemleak error has occurred */
static int kmemleak_error;
@@ -254,7 +252,6 @@ static void kmemleak_disable(void);
#define kmemleak_warn(x...) do { \
pr_warn(x); \
dump_stack(); \
- kmemleak_warning = 1; \
} while (0)
/*
@@ -325,8 +322,6 @@ static void hex_dump_object(struct seq_file *seq,
* sufficient references to it (count >= min_count)
* - black - ignore, it doesn't contain references (e.g. text section)
* (min_count == -1). No function defined for this color.
- * Newly created objects don't have any color assigned (object->count == -1)
- * before the next memory scan when they become white.
*/
static bool color_white(const struct kmemleak_object *object)
{
@@ -352,6 +347,15 @@ static bool unreferenced_object(struct kmemleak_object *object)
jiffies_last_scan);
}
+static const char *__object_type_str(struct kmemleak_object *object)
+{
+ if (object->flags & OBJECT_PHYS)
+ return " (phys)";
+ if (object->flags & OBJECT_PERCPU)
+ return " (percpu)";
+ return "";
+}
+
/*
* Printing of the unreferenced objects information to the seq file. The
* print_unreferenced function must be called with the object->lock held.
@@ -364,8 +368,9 @@ static void print_unreferenced(struct seq_file *seq,
unsigned int nr_entries;
nr_entries = stack_depot_fetch(object->trace_handle, &entries);
- warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
- object->pointer, object->size);
+ warn_or_seq_printf(seq, "unreferenced object%s 0x%08lx (size %zu):\n",
+ __object_type_str(object),
+ object->pointer, object->size);
warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n",
object->comm, object->pid, object->jiffies);
hex_dump_object(seq, object);
@@ -384,10 +389,10 @@ static void print_unreferenced(struct seq_file *seq,
*/
static void dump_object_info(struct kmemleak_object *object)
{
- pr_notice("Object 0x%08lx (size %zu):\n",
- object->pointer, object->size);
+ pr_notice("Object%s 0x%08lx (size %zu):\n",
+ __object_type_str(object), object->pointer, object->size);
pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
- object->comm, object->pid, object->jiffies);
+ object->comm, object->pid, object->jiffies);
pr_notice(" min_count = %d\n", object->min_count);
pr_notice(" count = %d\n", object->count);
pr_notice(" flags = 0x%x\n", object->flags);
@@ -432,9 +437,15 @@ static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias,
else if (untagged_objp == untagged_ptr || alias)
return object;
else {
+ /*
+ * Defer printk() while kmemleak_lock is held to avoid
+ * deadlock.
+ */
+ printk_deferred_enter();
kmemleak_warn("Found object by alias at 0x%08lx\n",
ptr);
dump_object_info(object);
+ printk_deferred_exit();
break;
}
}
@@ -465,6 +476,7 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
{
unsigned long flags;
struct kmemleak_object *object;
+ bool warn = false;
/* try the slab allocator first */
if (object_cache) {
@@ -483,8 +495,10 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
else if (mem_pool_free_count)
object = &mem_pool[--mem_pool_free_count];
else
- pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
+ warn = true;
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
+ if (warn)
+ pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
return object;
}
@@ -728,6 +742,11 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
else if (untagged_objp + parent->size <= untagged_ptr)
link = &parent->rb_node.rb_right;
else {
+ /*
+ * Defer printk() while kmemleak_lock is held to avoid
+ * deadlock.
+ */
+ printk_deferred_enter();
kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",
ptr);
/*
@@ -735,6 +754,7 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
* be freed while the kmemleak_lock is held.
*/
dump_object_info(parent);
+ printk_deferred_exit();
return -EEXIST;
}
}
@@ -848,13 +868,8 @@ static void delete_object_part(unsigned long ptr, size_t size,
raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = __find_and_remove_object(ptr, 1, objflags);
- if (!object) {
-#ifdef DEBUG
- kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
- ptr, size);
-#endif
+ if (!object)
goto unlock;
- }
/*
* Create one or two objects that may result from the memory block
@@ -874,8 +889,14 @@ static void delete_object_part(unsigned long ptr, size_t size,
unlock:
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
- if (object)
+ if (object) {
__delete_object(object);
+ } else {
+#ifdef DEBUG
+ kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
+ ptr, size);
+#endif
+ }
out:
if (object_l)
@@ -1242,6 +1263,20 @@ void __ref kmemleak_transient_leak(const void *ptr)
EXPORT_SYMBOL(kmemleak_transient_leak);
/**
+ * kmemleak_ignore_percpu - similar to kmemleak_ignore but taking a percpu
+ * address argument
+ * @ptr: percpu address of the object
+ */
+void __ref kmemleak_ignore_percpu(const void __percpu *ptr)
+{
+ pr_debug("%s(0x%px)\n", __func__, ptr);
+
+ if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr))
+ make_black_object((unsigned long)ptr, OBJECT_PERCPU);
+}
+EXPORT_SYMBOL_GPL(kmemleak_ignore_percpu);
+
+/**
* kmemleak_ignore - ignore an allocated object
* @ptr: pointer to beginning of the object
*
@@ -1855,7 +1890,7 @@ static int kmemleak_scan_thread(void *arg)
* Wait before the first scan to allow the system to fully initialize.
*/
if (first_run) {
- signed long timeout = msecs_to_jiffies(SECS_FIRST_SCAN * 1000);
+ signed long timeout = secs_to_jiffies(SECS_FIRST_SCAN);
first_run = 0;
while (timeout && !kthread_should_stop())
timeout = schedule_timeout_interruptible(timeout);
@@ -1998,25 +2033,41 @@ static int kmemleak_open(struct inode *inode, struct file *file)
return seq_open(file, &kmemleak_seq_ops);
}
-static int dump_str_object_info(const char *str)
+static bool __dump_str_object_info(unsigned long addr, unsigned int objflags)
{
unsigned long flags;
struct kmemleak_object *object;
+
+ object = __find_and_get_object(addr, 1, objflags);
+ if (!object)
+ return false;
+
+ raw_spin_lock_irqsave(&object->lock, flags);
+ dump_object_info(object);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+
+ put_object(object);
+
+ return true;
+}
+
+static int dump_str_object_info(const char *str)
+{
unsigned long addr;
+ bool found = false;
if (kstrtoul(str, 0, &addr))
return -EINVAL;
- object = find_and_get_object(addr, 0);
- if (!object) {
+
+ found |= __dump_str_object_info(addr, 0);
+ found |= __dump_str_object_info(addr, OBJECT_PHYS);
+ found |= __dump_str_object_info(addr, OBJECT_PERCPU);
+
+ if (!found) {
pr_info("Unknown object at 0x%08lx\n", addr);
return -EINVAL;
}
- raw_spin_lock_irqsave(&object->lock, flags);
- dump_object_info(object);
- raw_spin_unlock_irqrestore(&object->lock, flags);
-
- put_object(object);
return 0;
}
@@ -2146,6 +2197,7 @@ static const struct file_operations kmemleak_fops = {
static void __kmemleak_do_cleanup(void)
{
struct kmemleak_object *object, *tmp;
+ unsigned int cnt = 0;
/*
* Kmemleak has already been disabled, no need for RCU list traversal
@@ -2154,6 +2206,10 @@ static void __kmemleak_do_cleanup(void)
list_for_each_entry_safe(object, tmp, &object_list, object_list) {
__remove_object(object);
__delete_object(object);
+
+ /* Call cond_resched() once per 64 iterations to avoid soft lockup */
+ if (!(++cnt & 0x3f))
+ cond_resched();
}
}
@@ -2241,7 +2297,7 @@ void __init kmemleak_init(void)
return;
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
- jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
+ jiffies_scan_wait = secs_to_jiffies(SECS_SCAN_WAIT);
object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
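The printk_deferred_enter()/printk_deferred_exit() pairs added above keep console output from being flushed while kmemleak_lock is held, since that flush could itself allocate or take a lock that nests badly with kmemleak. A hedged, generic sketch of the pattern:

#include <linux/printk.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(demo_lock);

static void warn_while_locked(unsigned long ptr)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&demo_lock, flags);

	/* Buffer the message; it is flushed outside this critical section. */
	printk_deferred_enter();
	pr_warn("suspicious pointer 0x%08lx\n", ptr);
	printk_deferred_exit();

	raw_spin_unlock_irqrestore(&demo_lock, flags);
}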
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
index a495debf1436..35ceaa8adb41 100644
--- a/mm/kmsan/core.c
+++ b/mm/kmsan/core.c
@@ -72,9 +72,6 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0);
- /* Don't sleep. */
- flags &= ~(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM);
-
handle = stack_depot_save(entries, nr_entries, flags);
return stack_depot_set_extra_bits(handle, extra);
}
@@ -159,8 +156,8 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id)
* Make sure we have enough spare bits in @id to hold the UAF bit and
* the chain depth.
*/
- BUILD_BUG_ON(
- (1 << STACK_DEPOT_EXTRA_BITS) <= (KMSAN_MAX_ORIGIN_DEPTH << 1));
+ BUILD_BUG_ON((1 << STACK_DEPOT_EXTRA_BITS) <=
+ (KMSAN_MAX_ORIGIN_DEPTH << 1));
extra_bits = stack_depot_get_extra_bits(id);
depth = kmsan_depth_from_eb(extra_bits);
@@ -195,7 +192,8 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
u32 origin, bool checked)
{
u64 address = (u64)addr;
- u32 *shadow_start, *origin_start;
+ void *shadow_start;
+ u32 *aligned_shadow, *origin_start;
size_t pad = 0;
KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size));
@@ -214,9 +212,12 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
}
__memset(shadow_start, b, size);
- if (!IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) {
+ if (IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) {
+ aligned_shadow = shadow_start;
+ } else {
pad = address % KMSAN_ORIGIN_SIZE;
address -= pad;
+ aligned_shadow = shadow_start - pad;
size += pad;
}
size = ALIGN(size, KMSAN_ORIGIN_SIZE);
@@ -230,7 +231,7 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
* corresponding shadow slot is zero.
*/
for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++) {
- if (origin || !shadow_start[i])
+ if (origin || !aligned_shadow[i])
origin_start[i] = origin;
}
}
@@ -274,11 +275,9 @@ void kmsan_internal_check_memory(void *addr, size_t size,
* bytes before, report them.
*/
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size,
cur_off_start, pos - 1, user_addr,
reason);
- kmsan_leave_runtime();
}
cur_origin = 0;
cur_off_start = -1;
@@ -292,11 +291,9 @@ void kmsan_internal_check_memory(void *addr, size_t size,
* poisoned bytes before, report them.
*/
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size,
cur_off_start, pos + i - 1,
user_addr, reason);
- kmsan_leave_runtime();
}
cur_origin = 0;
cur_off_start = -1;
@@ -312,11 +309,9 @@ void kmsan_internal_check_memory(void *addr, size_t size,
*/
if (cur_origin != new_origin) {
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size,
cur_off_start, pos + i - 1,
user_addr, reason);
- kmsan_leave_runtime();
}
cur_origin = new_origin;
cur_off_start = pos + i;
@@ -326,10 +321,8 @@ void kmsan_internal_check_memory(void *addr, size_t size,
}
KMSAN_WARN_ON(pos != size);
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1,
user_addr, reason);
- kmsan_leave_runtime();
}
}
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 3df45c25c1f6..92ebc0f557d0 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -84,7 +84,8 @@ void kmsan_slab_free(struct kmem_cache *s, void *object)
if (s->ctor)
return;
kmsan_enter_runtime();
- kmsan_internal_poison_memory(object, s->object_size, GFP_KERNEL,
+ kmsan_internal_poison_memory(object, s->object_size,
+ GFP_KERNEL & ~(__GFP_RECLAIM),
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
}
@@ -114,9 +115,8 @@ void kmsan_kfree_large(const void *ptr)
kmsan_enter_runtime();
page = virt_to_head_page((void *)ptr);
KMSAN_WARN_ON(ptr != page_address(page));
- kmsan_internal_poison_memory((void *)ptr,
- page_size(page),
- GFP_KERNEL,
+ kmsan_internal_poison_memory((void *)ptr, page_size(page),
+ GFP_KERNEL & ~(__GFP_RECLAIM),
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
}
@@ -277,8 +277,10 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy,
* Don't check anything, just copy the shadow of the copied
* bytes.
*/
+ kmsan_enter_runtime();
kmsan_internal_memmove_metadata((void *)to, (void *)from,
to_copy - left);
+ kmsan_leave_runtime();
}
user_access_restore(ua_flags);
}
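The gfp change above makes the poisoning path use the same non-sleeping mask that kmsan_save_stack_with_flags() used to apply internally (see the hunk removed from mm/kmsan/core.c): __GFP_RECLAIM is the union of direct and kswapd reclaim, so clearing it from GFP_KERNEL yields an allocation request that will not sleep for reclaim. A short sketch assuming only the standard gfp flag definitions:

#include <linux/gfp.h>

static inline gfp_t nonblocking_gfp(gfp_t flags)
{
	/*
	 * __GFP_RECLAIM == __GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM,
	 * so this is equivalent to the mask the stack-depot path used
	 * to apply on behalf of its callers.
	 */
	return flags & ~(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM);
}

/* GFP_KERNEL & ~__GFP_RECLAIM and nonblocking_gfp(GFP_KERNEL) are identical. */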
diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
index 10f52c085e6c..b14ce3417e65 100644
--- a/mm/kmsan/init.c
+++ b/mm/kmsan/init.c
@@ -35,8 +35,7 @@ static void __init kmsan_record_future_shadow_range(void *start, void *end)
KMSAN_WARN_ON(future_index == NUM_FUTURE_RANGES);
KMSAN_WARN_ON((nstart >= nend) ||
/* Virtual address 0 is valid on s390. */
- (!IS_ENABLED(CONFIG_S390) && !nstart) ||
- !nend);
+ (!IS_ENABLED(CONFIG_S390) && !nstart) || !nend);
nstart = ALIGN_DOWN(nstart, PAGE_SIZE);
nend = ALIGN(nend, PAGE_SIZE);
diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c
index 02a405e55d6c..69f0a57a401c 100644
--- a/mm/kmsan/instrumentation.c
+++ b/mm/kmsan/instrumentation.c
@@ -312,13 +312,9 @@ EXPORT_SYMBOL(__msan_unpoison_alloca);
void __msan_warning(u32 origin);
void __msan_warning(u32 origin)
{
- if (!kmsan_enabled || kmsan_in_runtime())
- return;
- kmsan_enter_runtime();
kmsan_report(origin, /*address*/ NULL, /*size*/ 0,
/*off_first*/ 0, /*off_last*/ 0, /*user_addr*/ NULL,
REASON_ANY);
- kmsan_leave_runtime();
}
EXPORT_SYMBOL(__msan_warning);
diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h
index 29555a8bc315..bc3d1810f352 100644
--- a/mm/kmsan/kmsan.h
+++ b/mm/kmsan/kmsan.h
@@ -121,7 +121,6 @@ static __always_inline void kmsan_leave_runtime(void)
KMSAN_WARN_ON(--ctx->kmsan_in_runtime);
}
-depot_stack_handle_t kmsan_save_stack(void);
depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
unsigned int extra_bits);
diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
index 9733a22c46c1..902ec48b1e3e 100644
--- a/mm/kmsan/kmsan_test.c
+++ b/mm/kmsan/kmsan_test.c
@@ -556,6 +556,21 @@ DEFINE_TEST_MEMSETXX(16)
DEFINE_TEST_MEMSETXX(32)
DEFINE_TEST_MEMSETXX(64)
+/* Test case: ensure that KMSAN does not access shadow memory out of bounds. */
+static void test_memset_on_guarded_buffer(struct kunit *test)
+{
+ void *buf = vmalloc(PAGE_SIZE);
+
+ kunit_info(test,
+ "memset() on ends of guarded buffer should not crash\n");
+
+ for (size_t size = 0; size <= 128; size++) {
+ memset(buf, 0xff, size);
+ memset(buf + PAGE_SIZE - size, 0xff, size);
+ }
+ vfree(buf);
+}
+
static noinline void fibonacci(int *array, int size, int start)
{
if (start < 2 || (start == size))
@@ -677,6 +692,7 @@ static struct kunit_case kmsan_test_cases[] = {
KUNIT_CASE(test_memset16),
KUNIT_CASE(test_memset32),
KUNIT_CASE(test_memset64),
+ KUNIT_CASE(test_memset_on_guarded_buffer),
KUNIT_CASE(test_long_origin_chain),
KUNIT_CASE(test_stackdepot_roundtrip),
KUNIT_CASE(test_unpoison_memory),
@@ -732,3 +748,4 @@ kunit_test_suites(&kmsan_test_suite);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Alexander Potapenko <glider@google.com>");
+MODULE_DESCRIPTION("Test cases for KMSAN");
diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c
index 94a3303fb65e..d6853ce08954 100644
--- a/mm/kmsan/report.c
+++ b/mm/kmsan/report.c
@@ -157,14 +157,14 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size,
unsigned long ua_flags;
bool is_uaf;
- if (!kmsan_enabled)
+ if (!kmsan_enabled || kmsan_in_runtime())
return;
if (current->kmsan_ctx.depth)
return;
if (!origin)
return;
- kmsan_disable_current();
+ kmsan_enter_runtime();
ua_flags = user_access_save();
raw_spin_lock(&kmsan_report_lock);
pr_err("=====================================================\n");
@@ -217,5 +217,5 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size,
if (panic_on_kmsan)
panic("kmsan.panic set ...\n");
user_access_restore(ua_flags);
- kmsan_enable_current();
+ kmsan_leave_runtime();
}
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index 9c58f081d84f..55fdea199aaf 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -207,9 +207,8 @@ void kmsan_free_page(struct page *page, unsigned int order)
if (!kmsan_enabled || kmsan_in_runtime())
return;
kmsan_enter_runtime();
- kmsan_internal_poison_memory(page_address(page),
- page_size(page),
- GFP_KERNEL,
+ kmsan_internal_poison_memory(page_address(page), page_size(page),
+ GFP_KERNEL & ~(__GFP_RECLAIM),
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
}
@@ -248,17 +247,19 @@ int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
kmsan_enter_runtime();
mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot,
s_pages, page_shift);
+ kmsan_leave_runtime();
if (mapped) {
err = mapped;
goto ret;
}
+ kmsan_enter_runtime();
mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot,
o_pages, page_shift);
+ kmsan_leave_runtime();
if (mapped) {
err = mapped;
goto ret;
}
- kmsan_leave_runtime();
flush_tlb_kernel_range(shadow_start, shadow_end);
flush_tlb_kernel_range(origin_start, origin_end);
flush_cache_vmap(shadow_start, shadow_end);
@@ -280,12 +281,8 @@ void __init kmsan_init_alloc_meta_for_range(void *start, void *end)
start = (void *)PAGE_ALIGN_DOWN((u64)start);
size = PAGE_ALIGN((u64)end - (u64)start);
- shadow = memblock_alloc(size, PAGE_SIZE);
- origin = memblock_alloc(size, PAGE_SIZE);
-
- if (!shadow || !origin)
- panic("%s: Failed to allocate metadata memory for early boot range of size %llu",
- __func__, size);
+ shadow = memblock_alloc_or_panic(size, PAGE_SIZE);
+ origin = memblock_alloc_or_panic(size, PAGE_SIZE);
for (u64 addr = 0; addr < size; addr += PAGE_SIZE) {
page = virt_to_page_or_null((char *)start + addr);
diff --git a/mm/ksm.c b/mm/ksm.c
index 31a9bc365437..16d41e2681ea 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -677,28 +677,32 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_v
return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}
-static bool vma_ksm_compatible(struct vm_area_struct *vma)
+static bool ksm_compatible(const struct file *file, vm_flags_t vm_flags)
{
- if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP |
- VM_IO | VM_DONTEXPAND | VM_HUGETLB |
- VM_MIXEDMAP| VM_DROPPABLE))
+ if (vm_flags & (VM_SHARED | VM_MAYSHARE | VM_SPECIAL |
+ VM_HUGETLB | VM_DROPPABLE))
return false; /* just ignore the advice */
- if (vma_is_dax(vma))
+ if (file_is_dax(file))
return false;
#ifdef VM_SAO
- if (vma->vm_flags & VM_SAO)
+ if (vm_flags & VM_SAO)
return false;
#endif
#ifdef VM_SPARC_ADI
- if (vma->vm_flags & VM_SPARC_ADI)
+ if (vm_flags & VM_SPARC_ADI)
return false;
#endif
return true;
}
+static bool vma_ksm_compatible(struct vm_area_struct *vma)
+{
+ return ksm_compatible(vma->vm_file, vma->vm_flags);
+}
+
static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
unsigned long addr)
{
@@ -889,7 +893,7 @@ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node,
unsigned long kpfn;
expected_mapping = (void *)((unsigned long)stable_node |
- PAGE_MAPPING_KSM);
+ FOLIO_MAPPING_KSM);
again:
kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
folio = pfn_folio(kpfn);
@@ -1066,7 +1070,7 @@ static inline void folio_set_stable_node(struct folio *folio,
struct ksm_stable_node *stable_node)
{
VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio);
- folio->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
+ folio->mapping = (void *)((unsigned long)stable_node | FOLIO_MAPPING_KSM);
}
#ifdef CONFIG_SYSFS
@@ -1270,8 +1274,15 @@ static int write_protect_page(struct vm_area_struct *vma, struct folio *folio,
if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
goto out_unlock;
- anon_exclusive = PageAnonExclusive(&folio->page);
entry = ptep_get(pvmw.pte);
+ /*
+ * Handle PFN swap PTEs, such as device-exclusive ones, that actually
+ * map pages: give up just like the next folio_walk would.
+ */
+ if (unlikely(!pte_present(entry)))
+ goto out_unlock;
+
+ anon_exclusive = PageAnonExclusive(&folio->page);
if (pte_write(entry) || pte_dirty(entry) ||
anon_exclusive || mm_tlb_flush_pending(mm)) {
swapped = folio_test_swapcache(folio);
@@ -2447,6 +2458,95 @@ static bool should_skip_rmap_item(struct folio *folio,
return true;
}
+struct ksm_next_page_arg {
+ struct folio *folio;
+ struct page *page;
+ unsigned long addr;
+};
+
+static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct ksm_next_page_arg *private = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *start_ptep = NULL, *ptep, pte;
+ struct mm_struct *mm = walk->mm;
+ struct folio *folio;
+ struct page *page;
+ spinlock_t *ptl;
+ pmd_t pmd;
+
+ if (ksm_test_exit(mm))
+ return 0;
+
+ cond_resched();
+
+ pmd = pmdp_get_lockless(pmdp);
+ if (!pmd_present(pmd))
+ return 0;
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) {
+ ptl = pmd_lock(mm, pmdp);
+ pmd = pmdp_get(pmdp);
+
+ if (!pmd_present(pmd)) {
+ goto not_found_unlock;
+ } else if (pmd_leaf(pmd)) {
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ if (!page)
+ goto not_found_unlock;
+ folio = page_folio(page);
+
+ if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+ goto not_found_unlock;
+
+ page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
+ goto found_unlock;
+ }
+ spin_unlock(ptl);
+ }
+
+ start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!start_ptep)
+ return 0;
+
+ for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
+ pte = ptep_get(ptep);
+
+ if (!pte_present(pte))
+ continue;
+
+ page = vm_normal_page(vma, addr, pte);
+ if (!page)
+ continue;
+ folio = page_folio(page);
+
+ if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+ continue;
+ goto found_unlock;
+ }
+
+not_found_unlock:
+ spin_unlock(ptl);
+ if (start_ptep)
+ pte_unmap(start_ptep);
+ return 0;
+found_unlock:
+ folio_get(folio);
+ spin_unlock(ptl);
+ if (start_ptep)
+ pte_unmap(start_ptep);
+ private->page = page;
+ private->folio = folio;
+ private->addr = addr;
+ return 1;
+}
+
+static struct mm_walk_ops ksm_next_page_ops = {
+ .pmd_entry = ksm_next_page_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
@@ -2534,21 +2634,27 @@ next_mm:
ksm_scan.address = vma->vm_end;
while (ksm_scan.address < vma->vm_end) {
+ struct ksm_next_page_arg ksm_next_page_arg;
struct page *tmp_page = NULL;
- struct folio_walk fw;
struct folio *folio;
if (ksm_test_exit(mm))
break;
- folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
- if (folio) {
- if (!folio_is_zone_device(folio) &&
- folio_test_anon(folio)) {
- folio_get(folio);
- tmp_page = fw.page;
- }
- folio_walk_end(&fw, vma);
+ int found;
+
+ found = walk_page_range_vma(vma, ksm_scan.address,
+ vma->vm_end,
+ &ksm_next_page_ops,
+ &ksm_next_page_arg);
+
+ if (found > 0) {
+ folio = ksm_next_page_arg.folio;
+ tmp_page = ksm_next_page_arg.page;
+ ksm_scan.address = ksm_next_page_arg.addr;
+ } else {
+ VM_WARN_ON_ONCE(found < 0);
+ ksm_scan.address = vma->vm_end - PAGE_SIZE;
}
if (tmp_page) {
@@ -2689,14 +2795,17 @@ static int ksm_scan_thread(void *nothing)
return 0;
}
-static void __ksm_add_vma(struct vm_area_struct *vma)
+static bool __ksm_should_add_vma(const struct file *file, vm_flags_t vm_flags)
{
- unsigned long vm_flags = vma->vm_flags;
-
if (vm_flags & VM_MERGEABLE)
- return;
+ return false;
+
+ return ksm_compatible(file, vm_flags);
+}
- if (vma_ksm_compatible(vma))
+static void __ksm_add_vma(struct vm_area_struct *vma)
+{
+ if (__ksm_should_add_vma(vma->vm_file, vma->vm_flags))
vm_flags_set(vma, VM_MERGEABLE);
}
@@ -2717,16 +2826,22 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
return 0;
}
/**
- * ksm_add_vma - Mark vma as mergeable if compatible
+ * ksm_vma_flags - Update VMA flags to mark as mergeable if compatible
*
- * @vma: Pointer to vma
+ * @mm: Proposed VMA's mm_struct
+ * @file: Proposed VMA's file-backed mapping, if any.
+ * @vm_flags: Proposed VMA's flags.
+ *
+ * Returns: @vm_flags possibly updated to mark mergeable.
*/
-void ksm_add_vma(struct vm_area_struct *vma)
+vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
+ vm_flags_t vm_flags)
{
- struct mm_struct *mm = vma->vm_mm;
+ if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) &&
+ __ksm_should_add_vma(file, vm_flags))
+ vm_flags |= VM_MERGEABLE;
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
- __ksm_add_vma(vma);
+ return vm_flags;
}
static void ksm_add_vmas(struct mm_struct *mm)
@@ -2820,7 +2935,7 @@ int ksm_disable(struct mm_struct *mm)
}
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, int advice, unsigned long *vm_flags)
+ unsigned long end, int advice, vm_flags_t *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
int err;
@@ -3262,6 +3377,25 @@ static void wait_while_offlining(void)
#endif /* CONFIG_MEMORY_HOTREMOVE */
#ifdef CONFIG_PROC_FS
+/*
+ * The process is mergeable only if at least one VMA is currently
+ * applicable to KSM.
+ *
+ * The mmap lock must be held in read mode.
+ */
+bool ksm_process_mergeable(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ mmap_assert_locked(mm);
+ VMA_ITERATOR(vmi, mm, 0);
+ for_each_vma(vmi, vma)
+ if (vma->vm_flags & VM_MERGEABLE)
+ return true;
+
+ return false;
+}
+
long ksm_process_profit(struct mm_struct *mm)
{
return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE -
@@ -3643,10 +3777,10 @@ static ssize_t advisor_mode_show(struct kobject *kobj,
{
const char *output;
- if (ksm_advisor == KSM_ADVISOR_NONE)
- output = "[none] scan-time";
- else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
+ if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
output = "none [scan-time]";
+ else
+ output = "[none] scan-time";
return sysfs_emit(buf, "%s\n", output);
}
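With ksm_add_vma() folded into ksm_vma_flags(), the VM_MERGEABLE decision can be made on the proposed flags before the VMA exists. A hedged sketch of how a caller in the mmap path might use the new helper (the surrounding code is assumed; only ksm_vma_flags() and its signature come from this patch):

/* Hypothetical helper somewhere on the mmap_region() path. */
static vm_flags_t compute_vma_flags(struct mm_struct *mm, struct file *file,
				    vm_flags_t vm_flags)
{
	/*
	 * If the process opted in via MMF_VM_MERGE_ANY and the mapping is
	 * KSM-compatible, VM_MERGEABLE is set up front instead of being
	 * patched onto an already-created VMA afterwards.
	 */
	return ksm_vma_flags(mm, file, vm_flags);
}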
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 7d69434c70e0..ec48b5dadf51 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -60,30 +60,34 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
return &lru->node[nid].lru;
}
+static inline bool lock_list_lru(struct list_lru_one *l, bool irq)
+{
+ if (irq)
+ spin_lock_irq(&l->lock);
+ else
+ spin_lock(&l->lock);
+ if (unlikely(READ_ONCE(l->nr_items) == LONG_MIN)) {
+ if (irq)
+ spin_unlock_irq(&l->lock);
+ else
+ spin_unlock(&l->lock);
+ return false;
+ }
+ return true;
+}
+
static inline struct list_lru_one *
lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
bool irq, bool skip_empty)
{
struct list_lru_one *l;
- long nr_items;
rcu_read_lock();
again:
l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
- if (likely(l)) {
- if (irq)
- spin_lock_irq(&l->lock);
- else
- spin_lock(&l->lock);
- nr_items = READ_ONCE(l->nr_items);
- if (likely(nr_items != LONG_MIN)) {
- rcu_read_unlock();
- return l;
- }
- if (irq)
- spin_unlock_irq(&l->lock);
- else
- spin_unlock(&l->lock);
+ if (likely(l) && lock_list_lru(l, irq)) {
+ rcu_read_unlock();
+ return l;
}
/*
* Caller may simply bail out if raced with reparenting or
@@ -510,7 +514,7 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
gfp_t gfp)
{
unsigned long flags;
- struct list_lru_memcg *mlru;
+ struct list_lru_memcg *mlru = NULL;
struct mem_cgroup *pos, *parent;
XA_STATE(xas, &lru->xa, 0);
@@ -535,9 +539,11 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
parent = parent_mem_cgroup(pos);
}
- mlru = memcg_init_list_lru_one(lru, gfp);
- if (!mlru)
- return -ENOMEM;
+ if (!mlru) {
+ mlru = memcg_init_list_lru_one(lru, gfp);
+ if (!mlru)
+ return -ENOMEM;
+ }
xas_set(&xas, pos->kmemcg_id);
do {
xas_lock_irqsave(&xas, flags);
@@ -548,10 +554,11 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
}
xas_unlock_irqrestore(&xas, flags);
} while (xas_nomem(&xas, gfp));
- if (mlru)
- kfree(mlru);
} while (pos != memcg && !css_is_dying(&pos->css));
+ if (unlikely(mlru))
+ kfree(mlru);
+
return xas_error(&xas);
}
#else
diff --git a/mm/maccess.c b/mm/maccess.c
index 8f0906180a94..486559d68858 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -82,7 +82,6 @@ Efault:
pagefault_enable();
return -EFAULT;
}
-EXPORT_SYMBOL_GPL(copy_to_kernel_nofault);
long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
{
@@ -196,7 +195,7 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
if (ret >= count) {
ret = count;
dst[ret - 1] = '\0';
- } else if (ret > 0) {
+ } else if (ret >= 0) {
ret++;
}
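The one-character fix above only matters for the empty string: strncpy_from_user() reports the number of characters copied excluding the NUL, so it returns 0 for "", while strncpy_from_user_nofault() is documented to return the length including the trailing NUL. With 'ret > 0' the increment was skipped and an empty string was reported as length 0 rather than 1. A hedged sketch of the contract from the caller's side (buffer names are made up):

char buf[32];
long len;

len = strncpy_from_user_nofault(buf, user_ptr, sizeof(buf));
if (len < 0) {
	/* -EFAULT: the user memory could not be read without faulting. */
} else {
	/* len counts the trailing NUL, so an empty string yields len == 1. */
}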
diff --git a/mm/madvise.c b/mm/madvise.c
index dcadd5b3457e..35ed4ab0d7c5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -19,6 +19,7 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
+#include <linux/mmu_context.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
@@ -37,6 +38,8 @@
#include "internal.h"
#include "swap.h"
+#define __MADV_SET_ANON_VMA_NAME (-1)
+
/*
* Maximum number of attempts we make to install guard pages before we give up
* and return -ERESTARTNOINTR to have userspace try again.
@@ -48,34 +51,39 @@ struct madvise_walk_private {
bool pageout;
};
-/*
- * Any behaviour which results in changes to the vma->vm_flags needs to
- * take mmap_lock for writing. Others, which simply traverse vmas, need
- * to only take it for reading.
- */
-static int madvise_need_mmap_write(int behavior)
-{
- switch (behavior) {
- case MADV_REMOVE:
- case MADV_WILLNEED:
- case MADV_DONTNEED:
- case MADV_DONTNEED_LOCKED:
- case MADV_COLD:
- case MADV_PAGEOUT:
- case MADV_FREE:
- case MADV_POPULATE_READ:
- case MADV_POPULATE_WRITE:
- case MADV_COLLAPSE:
- case MADV_GUARD_INSTALL:
- case MADV_GUARD_REMOVE:
- return 0;
- default:
- /* be safe, default to 1. list exceptions explicitly */
- return 1;
- }
-}
+enum madvise_lock_mode {
+ MADVISE_NO_LOCK,
+ MADVISE_MMAP_READ_LOCK,
+ MADVISE_MMAP_WRITE_LOCK,
+ MADVISE_VMA_READ_LOCK,
+};
+
+struct madvise_behavior_range {
+ unsigned long start;
+ unsigned long end;
+};
+
+struct madvise_behavior {
+ struct mm_struct *mm;
+ int behavior;
+ struct mmu_gather *tlb;
+ enum madvise_lock_mode lock_mode;
+ struct anon_vma_name *anon_name;
+
+ /*
+ * The range over which the behaviour is currently being applied. If
+ * traversing multiple VMAs, this is updated for each.
+ */
+ struct madvise_behavior_range range;
+ /* The VMA and VMA preceding it (if applicable) currently targeted. */
+ struct vm_area_struct *prev;
+ struct vm_area_struct *vma;
+ bool lock_dropped;
+};
#ifdef CONFIG_ANON_VMA_NAME
+static int madvise_walk_vmas(struct madvise_behavior *madv_behavior);
+
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
struct anon_vma_name *anon_name;
@@ -101,7 +109,8 @@ void anon_vma_name_free(struct kref *kref)
struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
- mmap_assert_locked(vma->vm_mm);
+ if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
+ vma_assert_locked(vma);
return vma->anon_name;
}
@@ -137,40 +146,39 @@ static int replace_anon_vma_name(struct vm_area_struct *vma,
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
- * Update the vm_flags on region of a vma, splitting it or merging it as
- * necessary. Must be called with mmap_lock held for writing;
- * Caller should ensure anon_name stability by raising its refcount even when
- * anon_name belongs to a valid vma because this function might free that vma.
+ * Update the vm_flags or anon_name on region of a vma, splitting it or merging
+ * it as necessary. Must be called with mmap_lock held for writing.
*/
-static int madvise_update_vma(struct vm_area_struct *vma,
- struct vm_area_struct **prev, unsigned long start,
- unsigned long end, unsigned long new_flags,
- struct anon_vma_name *anon_name)
+static int madvise_update_vma(vm_flags_t new_flags,
+ struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
- int error;
- VMA_ITERATOR(vmi, mm, start);
-
- if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
- *prev = vma;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+ struct anon_vma_name *anon_name = madv_behavior->anon_name;
+ bool set_new_anon_name = madv_behavior->behavior == __MADV_SET_ANON_VMA_NAME;
+ VMA_ITERATOR(vmi, madv_behavior->mm, range->start);
+
+ if (new_flags == vma->vm_flags && (!set_new_anon_name ||
+ anon_vma_name_eq(anon_vma_name(vma), anon_name)))
return 0;
- }
- vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
- anon_name);
+ if (set_new_anon_name)
+ vma = vma_modify_name(&vmi, madv_behavior->prev, vma,
+ range->start, range->end, anon_name);
+ else
+ vma = vma_modify_flags(&vmi, madv_behavior->prev, vma,
+ range->start, range->end, new_flags);
+
if (IS_ERR(vma))
return PTR_ERR(vma);
- *prev = vma;
+ madv_behavior->vma = vma;
/* vm_flags is protected by the mmap_lock held in write mode. */
vma_start_write(vma);
vm_flags_reset(vma, new_flags);
- if (!vma->vm_file || vma_is_anon_shmem(vma)) {
- error = replace_anon_vma_name(vma, anon_name);
- if (error)
- return error;
- }
+ if (set_new_anon_name)
+ return replace_anon_vma_name(vma, anon_name);
return 0;
}
@@ -263,21 +271,27 @@ static void shmem_swapin_range(struct vm_area_struct *vma,
}
#endif /* CONFIG_SWAP */
+static void mark_mmap_lock_dropped(struct madvise_behavior *madv_behavior)
+{
+ VM_WARN_ON_ONCE(madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK);
+ madv_behavior->lock_dropped = true;
+}
+
/*
* Schedule all required I/O operations. Do not wait for completion.
*/
-static long madvise_willneed(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_willneed(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct mm_struct *mm = madv_behavior->mm;
struct file *file = vma->vm_file;
+ unsigned long start = madv_behavior->range.start;
+ unsigned long end = madv_behavior->range.end;
loff_t offset;
- *prev = vma;
#ifdef CONFIG_SWAP
if (!file) {
- walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
+ walk_page_range_vma(vma, start, end, &swapin_walk_ops, vma);
lru_add_drain(); /* Push any new pages onto the LRU now */
return 0;
}
@@ -303,7 +317,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
* vma's reference to the file) can go away as soon as we drop
* mmap_lock.
*/
- *prev = NULL; /* tell sys_madvise we drop mmap_lock */
+ mark_mmap_lock_dropped(madv_behavior);
get_file(file);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
@@ -331,14 +345,12 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
struct folio *folio, pte_t *ptep,
- pte_t pte, bool *any_young,
- bool *any_dirty)
+ pte_t *ptentp)
{
- const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
int max_nr = (end - addr) / PAGE_SIZE;
- return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
- any_young, any_dirty);
+ return folio_pte_batch_flags(folio, NULL, ptep, ptentp, max_nr,
+ FPB_MERGE_YOUNG_DIRTY);
}
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
@@ -387,7 +399,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
folio = pmd_folio(orig_pmd);
/* Do not interfere with other mappings of this folio */
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
goto huge_unlock;
if (pageout_anon_only_filter && !folio_test_anon(folio))
@@ -476,17 +488,11 @@ restart:
* next pte in the range.
*/
if (folio_test_large(folio)) {
- bool any_young;
-
- nr = madvise_folio_pte_batch(addr, end, folio, pte,
- ptent, &any_young, NULL);
- if (any_young)
- ptent = pte_mkyoung(ptent);
-
+ nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent);
if (nr < folio_nr_pages(folio)) {
int err;
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
continue;
if (pageout_anon_only_filter && !folio_test_anon(folio))
continue;
@@ -503,6 +509,7 @@ restart:
pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
@@ -567,16 +574,19 @@ static const struct mm_walk_ops cold_walk_ops = {
};
static void madvise_cold_page_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+ struct madvise_behavior *madv_behavior)
+
{
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct madvise_behavior_range *range = &madv_behavior->range;
struct madvise_walk_private walk_private = {
.pageout = false,
.tlb = tlb,
};
tlb_start_vma(tlb, vma);
- walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+ walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops,
+ &walk_private);
tlb_end_vma(tlb, vma);
}
@@ -585,28 +595,25 @@ static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}
-static long madvise_cold(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start_addr, unsigned long end_addr)
+static long madvise_cold(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *vma = madv_behavior->vma;
struct mmu_gather tlb;
- *prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
- madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_gather_mmu(&tlb, madv_behavior->mm);
+ madvise_cold_page_range(&tlb, madv_behavior);
tlb_finish_mmu(&tlb);
return 0;
}
static void madvise_pageout_page_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+ struct vm_area_struct *vma,
+ struct madvise_behavior_range *range)
{
struct madvise_walk_private walk_private = {
.pageout = true,
@@ -614,18 +621,16 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb,
};
tlb_start_vma(tlb, vma);
- walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+ walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops,
+ &walk_private);
tlb_end_vma(tlb, vma);
}
-static long madvise_pageout(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start_addr, unsigned long end_addr)
+static long madvise_pageout(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
+ struct vm_area_struct *vma = madv_behavior->vma;
- *prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
@@ -640,8 +645,8 @@ static long madvise_pageout(struct vm_area_struct *vma,
return 0;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
- madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_gather_mmu(&tlb, madv_behavior->mm);
+ madvise_pageout_page_range(&tlb, vma, &madv_behavior->range);
tlb_finish_mmu(&tlb);
return 0;
@@ -713,15 +718,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
* next pte in the range.
*/
if (folio_test_large(folio)) {
- bool any_young, any_dirty;
-
- nr = madvise_folio_pte_batch(addr, end, folio, pte,
- ptent, &any_young, &any_dirty);
-
+ nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent);
if (nr < folio_nr_pages(folio)) {
int err;
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
continue;
if (!folio_trylock(folio))
continue;
@@ -736,16 +737,12 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
start_pte = pte;
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
continue;
}
-
- if (any_young)
- ptent = pte_mkyoung(ptent);
- if (any_dirty)
- ptent = pte_mkdirty(ptent);
}
if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
@@ -789,17 +786,31 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-static const struct mm_walk_ops madvise_free_walk_ops = {
- .pmd_entry = madvise_free_pte_range,
- .walk_lock = PGWALK_RDLOCK,
-};
+static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode)
+{
+ switch (mode) {
+ case MADVISE_VMA_READ_LOCK:
+ return PGWALK_VMA_RDLOCK_VERIFY;
+ case MADVISE_MMAP_READ_LOCK:
+ return PGWALK_RDLOCK;
+ default:
+ /* Other modes don't require fixing up the walk_lock */
+ WARN_ON_ONCE(1);
+ return PGWALK_RDLOCK;
+ }
+}
-static int madvise_free_single_vma(struct vm_area_struct *vma,
- unsigned long start_addr, unsigned long end_addr)
+static int madvise_free_single_vma(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mm_struct *mm = madv_behavior->mm;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ unsigned long start_addr = madv_behavior->range.start;
+ unsigned long end_addr = madv_behavior->range.end;
struct mmu_notifier_range range;
- struct mmu_gather tlb;
+ struct mmu_gather *tlb = madv_behavior->tlb;
+ struct mm_walk_ops walk_ops = {
+ .pmd_entry = madvise_free_pte_range,
+ };
/* MADV_FREE works for only anon vma at the moment */
if (!vma_is_anonymous(vma))
@@ -815,17 +826,15 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
range.start, range.end);
lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(&range);
- tlb_start_vma(&tlb, vma);
- walk_page_range(vma->vm_mm, range.start, range.end,
- &madvise_free_walk_ops, &tlb);
- tlb_end_vma(&tlb, vma);
+ tlb_start_vma(tlb, vma);
+ walk_ops.walk_lock = get_walk_lock(madv_behavior->lock_mode);
+ walk_page_range_vma(vma, range.start, range.end,
+ &walk_ops, tlb);
+ tlb_end_vma(tlb, vma);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb);
-
return 0;
}
@@ -848,18 +857,28 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
* An interface that causes the system to free clean pages and flush
* dirty pages is already available as msync(MS_INVALIDATE).
*/
-static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior)
+
{
- zap_page_range_single(vma, start, end - start, NULL);
+ struct madvise_behavior_range *range = &madv_behavior->range;
+ struct zap_details details = {
+ .reclaim_pt = true,
+ .even_cows = true,
+ };
+
+ zap_page_range_single_batched(
+ madv_behavior->tlb, madv_behavior->vma, range->start,
+ range->end - range->start, &details);
return 0;
}
-static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
- unsigned long start,
- unsigned long *end,
- int behavior)
+static
+bool madvise_dontneed_free_valid_vma(struct madvise_behavior *madv_behavior)
{
+ struct vm_area_struct *vma = madv_behavior->vma;
+ int behavior = madv_behavior->behavior;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+
if (!is_vm_hugetlb_page(vma)) {
unsigned int forbidden = VM_PFNMAP;
@@ -871,7 +890,7 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
return false;
- if (start & ~huge_page_mask(hstate_vma(vma)))
+ if (range->start & ~huge_page_mask(hstate_vma(vma)))
return false;
/*
@@ -880,40 +899,38 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
* Avoid unexpected data loss by rounding down the number of
* huge pages freed.
*/
- *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));
+ range->end = ALIGN_DOWN(range->end, huge_page_size(hstate_vma(vma)));
return true;
}
-static long madvise_dontneed_free(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end,
- int behavior)
+static long madvise_dontneed_free(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mm_struct *mm = madv_behavior->mm;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+ int behavior = madv_behavior->behavior;
- *prev = vma;
- if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
+ if (!madvise_dontneed_free_valid_vma(madv_behavior))
return -EINVAL;
- if (start == end)
+ if (range->start == range->end)
return 0;
- if (!userfaultfd_remove(vma, start, end)) {
- *prev = NULL; /* mmap_lock has been dropped, prev is stale */
+ if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) {
+ struct vm_area_struct *vma;
+ mark_mmap_lock_dropped(madv_behavior);
mmap_read_lock(mm);
- vma = vma_lookup(mm, start);
+ madv_behavior->vma = vma = vma_lookup(mm, range->start);
if (!vma)
return -ENOMEM;
/*
* Potential end adjustment for hugetlb vma is OK as
* the check below keeps end within vma.
*/
- if (!madvise_dontneed_free_valid_vma(vma, start, &end,
- behavior))
+ if (!madvise_dontneed_free_valid_vma(madv_behavior))
return -EINVAL;
- if (end > vma->vm_end) {
+ if (range->end > vma->vm_end) {
/*
* Don't fail if end > vma->vm_end. If the old
* vma was split while the mmap_lock was
@@ -926,7 +943,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
* end-vma->vm_end range, but the manager can
* handle a repetition fine.
*/
- end = vma->vm_end;
+ range->end = vma->vm_end;
}
/*
* If the memory region between start and end was
@@ -935,24 +952,26 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
* the adjustment for hugetlb vma above may have rounded
* end down to the start address.
*/
- if (start == end)
+ if (range->start == range->end)
return 0;
- VM_WARN_ON(start > end);
+ VM_WARN_ON(range->start > range->end);
}
if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
- return madvise_dontneed_single_vma(vma, start, end);
+ return madvise_dontneed_single_vma(madv_behavior);
else if (behavior == MADV_FREE)
- return madvise_free_single_vma(vma, start, end);
+ return madvise_free_single_vma(madv_behavior);
else
return -EINVAL;
}
-static long madvise_populate(struct mm_struct *mm, unsigned long start,
- unsigned long end, int behavior)
+static long madvise_populate(struct madvise_behavior *madv_behavior)
{
- const bool write = behavior == MADV_POPULATE_WRITE;
+ struct mm_struct *mm = madv_behavior->mm;
+ const bool write = madv_behavior->behavior == MADV_POPULATE_WRITE;
int locked = 1;
+ unsigned long start = madv_behavior->range.start;
+ unsigned long end = madv_behavior->range.end;
long pages;
while (start < end) {
@@ -989,16 +1008,17 @@ static long madvise_populate(struct mm_struct *mm, unsigned long start,
* Application wants to free up the pages and associated backing store.
* This is effectively punching a hole into the middle of a file.
*/
-static long madvise_remove(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_remove(struct madvise_behavior *madv_behavior)
{
loff_t offset;
int error;
struct file *f;
- struct mm_struct *mm = vma->vm_mm;
+ struct mm_struct *mm = madv_behavior->mm;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ unsigned long start = madv_behavior->range.start;
+ unsigned long end = madv_behavior->range.end;
- *prev = NULL; /* tell sys_madvise we drop mmap_lock */
+ mark_mmap_lock_dropped(madv_behavior);
if (vma->vm_flags & VM_LOCKED)
return -EINVAL;
@@ -1046,13 +1066,7 @@ static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
if (!allow_locked)
disallowed |= VM_LOCKED;
- if (!vma_is_anonymous(vma))
- return false;
-
- if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE)
- return false;
-
- return true;
+ return !(vma->vm_flags & disallowed);
}
static bool is_guard_pte_marker(pte_t ptent)
@@ -1067,7 +1081,7 @@ static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
pud_t pudval = pudp_get(pud);
/* If huge return >0 so we abort the operation + zap. */
- return pud_trans_huge(pudval) || pud_devmap(pudval);
+ return pud_trans_huge(pudval);
}
static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -1076,7 +1090,7 @@ static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
pmd_t pmdval = pmdp_get(pmd);
/* If huge return >0 so we abort the operation + zap. */
- return pmd_trans_huge(pmdval) || pmd_devmap(pmdval);
+ return pmd_trans_huge(pmdval);
}
static int guard_install_pte_entry(pte_t *pte, unsigned long addr,
@@ -1116,14 +1130,13 @@ static const struct mm_walk_ops guard_install_walk_ops = {
.walk_lock = PGWALK_RDLOCK,
};
-static long madvise_guard_install(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_guard_install(struct madvise_behavior *madv_behavior)
{
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct madvise_behavior_range *range = &madv_behavior->range;
long err;
int i;
- *prev = vma;
if (!is_valid_guard_vma(vma, /* allow_locked = */false))
return -EINVAL;
@@ -1154,13 +1167,14 @@ static long madvise_guard_install(struct vm_area_struct *vma,
unsigned long nr_pages = 0;
/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
- err = walk_page_range_mm(vma->vm_mm, start, end,
+ err = walk_page_range_mm(vma->vm_mm, range->start, range->end,
&guard_install_walk_ops, &nr_pages);
if (err < 0)
return err;
if (err == 0) {
- unsigned long nr_expected_pages = PHYS_PFN(end - start);
+ unsigned long nr_expected_pages =
+ PHYS_PFN(range->end - range->start);
VM_WARN_ON(nr_pages != nr_expected_pages);
return 0;
@@ -1170,7 +1184,8 @@ static long madvise_guard_install(struct vm_area_struct *vma,
* OK some of the range have non-guard pages mapped, zap
* them. This leaves existing guard pages in place.
*/
- zap_page_range_single(vma, start, end - start, NULL);
+ zap_page_range_single(vma, range->start,
+ range->end - range->start, NULL);
}
/*
@@ -1187,7 +1202,7 @@ static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
pud_t pudval = pudp_get(pud);
/* If huge, cannot have guard pages present, so no-op - skip. */
- if (pud_trans_huge(pudval) || pud_devmap(pudval))
+ if (pud_trans_huge(pudval))
walk->action = ACTION_CONTINUE;
return 0;
@@ -1199,7 +1214,7 @@ static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr,
pmd_t pmdval = pmdp_get(pmd);
/* If huge, cannot have guard pages present, so no-op - skip. */
- if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
+ if (pmd_trans_huge(pmdval))
walk->action = ACTION_CONTINUE;
return 0;
@@ -1226,11 +1241,11 @@ static const struct mm_walk_ops guard_remove_walk_ops = {
.walk_lock = PGWALK_RDLOCK,
};
-static long madvise_guard_remove(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
{
- *prev = vma;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+
/*
* We're ok with removing guards in mlock()'d ranges, as this is a
* non-destructive action.
@@ -1238,40 +1253,117 @@ static long madvise_guard_remove(struct vm_area_struct *vma,
if (!is_valid_guard_vma(vma, /* allow_locked = */true))
return -EINVAL;
- return walk_page_range(vma->vm_mm, start, end,
+ return walk_page_range_vma(vma, range->start, range->end,
&guard_remove_walk_ops, NULL);
}
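For reference, the guard regions manipulated above are driven from userspace with MADV_GUARD_INSTALL and MADV_GUARD_REMOVE. A hedged usage sketch, not part of this patch (the fallback constants are an assumption matching the current asm-generic uapi values):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_GUARD_INSTALL
#define MADV_GUARD_INSTALL 102	/* assumption: matches asm-generic uapi */
#define MADV_GUARD_REMOVE  103
#endif

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* Any access to the first page now raises SIGSEGV until the guard goes. */
	if (madvise(p, page, MADV_GUARD_INSTALL))
		perror("MADV_GUARD_INSTALL");
	if (madvise(p, page, MADV_GUARD_REMOVE))
		perror("MADV_GUARD_REMOVE");
	p[0] = 1;	/* safe again: the page faults in as a fresh zero page */
	munmap(p, 4 * page);
	return 0;
}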
+#ifdef CONFIG_64BIT
+/* Does the madvise operation result in discarding of mapped data? */
+static bool is_discard(int behavior)
+{
+ switch (behavior) {
+ case MADV_FREE:
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ case MADV_REMOVE:
+ case MADV_DONTFORK:
+ case MADV_WIPEONFORK:
+ case MADV_GUARD_INSTALL:
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * We are restricted from madvise()'ing mseal()'d VMAs only in very particular
+ * circumstances - discarding of data from read-only anonymous SEALED mappings.
+ *
+ * This is because users cannot trivially discard data from these VMAs, and may
+ * only do so via an appropriate madvise() call.
+ */
+static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
+{
+ struct vm_area_struct *vma = madv_behavior->vma;
+
+ /* If the VMA isn't sealed we're good. */
+ if (!vma_is_sealed(vma))
+ return true;
+
+ /* For a sealed VMA, we only care about discard operations. */
+ if (!is_discard(madv_behavior->behavior))
+ return true;
+
+ /*
+ * We explicitly permit all file-backed mappings, whether MAP_SHARED or
+ * MAP_PRIVATE.
+ *
+ * The latter causes some complications. Because now, one can mmap()
+ * read/write a MAP_PRIVATE mapping, write to it, then mprotect()
+ * read-only, mseal() and a discard will be permitted.
+ *
+ * However, in order to avoid issues with potential use of madvise(...,
+ * MADV_DONTNEED) of mseal()'d .text mappings we, for the time being,
+ * permit this.
+ */
+ if (!vma_is_anonymous(vma))
+ return true;
+
+ /* If the user could write to the mapping anyway, then this is fine. */
+ if ((vma->vm_flags & VM_WRITE) &&
+ arch_vma_access_permitted(vma, /* write= */ true,
+ /* execute= */ false, /* foreign= */ false))
+ return true;
+
+ /* Otherwise, we are not permitted to perform this operation. */
+ return false;
+}
+#else
+static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
+{
+ return true;
+}
+#endif
+
/*
* Apply an madvise behavior to a region of a vma. madvise_update_vma
* will handle splitting a vm area into separate areas, each area with its own
* behavior.
*/
-static int madvise_vma_behavior(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end,
- unsigned long behavior)
+static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
{
+ int behavior = madv_behavior->behavior;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ vm_flags_t new_flags = vma->vm_flags;
+ struct madvise_behavior_range *range = &madv_behavior->range;
int error;
- struct anon_vma_name *anon_name;
- unsigned long new_flags = vma->vm_flags;
- if (unlikely(!can_modify_vma_madv(vma, behavior)))
+ if (unlikely(!can_madvise_modify(madv_behavior)))
return -EPERM;
switch (behavior) {
case MADV_REMOVE:
- return madvise_remove(vma, prev, start, end);
+ return madvise_remove(madv_behavior);
case MADV_WILLNEED:
- return madvise_willneed(vma, prev, start, end);
+ return madvise_willneed(madv_behavior);
case MADV_COLD:
- return madvise_cold(vma, prev, start, end);
+ return madvise_cold(madv_behavior);
case MADV_PAGEOUT:
- return madvise_pageout(vma, prev, start, end);
+ return madvise_pageout(madv_behavior);
case MADV_FREE:
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
- return madvise_dontneed_free(vma, prev, start, end, behavior);
+ return madvise_dontneed_free(madv_behavior);
+ case MADV_COLLAPSE:
+ return madvise_collapse(vma, range->start, range->end,
+ &madv_behavior->lock_dropped);
+ case MADV_GUARD_INSTALL:
+ return madvise_guard_install(madv_behavior);
+ case MADV_GUARD_REMOVE:
+ return madvise_guard_remove(madv_behavior);
+
+ /* The below behaviours update VMAs via madvise_update_vma(). */
+
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
break;
@@ -1285,18 +1377,18 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
new_flags |= VM_DONTCOPY;
break;
case MADV_DOFORK:
- if (vma->vm_flags & VM_IO)
+ if (new_flags & VM_IO)
return -EINVAL;
new_flags &= ~VM_DONTCOPY;
break;
case MADV_WIPEONFORK:
/* MADV_WIPEONFORK is only supported on anonymous memory. */
- if (vma->vm_file || vma->vm_flags & VM_SHARED)
+ if (vma->vm_file || new_flags & VM_SHARED)
return -EINVAL;
new_flags |= VM_WIPEONFORK;
break;
case MADV_KEEPONFORK:
- if (vma->vm_flags & VM_DROPPABLE)
+ if (new_flags & VM_DROPPABLE)
return -EINVAL;
new_flags &= ~VM_WIPEONFORK;
break;
@@ -1304,14 +1396,15 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
new_flags |= VM_DONTDUMP;
break;
case MADV_DODUMP:
- if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
- (vma->vm_flags & VM_DROPPABLE))
+ if ((!is_vm_hugetlb_page(vma) && (new_flags & VM_SPECIAL)) ||
+ (new_flags & VM_DROPPABLE))
return -EINVAL;
new_flags &= ~VM_DONTDUMP;
break;
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
- error = ksm_madvise(vma, start, end, behavior, &new_flags);
+ error = ksm_madvise(vma, range->start, range->end,
+ behavior, &new_flags);
if (error)
goto out;
break;
@@ -1321,20 +1414,17 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
if (error)
goto out;
break;
- case MADV_COLLAPSE:
- return madvise_collapse(vma, prev, start, end);
- case MADV_GUARD_INSTALL:
- return madvise_guard_install(vma, prev, start, end);
- case MADV_GUARD_REMOVE:
- return madvise_guard_remove(vma, prev, start, end);
+ case __MADV_SET_ANON_VMA_NAME:
+ /* Only anonymous mappings can be named */
+ if (vma->vm_file && !vma_is_anon_shmem(vma))
+ return -EBADF;
+ break;
}
- anon_name = anon_vma_name(vma);
- anon_vma_name_get(anon_name);
- error = madvise_update_vma(vma, prev, start, end, new_flags,
- anon_name);
- anon_vma_name_put(anon_name);
+ /* This is a write operation. */
+ VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK);
+ error = madvise_update_vma(new_flags, madv_behavior);
out:
/*
* madvise() returns EAGAIN if kernel resources, such as
@@ -1349,15 +1439,15 @@ out:
/*
* Error injection support for memory error handling.
*/
-static int madvise_inject_error(int behavior,
- unsigned long start, unsigned long end)
+static int madvise_inject_error(struct madvise_behavior *madv_behavior)
{
unsigned long size;
+ unsigned long start = madv_behavior->range.start;
+ unsigned long end = madv_behavior->range.end;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
-
for (; start < end; start += size) {
unsigned long pfn;
struct page *page;
@@ -1375,7 +1465,7 @@ static int madvise_inject_error(int behavior,
*/
size = page_size(compound_head(page));
- if (behavior == MADV_SOFT_OFFLINE) {
+ if (madv_behavior->behavior == MADV_SOFT_OFFLINE) {
pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
pfn, start);
ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
@@ -1393,7 +1483,31 @@ static int madvise_inject_error(int behavior,
return 0;
}
-#endif
+
+static bool is_memory_failure(struct madvise_behavior *madv_behavior)
+{
+ switch (madv_behavior->behavior) {
+ case MADV_HWPOISON:
+ case MADV_SOFT_OFFLINE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+#else
+
+static int madvise_inject_error(struct madvise_behavior *madv_behavior)
+{
+ return 0;
+}
+
+static bool is_memory_failure(struct madvise_behavior *madv_behavior)
+{
+ return false;
+}
+
+#endif /* CONFIG_MEMORY_FAILURE */
static bool
madvise_behavior_valid(int behavior)
@@ -1454,121 +1568,325 @@ static bool process_madvise_remote_valid(int behavior)
}
/*
- * Walk the vmas in range [start,end), and call the visit function on each one.
- * The visit function will get start and end parameters that cover the overlap
- * between the current vma and the original range. Any unmapped regions in the
- * original range will result in this function returning -ENOMEM while still
- * calling the visit function on all of the existing vmas in the range.
- * Must be called with the mmap_lock held for reading or writing.
+ * Try to acquire a VMA read lock if possible.
+ *
+ * We only support this lock over a single VMA, which the input range must
+ * span either partially or fully.
+ *
+ * This function always returns with an appropriate lock held. If a VMA read
+ * lock could be acquired, we return true and set madv_behavior state
+ * accordingly.
+ *
+ * If a VMA read lock could not be acquired, we return false and expect the
+ * caller to fall back to mmap lock behaviour.
+ */
+static bool try_vma_read_lock(struct madvise_behavior *madv_behavior)
+{
+ struct mm_struct *mm = madv_behavior->mm;
+ struct vm_area_struct *vma;
+
+ vma = lock_vma_under_rcu(mm, madv_behavior->range.start);
+ if (!vma)
+ goto take_mmap_read_lock;
+ /*
+ * Must span only a single VMA; uffd and remote processes are
+ * unsupported.
+ */
+ if (madv_behavior->range.end > vma->vm_end || current->mm != mm ||
+ userfaultfd_armed(vma)) {
+ vma_end_read(vma);
+ goto take_mmap_read_lock;
+ }
+ madv_behavior->vma = vma;
+ return true;
+
+take_mmap_read_lock:
+ mmap_read_lock(mm);
+ madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK;
+ return false;
+}
+
+/*
+ * Walk the vmas in range [start,end), and call the madvise_vma_behavior
+ * function on each one. The function will get start and end parameters that
+ * cover the overlap between the current vma and the original range. Any
+ * unmapped regions in the original range will result in this function returning
+ * -ENOMEM while still calling the madvise_vma_behavior function on all of the
+ * existing vmas in the range. Must be called with the mmap_lock held for
+ * reading or writing.
*/
static
-int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned long arg,
- int (*visit)(struct vm_area_struct *vma,
- struct vm_area_struct **prev, unsigned long start,
- unsigned long end, unsigned long arg))
+int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
{
- struct vm_area_struct *vma;
- struct vm_area_struct *prev;
- unsigned long tmp;
+ struct mm_struct *mm = madv_behavior->mm;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+ /* range is updated to span each VMA, so store end of entire range. */
+ unsigned long last_end = range->end;
int unmapped_error = 0;
+ int error;
+ struct vm_area_struct *prev, *vma;
/*
- * If the interval [start,end) covers some unmapped address
- * ranges, just ignore them, but return -ENOMEM at the end.
- * - different from the way of handling in mlock etc.
+ * If the VMA read lock is supported, tentatively apply madvise to a single
+ * VMA and avoid walking the VMA list.
*/
- vma = find_vma_prev(mm, start, &prev);
- if (vma && start > vma->vm_start)
+ if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK &&
+ try_vma_read_lock(madv_behavior)) {
+ error = madvise_vma_behavior(madv_behavior);
+ vma_end_read(madv_behavior->vma);
+ return error;
+ }
+
+ vma = find_vma_prev(mm, range->start, &prev);
+ if (vma && range->start > vma->vm_start)
prev = vma;
for (;;) {
- int error;
-
/* Still start < end. */
if (!vma)
return -ENOMEM;
- /* Here start < (end|vma->vm_end). */
- if (start < vma->vm_start) {
+ /* Here start < (last_end|vma->vm_end). */
+ if (range->start < vma->vm_start) {
+ /*
+ * This indicates a gap between VMAs in the input
+ * range. This does not cause the operation to abort,
+ * rather we simply return -ENOMEM to indicate that this
+ * has happened, but carry on.
+ */
unmapped_error = -ENOMEM;
- start = vma->vm_start;
- if (start >= end)
+ range->start = vma->vm_start;
+ if (range->start >= last_end)
break;
}
- /* Here vma->vm_start <= start < (end|vma->vm_end) */
- tmp = vma->vm_end;
- if (end < tmp)
- tmp = end;
+ /* Here vma->vm_start <= range->start < (last_end|vma->vm_end) */
+ range->end = min(vma->vm_end, last_end);
- /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
- error = visit(vma, &prev, start, tmp, arg);
+ /* Here vma->vm_start <= range->start < range->end <= (last_end|vma->vm_end). */
+ madv_behavior->prev = prev;
+ madv_behavior->vma = vma;
+ error = madvise_vma_behavior(madv_behavior);
if (error)
return error;
- start = tmp;
- if (prev && start < prev->vm_end)
- start = prev->vm_end;
- if (start >= end)
+ if (madv_behavior->lock_dropped) {
+ /* We dropped the mmap lock, we can't ref the VMA. */
+ prev = NULL;
+ vma = NULL;
+ madv_behavior->lock_dropped = false;
+ } else {
+ vma = madv_behavior->vma;
+ prev = vma;
+ }
+
+ if (vma && range->end < vma->vm_end)
+ range->end = vma->vm_end;
+ if (range->end >= last_end)
break;
- if (prev)
- vma = find_vma(mm, prev->vm_end);
- else /* madvise_remove dropped mmap_lock */
- vma = find_vma(mm, start);
+
+ vma = find_vma(mm, vma ? vma->vm_end : range->end);
+ range->start = range->end;
}
return unmapped_error;
}
-#ifdef CONFIG_ANON_VMA_NAME
-static int madvise_vma_anon_name(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end,
- unsigned long anon_name)
+/*
+ * Any behaviour which results in changes to the vma->vm_flags needs to
+ * take mmap_lock for writing. Others, which simply traverse vmas, need
+ * to only take it for reading.
+ */
+static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
{
- int error;
+ if (is_memory_failure(madv_behavior))
+ return MADVISE_NO_LOCK;
- /* Only anonymous mappings can be named */
- if (vma->vm_file && !vma_is_anon_shmem(vma))
- return -EBADF;
+ switch (madv_behavior->behavior) {
+ case MADV_REMOVE:
+ case MADV_WILLNEED:
+ case MADV_COLD:
+ case MADV_PAGEOUT:
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ case MADV_COLLAPSE:
+ case MADV_GUARD_INSTALL:
+ case MADV_GUARD_REMOVE:
+ return MADVISE_MMAP_READ_LOCK;
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ case MADV_FREE:
+ return MADVISE_VMA_READ_LOCK;
+ default:
+ return MADVISE_MMAP_WRITE_LOCK;
+ }
+}
- error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
- (struct anon_vma_name *)anon_name);
+static int madvise_lock(struct madvise_behavior *madv_behavior)
+{
+ struct mm_struct *mm = madv_behavior->mm;
+ enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- return error;
+ switch (lock_mode) {
+ case MADVISE_NO_LOCK:
+ break;
+ case MADVISE_MMAP_WRITE_LOCK:
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ break;
+ case MADVISE_MMAP_READ_LOCK:
+ mmap_read_lock(mm);
+ break;
+ case MADVISE_VMA_READ_LOCK:
+ /* We will acquire the lock per-VMA in madvise_walk_vmas(). */
+ break;
+ }
+
+ madv_behavior->lock_mode = lock_mode;
+ return 0;
}
-int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
- unsigned long len_in, struct anon_vma_name *anon_name)
+static void madvise_unlock(struct madvise_behavior *madv_behavior)
{
- unsigned long end;
- unsigned long len;
+ struct mm_struct *mm = madv_behavior->mm;
- if (start & ~PAGE_MASK)
- return -EINVAL;
- len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+ switch (madv_behavior->lock_mode) {
+ case MADVISE_NO_LOCK:
+ return;
+ case MADVISE_MMAP_WRITE_LOCK:
+ mmap_write_unlock(mm);
+ break;
+ case MADVISE_MMAP_READ_LOCK:
+ mmap_read_unlock(mm);
+ break;
+ case MADVISE_VMA_READ_LOCK:
+ /* We will drop the lock per-VMA in madvise_walk_vmas(). */
+ break;
+ }
+
+ madv_behavior->lock_mode = MADVISE_NO_LOCK;
+}
+
+static bool madvise_batch_tlb_flush(int behavior)
+{
+ switch (behavior) {
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ case MADV_FREE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static void madvise_init_tlb(struct madvise_behavior *madv_behavior)
+{
+ if (madvise_batch_tlb_flush(madv_behavior->behavior))
+ tlb_gather_mmu(madv_behavior->tlb, madv_behavior->mm);
+}
+
+static void madvise_finish_tlb(struct madvise_behavior *madv_behavior)
+{
+ if (madvise_batch_tlb_flush(madv_behavior->behavior))
+ tlb_finish_mmu(madv_behavior->tlb);
+}
+
+static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
+{
+ size_t len;
+
+ if (!madvise_behavior_valid(behavior))
+ return false;
+
+ if (!PAGE_ALIGNED(start))
+ return false;
+ len = PAGE_ALIGN(len_in);
/* Check to see whether len was rounded up from small -ve to zero */
if (len_in && !len)
- return -EINVAL;
+ return false;
- end = start + len;
- if (end < start)
- return -EINVAL;
+ if (start + len < start)
+ return false;
- if (end == start)
- return 0;
+ return true;
+}
- return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
- madvise_vma_anon_name);
+/*
+ * madvise_should_skip() - Return whether the request is invalid or a no-op.
+ * @start: Start address of madvise-requested address range.
+ * @len_in: Length of madvise-requested address range.
+ * @behavior: Requested madvise behavor.
+ * @err: Pointer to store an error code from the check.
+ *
+ * If the specified behaviour is invalid or nothing would occur, we skip the
+ * operation. This function returns true in those cases, otherwise false. In
+ * the former case we store an error code in @err.
+ */
+static bool madvise_should_skip(unsigned long start, size_t len_in,
+ int behavior, int *err)
+{
+ if (!is_valid_madvise(start, len_in, behavior)) {
+ *err = -EINVAL;
+ return true;
+ }
+ if (start + PAGE_ALIGN(len_in) == start) {
+ *err = 0;
+ return true;
+ }
+ return false;
}
-#endif /* CONFIG_ANON_VMA_NAME */
+
+static bool is_madvise_populate(struct madvise_behavior *madv_behavior)
+{
+ switch (madv_behavior->behavior) {
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * untagged_addr_remote() assumes mmap_lock is already held. On
+ * architectures like x86 and RISC-V, tagging is tricky because each
+ * mm may have a different tagging mask. However, we might only hold
+ * the per-VMA lock (currently only local processes are supported),
+ * so untagged_addr is used to avoid the mmap_lock assertion for
+ * local processes.
+ */
+static inline unsigned long get_untagged_addr(struct mm_struct *mm,
+ unsigned long start)
+{
+ return current->mm == mm ? untagged_addr(start) :
+ untagged_addr_remote(mm, start);
+}
+
+static int madvise_do_behavior(unsigned long start, size_t len_in,
+ struct madvise_behavior *madv_behavior)
+{
+ struct blk_plug plug;
+ int error;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+
+ if (is_memory_failure(madv_behavior)) {
+ range->start = start;
+ range->end = start + len_in;
+ return madvise_inject_error(madv_behavior);
+ }
+
+ range->start = get_untagged_addr(madv_behavior->mm, start);
+ range->end = range->start + PAGE_ALIGN(len_in);
+
+ blk_start_plug(&plug);
+ if (is_madvise_populate(madv_behavior))
+ error = madvise_populate(madv_behavior);
+ else
+ error = madvise_walk_vmas(madv_behavior);
+ blk_finish_plug(&plug);
+ return error;
+}
+
/*
* The madvise(2) system call.
*
@@ -1643,63 +1961,23 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
*/
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
- unsigned long end;
int error;
- int write;
- size_t len;
- struct blk_plug plug;
-
- if (!madvise_behavior_valid(behavior))
- return -EINVAL;
-
- if (!PAGE_ALIGNED(start))
- return -EINVAL;
- len = PAGE_ALIGN(len_in);
-
- /* Check to see whether len was rounded up from small -ve to zero */
- if (len_in && !len)
- return -EINVAL;
-
- end = start + len;
- if (end < start)
- return -EINVAL;
-
- if (end == start)
- return 0;
-
-#ifdef CONFIG_MEMORY_FAILURE
- if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
- return madvise_inject_error(behavior, start, start + len_in);
-#endif
-
- write = madvise_need_mmap_write(behavior);
- if (write) {
- if (mmap_write_lock_killable(mm))
- return -EINTR;
- } else {
- mmap_read_lock(mm);
- }
-
- start = untagged_addr_remote(mm, start);
- end = start + len;
-
- blk_start_plug(&plug);
- switch (behavior) {
- case MADV_POPULATE_READ:
- case MADV_POPULATE_WRITE:
- error = madvise_populate(mm, start, end, behavior);
- break;
- default:
- error = madvise_walk_vmas(mm, start, end, behavior,
- madvise_vma_behavior);
- break;
- }
- blk_finish_plug(&plug);
+ struct mmu_gather tlb;
+ struct madvise_behavior madv_behavior = {
+ .mm = mm,
+ .behavior = behavior,
+ .tlb = &tlb,
+ };
- if (write)
- mmap_write_unlock(mm);
- else
- mmap_read_unlock(mm);
+ if (madvise_should_skip(start, len_in, behavior, &error))
+ return error;
+ error = madvise_lock(&madv_behavior);
+ if (error)
+ return error;
+ madvise_init_tlb(&madv_behavior);
+ error = madvise_do_behavior(start, len_in, &madv_behavior);
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(&madv_behavior);
return error;
}
@@ -1715,19 +1993,36 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
{
ssize_t ret = 0;
size_t total_len;
+ struct mmu_gather tlb;
+ struct madvise_behavior madv_behavior = {
+ .mm = mm,
+ .behavior = behavior,
+ .tlb = &tlb,
+ };
total_len = iov_iter_count(iter);
+ ret = madvise_lock(&madv_behavior);
+ if (ret)
+ return ret;
+ madvise_init_tlb(&madv_behavior);
+
while (iov_iter_count(iter)) {
- ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
- iter_iov_len(iter), behavior);
+ unsigned long start = (unsigned long)iter_iov_addr(iter);
+ size_t len_in = iter_iov_len(iter);
+ int error;
+
+ if (madvise_should_skip(start, len_in, behavior, &error))
+ ret = error;
+ else
+ ret = madvise_do_behavior(start, len_in, &madv_behavior);
/*
* An madvise operation is attempting to restart the syscall,
* but we cannot proceed as it would not be correct to repeat
* the operation in aggregate, and would be surprising to the
* user.
*
- * As we have already dropped locks, it is safe to just loop and
+ * We drop and reacquire locks so it is safe to just loop and
* try again. We check for fatal signals in case we need exit
* early anyway.
*/
@@ -1736,13 +2031,24 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
ret = -EINTR;
break;
}
+
+ /* Drop and reacquire lock to unwind race. */
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(&madv_behavior);
+ ret = madvise_lock(&madv_behavior);
+ if (ret)
+ goto out;
+ madvise_init_tlb(&madv_behavior);
continue;
}
if (ret < 0)
break;
iov_iter_advance(iter, iter_iov_len(iter));
}
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(&madv_behavior);
+out:
ret = (total_len - iov_iter_count(iter)) ? : ret;
return ret;
@@ -1811,3 +2117,88 @@ free_iov:
out:
return ret;
}
+
+#ifdef CONFIG_ANON_VMA_NAME
+
+#define ANON_VMA_NAME_MAX_LEN 80
+#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]"
+
+static inline bool is_valid_name_char(char ch)
+{
+ /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */
+ return ch > 0x1f && ch < 0x7f &&
+ !strchr(ANON_VMA_NAME_INVALID_CHARS, ch);
+}
+
+static int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
+ unsigned long len_in, struct anon_vma_name *anon_name)
+{
+ unsigned long end;
+ unsigned long len;
+ int error;
+ struct madvise_behavior madv_behavior = {
+ .mm = mm,
+ .behavior = __MADV_SET_ANON_VMA_NAME,
+ .anon_name = anon_name,
+ };
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+ /* Check to see whether len was rounded up from small -ve to zero */
+ if (len_in && !len)
+ return -EINVAL;
+
+ end = start + len;
+ if (end < start)
+ return -EINVAL;
+
+ if (end == start)
+ return 0;
+
+ madv_behavior.range.start = start;
+ madv_behavior.range.end = end;
+
+ error = madvise_lock(&madv_behavior);
+ if (error)
+ return error;
+ error = madvise_walk_vmas(&madv_behavior);
+ madvise_unlock(&madv_behavior);
+
+ return error;
+}
+
+int set_anon_vma_name(unsigned long addr, unsigned long size,
+ const char __user *uname)
+{
+ struct anon_vma_name *anon_name = NULL;
+ struct mm_struct *mm = current->mm;
+ int error;
+
+ if (uname) {
+ char *name, *pch;
+
+ name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ for (pch = name; *pch != '\0'; pch++) {
+ if (!is_valid_name_char(*pch)) {
+ kfree(name);
+ return -EINVAL;
+ }
+ }
+ /* anon_vma has its own copy */
+ anon_name = anon_vma_name_alloc(name);
+ kfree(name);
+ if (!anon_name)
+ return -ENOMEM;
+ }
+
+ error = madvise_set_anon_name(mm, addr, size, anon_name);
+ anon_vma_name_put(anon_name);
+
+ return error;
+}
+#endif
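set_anon_vma_name() above is reached from prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ...). A hedged userspace sketch of naming an anonymous mapping, not part of this patch (fallback constants are assumptions matching the current linux/prctl.h; the name then shows up as [anon:heap-cache] in /proc/<pid>/maps):

#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_VMA
#define PR_SET_VMA		0x53564d41	/* assumption: matches linux/prctl.h */
#define PR_SET_VMA_ANON_NAME	0
#endif

int main(void)
{
	size_t len = 16 * 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* Requires CONFIG_ANON_VMA_NAME; otherwise the call fails with EINVAL. */
	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		  (unsigned long)p, len, (unsigned long)"heap-cache"))
		perror("PR_SET_VMA_ANON_NAME");
	getchar();	/* pause here and inspect /proc/self/maps */
	munmap(p, len);
	return 0;
}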
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index 2f8829b3541a..c193de6cb23a 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -129,7 +129,7 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
pmd_t pmdval = pmdp_get_lockless(pmd);
/* Do not split a huge pmd, present or migrated */
- if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) {
+ if (pmd_trans_huge(pmdval)) {
WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval));
walk->action = ACTION_CONTINUE;
}
@@ -152,7 +152,7 @@ static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
pud_t pudval = READ_ONCE(*pud);
/* Do not split a huge pud */
- if (pud_trans_huge(pudval) || pud_devmap(pudval)) {
+ if (pud_trans_huge(pudval)) {
WARN_ON(pud_write(pudval) || pud_dirty(pudval));
walk->action = ACTION_CONTINUE;
}
@@ -218,7 +218,7 @@ static void wp_clean_post_vma(struct mm_walk *walk)
static int wp_clean_test_walk(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
- unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags);
+ vm_flags_t vm_flags = READ_ONCE(walk->vma->vm_flags);
/* Skip non-applicable VMAs */
if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) !=
diff --git a/mm/memblock.c b/mm/memblock.c
index 095c18b5c430..117d963e677c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -16,6 +16,12 @@
#include <linux/kmemleak.h>
#include <linux/seq_file.h>
#include <linux/memblock.h>
+#include <linux/mutex.h>
+
+#ifdef CONFIG_KEXEC_HANDOVER
+#include <linux/libfdt.h>
+#include <linux/kexec_handover.h>
+#endif /* CONFIG_KEXEC_HANDOVER */
#include <asm/sections.h>
#include <linux/io.h>
@@ -106,6 +112,13 @@ unsigned long min_low_pfn;
unsigned long max_pfn;
unsigned long long max_possible_pfn;
+#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH
+/* When set to true, only allocate from MEMBLOCK_KHO_SCRATCH ranges */
+static bool kho_scratch_only;
+#else
+#define kho_scratch_only false
+#endif
+
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
@@ -165,6 +178,10 @@ bool __init_memblock memblock_has_mirror(void)
static enum memblock_flags __init_memblock choose_memblock_flags(void)
{
+ /* skip non-scratch memory for kho early boot allocations */
+ if (kho_scratch_only)
+ return MEMBLOCK_KHO_SCRATCH;
+
return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
}
@@ -456,7 +473,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
min(new_area_start, memblock.current_limit),
new_alloc_size, PAGE_SIZE);
- new_array = addr ? __va(addr) : NULL;
+ if (addr) {
+ /* The memory may not have been accepted, yet. */
+ accept_memory(addr, new_alloc_size);
+
+ new_array = __va(addr);
+ } else {
+ new_array = NULL;
+ }
}
if (!addr) {
pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
@@ -491,7 +515,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
* needn't do it
*/
if (!use_slab)
- BUG_ON(memblock_reserve(addr, new_alloc_size));
+ BUG_ON(memblock_reserve_kern(addr, new_alloc_size));
/* Update slab flag */
*in_slab = use_slab;
@@ -641,7 +665,7 @@ repeat:
#ifdef CONFIG_NUMA
WARN_ON(nid != memblock_get_region_node(rgn));
#endif
- WARN_ON(flags != rgn->flags);
+ WARN_ON(flags != MEMBLOCK_NONE && flags != rgn->flags);
nr_new++;
if (insert) {
if (start_rgn == -1)
@@ -756,9 +780,9 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt
}
if ((nr_pages << PAGE_SHIFT) > threshold_bytes) {
- mem_size_mb = memblock_phys_mem_size() >> 20;
+ mem_size_mb = memblock_phys_mem_size() / SZ_1M;
pr_err("NUMA: no nodes coverage for %luMB of %luMB RAM\n",
- (nr_pages << PAGE_SHIFT) >> 20, mem_size_mb);
+ (nr_pages << PAGE_SHIFT) / SZ_1M, mem_size_mb);
return false;
}
@@ -901,14 +925,15 @@ int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
return memblock_remove_range(&memblock.reserved, base, size);
}
-int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+int __init_memblock __memblock_reserve(phys_addr_t base, phys_addr_t size,
+ int nid, enum memblock_flags flags)
{
phys_addr_t end = base + size - 1;
- memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
- &base, &end, (void *)_RET_IP_);
+ memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__,
+ &base, &end, nid, flags, (void *)_RET_IP_);
- return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
+ return memblock_add_range(&memblock.reserved, base, size, nid, flags);
}
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
@@ -923,6 +948,40 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
}
#endif
+#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH
+__init void memblock_set_kho_scratch_only(void)
+{
+ kho_scratch_only = true;
+}
+
+__init void memblock_clear_kho_scratch_only(void)
+{
+ kho_scratch_only = false;
+}
+
+__init void memmap_init_kho_scratch_pages(void)
+{
+ phys_addr_t start, end;
+ unsigned long pfn;
+ int nid;
+ u64 i;
+
+ if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT))
+ return;
+
+ /*
+ * Initialize struct pages for free scratch memory.
+ * The struct pages for reserved scratch memory will be set up in
+ * reserve_bootmem_region()
+ */
+ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
+ MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) {
+ for (pfn = PFN_UP(start); pfn < PFN_DOWN(end); pfn++)
+ init_deferred_page(pfn, nid);
+ }
+}
+#endif
+
/**
* memblock_setclr_flag - set or clear flag for a memory region
* @type: memblock type to set/clear flag for
@@ -1032,13 +1091,20 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
/**
* memblock_reserved_mark_noinit - Mark a reserved memory region with flag
- * MEMBLOCK_RSRV_NOINIT which results in the struct pages not being initialized
- * for this region.
+ * MEMBLOCK_RSRV_NOINIT
+ *
* @base: the base phys addr of the region
* @size: the size of the region
*
- * struct pages will not be initialized for reserved memory regions marked with
- * %MEMBLOCK_RSRV_NOINIT.
+ * The struct pages for the reserved regions marked %MEMBLOCK_RSRV_NOINIT will
+ * not be fully initialized, to allow the caller to optimize their initialization.
+ *
+ * When %CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, setting this flag
+ * completely bypasses the initialization of struct pages for such region.
+ *
+ * When %CONFIG_DEFERRED_STRUCT_PAGE_INIT is disabled, struct pages in this
+ * region will be initialized with default values but won't be marked as
+ * reserved.
*
* Return: 0 on success, -errno on failure.
*/
@@ -1048,6 +1114,36 @@ int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t
MEMBLOCK_RSRV_NOINIT);
}
+/**
+ * memblock_mark_kho_scratch - Mark a memory region as MEMBLOCK_KHO_SCRATCH.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Only memory regions marked with %MEMBLOCK_KHO_SCRATCH will be considered
+ * for allocations during early boot with kexec handover.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+__init int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(&memblock.memory, base, size, 1,
+ MEMBLOCK_KHO_SCRATCH);
+}
+
+/**
+ * memblock_clear_kho_scratch - Clear MEMBLOCK_KHO_SCRATCH flag for a
+ * specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+__init int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(&memblock.memory, base, size, 0,
+ MEMBLOCK_KHO_SCRATCH);
+}
+
static bool should_skip_region(struct memblock_type *type,
struct memblock_region *m,
int nid, int flags)
@@ -1079,6 +1175,13 @@ static bool should_skip_region(struct memblock_type *type,
if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m))
return true;
+ /*
+ * In early alloc during kexec handover, we can only consider
+ * MEMBLOCK_KHO_SCRATCH regions for the allocations
+ */
+ if ((flags & MEMBLOCK_KHO_SCRATCH) && !memblock_is_kho_scratch(m))
+ return true;
+
return false;
}
@@ -1459,14 +1562,14 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
again:
found = memblock_find_in_range_node(size, align, start, end, nid,
flags);
- if (found && !memblock_reserve(found, size))
+ if (found && !__memblock_reserve(found, size, nid, MEMBLOCK_RSRV_KERN))
goto done;
if (numa_valid_node(nid) && !exact_nid) {
found = memblock_find_in_range_node(size, align, start,
end, NUMA_NO_NODE,
flags);
- if (found && !memblock_reserve(found, size))
+ if (found && !memblock_reserve_kern(found, size))
goto done;
}
@@ -1692,6 +1795,26 @@ void * __init memblock_alloc_try_nid(
}
/**
+ * __memblock_alloc_or_panic - Try to allocate memory and panic on failure
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @func: caller func name
+ *
+ * This function attempts to allocate memory using memblock_alloc,
+ * and in case of failure, it calls panic with the formatted message.
+ * This function should not be used directly; use the memblock_alloc_or_panic macro instead.
+ */
+void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align,
+ const char *func)
+{
+ void *addr = memblock_alloc(size, align);
+
+ if (unlikely(!addr))
+ panic("%s: Failed to allocate %pap bytes\n", func, &size);
+ return addr;
+}
+
+/**
* memblock_free_late - free pages directly to buddy allocator
* @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
@@ -1731,6 +1854,28 @@ phys_addr_t __init_memblock memblock_reserved_size(void)
return memblock.reserved.total_size;
}
+phys_addr_t __init_memblock memblock_reserved_kern_size(phys_addr_t limit, int nid)
+{
+ struct memblock_region *r;
+ phys_addr_t total = 0;
+
+ for_each_reserved_mem_region(r) {
+ phys_addr_t size = r->size;
+
+ if (r->base > limit)
+ break;
+
+ if (r->base + r->size > limit)
+ size = limit - r->base;
+
+ if (nid == memblock_get_region_node(r) || !numa_valid_node(nid))
+ if (r->flags & MEMBLOCK_RSRV_KERN)
+ total += size;
+ }
+
+ return total;
+}
+
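
A hypothetical caller, not from this patch, could use the new accessor to sum
kernel-internal (MEMBLOCK_RSRV_KERN) reservations on a node below low memory:

phys_addr_t kern_resv;

/* Kernel-internal reservations on @nid that sit below max_low_pfn. */
kern_resv = memblock_reserved_kern_size(PFN_PHYS(max_low_pfn), nid);
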
/**
* memblock_estimated_nr_free_pages - return estimated number of free pages
* from memblock point of view
@@ -2144,8 +2289,10 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
phys_addr_t end)
{
unsigned long start_pfn = PFN_UP(start);
- unsigned long end_pfn = min_t(unsigned long,
- PFN_DOWN(end), max_low_pfn);
+ unsigned long end_pfn = PFN_DOWN(end);
+
+ if (!IS_ENABLED(CONFIG_HIGHMEM) && end_pfn > max_low_pfn)
+ end_pfn = max_low_pfn;
if (start_pfn >= end_pfn)
return 0;
@@ -2160,11 +2307,14 @@ static void __init memmap_init_reserved_pages(void)
struct memblock_region *region;
phys_addr_t start, end;
int nid;
+ unsigned long max_reserved;
/*
* set nid on all reserved pages and also treat struct
* pages for the NOMAP regions as PageReserved
*/
+repeat:
+ max_reserved = memblock.reserved.max;
for_each_mem_region(region) {
nid = memblock_get_region_node(region);
start = region->base;
@@ -2173,8 +2323,15 @@ static void __init memmap_init_reserved_pages(void)
if (memblock_is_nomap(region))
reserve_bootmem_region(start, end, nid);
- memblock_set_node(start, end, &memblock.reserved, nid);
+ memblock_set_node(start, region->size, &memblock.reserved, nid);
}
+ /*
+ * If 'max' has changed, memblock.reserved has doubled its array, which
+ * may have created a new reserved region before the current 'start'.
+ * Repeat the procedure to set the node id for that region as well.
+ */
+ if (max_reserved != memblock.reserved.max)
+ goto repeat;
/*
* initialize struct pages for reserved regions that don't have
@@ -2249,6 +2406,7 @@ void __init memblock_free_all(void)
free_unused_memmap();
reset_all_zones_managed_pages();
+ memblock_clear_kho_scratch_only();
pages = free_low_memory_core_early();
totalram_pages_add(pages);
}
@@ -2263,6 +2421,7 @@ struct reserve_mem_table {
};
static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES];
static int reserved_mem_count;
+static DEFINE_MUTEX(reserve_mem_lock);
/* Add wildcard region with a lookup name */
static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size,
@@ -2276,6 +2435,21 @@ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size,
strscpy(map->name, name);
}
+static struct reserve_mem_table *reserve_mem_find_by_name_nolock(const char *name)
+{
+ struct reserve_mem_table *map;
+ int i;
+
+ for (i = 0; i < reserved_mem_count; i++) {
+ map = &reserved_mem_table[i];
+ if (!map->size)
+ continue;
+ if (strcmp(name, map->name) == 0)
+ return map;
+ }
+ return NULL;
+}
+
/**
* reserve_mem_find_by_name - Find reserved memory region with a given name
* @name: The name that is attached to a reserved memory region
@@ -2289,21 +2463,229 @@ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size,
int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size)
{
struct reserve_mem_table *map;
- int i;
+
+ guard(mutex)(&reserve_mem_lock);
+ map = reserve_mem_find_by_name_nolock(name);
+ if (!map)
+ return 0;
+
+ *start = map->start;
+ *size = map->size;
+ return 1;
+}
+EXPORT_SYMBOL_GPL(reserve_mem_find_by_name);
+
+/**
+ * reserve_mem_release_by_name - Release reserved memory region with a given name
+ * @name: The name that is attached to a reserved memory region
+ *
+ * Forcibly release the pages in the reserved memory region so that the memory
+ * can be used as free memory. After release, the reserved region size becomes 0.
+ *
+ * Returns: 1 if released or 0 if not found.
+ */
+int reserve_mem_release_by_name(const char *name)
+{
+ char buf[RESERVE_MEM_NAME_SIZE + 12];
+ struct reserve_mem_table *map;
+ void *start, *end;
+
+ guard(mutex)(&reserve_mem_lock);
+ map = reserve_mem_find_by_name_nolock(name);
+ if (!map)
+ return 0;
+
+ start = phys_to_virt(map->start);
+ end = start + map->size - 1;
+ snprintf(buf, sizeof(buf), "reserve_mem:%s", name);
+ free_reserved_area(start, end, 0, buf);
+ map->size = 0;
+
+ return 1;
+}
+
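
A hypothetical consumer, not part of this patch, of a region set up on the kernel
command line with something like reserve_mem=12M:4096:mydata could pair the two
exported lookups as below; the "mydata" name and both functions around it are
made up for illustration.

static phys_addr_t mydata_start, mydata_size;

static int __init mydata_setup(void)
{
        /* Returns 1 and fills start/size when "mydata" was reserved at boot. */
        if (!reserve_mem_find_by_name("mydata", &mydata_start, &mydata_size))
                return -ENODEV;

        pr_info("mydata: %pa bytes at %pa\n", &mydata_size, &mydata_start);
        return 0;
}

static void mydata_free(void)
{
        /* Hands the pages to the buddy allocator; the entry's size becomes 0. */
        reserve_mem_release_by_name("mydata");
}
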
+#ifdef CONFIG_KEXEC_HANDOVER
+#define MEMBLOCK_KHO_FDT "memblock"
+#define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1"
+#define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1"
+static struct page *kho_fdt;
+
+static int reserve_mem_kho_finalize(struct kho_serialization *ser)
+{
+ int err = 0, i;
for (i = 0; i < reserved_mem_count; i++) {
- map = &reserved_mem_table[i];
- if (!map->size)
- continue;
- if (strcmp(name, map->name) == 0) {
- *start = map->start;
- *size = map->size;
- return 1;
- }
+ struct reserve_mem_table *map = &reserved_mem_table[i];
+
+ err |= kho_preserve_phys(map->start, map->size);
}
- return 0;
+
+ err |= kho_preserve_folio(page_folio(kho_fdt));
+ err |= kho_add_subtree(ser, MEMBLOCK_KHO_FDT, page_to_virt(kho_fdt));
+
+ return notifier_from_errno(err);
}
-EXPORT_SYMBOL_GPL(reserve_mem_find_by_name);
+
+static int reserve_mem_kho_notifier(struct notifier_block *self,
+ unsigned long cmd, void *v)
+{
+ switch (cmd) {
+ case KEXEC_KHO_FINALIZE:
+ return reserve_mem_kho_finalize((struct kho_serialization *)v);
+ case KEXEC_KHO_ABORT:
+ return NOTIFY_DONE;
+ default:
+ return NOTIFY_BAD;
+ }
+}
+
+static struct notifier_block reserve_mem_kho_nb = {
+ .notifier_call = reserve_mem_kho_notifier,
+};
+
+static int __init prepare_kho_fdt(void)
+{
+ int err = 0, i;
+ void *fdt;
+
+ kho_fdt = alloc_page(GFP_KERNEL);
+ if (!kho_fdt)
+ return -ENOMEM;
+
+ fdt = page_to_virt(kho_fdt);
+
+ err |= fdt_create(fdt, PAGE_SIZE);
+ err |= fdt_finish_reservemap(fdt);
+
+ err |= fdt_begin_node(fdt, "");
+ err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE);
+ for (i = 0; i < reserved_mem_count; i++) {
+ struct reserve_mem_table *map = &reserved_mem_table[i];
+
+ err |= fdt_begin_node(fdt, map->name);
+ err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE);
+ err |= fdt_property(fdt, "start", &map->start, sizeof(map->start));
+ err |= fdt_property(fdt, "size", &map->size, sizeof(map->size));
+ err |= fdt_end_node(fdt);
+ }
+ err |= fdt_end_node(fdt);
+
+ err |= fdt_finish(fdt);
+
+ if (err) {
+ pr_err("failed to prepare memblock FDT for KHO: %d\n", err);
+ put_page(kho_fdt);
+ kho_fdt = NULL;
+ }
+
+ return err;
+}
+
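
For reference, the FDT serialized above has roughly this shape; the compatible
strings come from the constants defined here, and the node names are the
reserve_mem region names:

/*
 * / {
 *         compatible = "memblock-v1";
 *         <reserved-region-name> {
 *                 compatible = "reserve-mem-v1";
 *                 start = <phys_addr_t value>;
 *                 size = <phys_addr_t value>;
 *         };
 * };
 */
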
+static int __init reserve_mem_init(void)
+{
+ int err;
+
+ if (!kho_is_enabled() || !reserved_mem_count)
+ return 0;
+
+ err = prepare_kho_fdt();
+ if (err)
+ return err;
+
+ err = register_kho_notifier(&reserve_mem_kho_nb);
+ if (err) {
+ put_page(kho_fdt);
+ kho_fdt = NULL;
+ }
+
+ return err;
+}
+late_initcall(reserve_mem_init);
+
+static void *__init reserve_mem_kho_retrieve_fdt(void)
+{
+ phys_addr_t fdt_phys;
+ static void *fdt;
+ int err;
+
+ if (fdt)
+ return fdt;
+
+ err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys);
+ if (err) {
+ if (err != -ENOENT)
+ pr_warn("failed to retrieve FDT '%s' from KHO: %d\n",
+ MEMBLOCK_KHO_FDT, err);
+ return NULL;
+ }
+
+ fdt = phys_to_virt(fdt_phys);
+
+ err = fdt_node_check_compatible(fdt, 0, MEMBLOCK_KHO_NODE_COMPATIBLE);
+ if (err) {
+ pr_warn("FDT '%s' is incompatible with '%s': %d\n",
+ MEMBLOCK_KHO_FDT, MEMBLOCK_KHO_NODE_COMPATIBLE, err);
+ fdt = NULL;
+ }
+
+ return fdt;
+}
+
+static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size,
+ phys_addr_t align)
+{
+ int err, len_start, len_size, offset;
+ const phys_addr_t *p_start, *p_size;
+ const void *fdt;
+
+ fdt = reserve_mem_kho_retrieve_fdt();
+ if (!fdt)
+ return false;
+
+ offset = fdt_subnode_offset(fdt, 0, name);
+ if (offset < 0) {
+ pr_warn("FDT '%s' has no child '%s': %d\n",
+ MEMBLOCK_KHO_FDT, name, offset);
+ return false;
+ }
+ err = fdt_node_check_compatible(fdt, offset, RESERVE_MEM_KHO_NODE_COMPATIBLE);
+ if (err) {
+ pr_warn("Node '%s' is incompatible with '%s': %d\n",
+ name, RESERVE_MEM_KHO_NODE_COMPATIBLE, err);
+ return false;
+ }
+
+ p_start = fdt_getprop(fdt, offset, "start", &len_start);
+ p_size = fdt_getprop(fdt, offset, "size", &len_size);
+ if (!p_start || len_start != sizeof(*p_start) || !p_size ||
+ len_size != sizeof(*p_size)) {
+ return false;
+ }
+
+ if (*p_start & (align - 1)) {
+ pr_warn("KHO reserve-mem '%s' has wrong alignment (0x%lx, 0x%lx)\n",
+ name, (long)align, (long)*p_start);
+ return false;
+ }
+
+ if (*p_size != size) {
+ pr_warn("KHO reserve-mem '%s' has wrong size (0x%lx != 0x%lx)\n",
+ name, (long)*p_size, (long)size);
+ return false;
+ }
+
+ reserved_mem_add(*p_start, size, name);
+ pr_info("Revived memory reservation '%s' from KHO\n", name);
+
+ return true;
+}
+#else
+static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size,
+ phys_addr_t align)
+{
+ return false;
+}
+#endif /* CONFIG_KEXEC_HANDOVER */
/*
* Parse reserve_mem=nn:align:name
@@ -2360,6 +2742,11 @@ static int __init reserve_mem(char *p)
if (reserve_mem_find_by_name(name, &start, &tmp))
return -EBUSY;
+ /* Pick previous allocations up from KHO if available */
+ if (reserve_mem_kho_revive(name, size, align))
+ return 1;
+
+ /* TODO: Allocation must be outside of scratch region */
start = memblock_phys_alloc(size, align);
if (!start)
return -ENOMEM;
@@ -2377,6 +2764,8 @@ static const char * const flagname[] = {
[ilog2(MEMBLOCK_NOMAP)] = "NOMAP",
[ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG",
[ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT",
+ [ilog2(MEMBLOCK_RSRV_KERN)] = "RSV_KERN",
+ [ilog2(MEMBLOCK_KHO_SCRATCH)] = "KHO_SCRATCH",
};
static int memblock_debug_show(struct seq_file *m, void *private)
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index a071fa43d479..4b94731305b9 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -490,6 +490,19 @@ static void mem_cgroup_threshold(struct mem_cgroup *memcg)
}
/* Cgroup1: threshold notifications & softlimit tree updates */
+
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremented by the number of pages. This counter is used
+ * to trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ */
+enum mem_cgroup_events_target {
+ MEM_CGROUP_TARGET_THRESH,
+ MEM_CGROUP_TARGET_SOFTLIMIT,
+ MEM_CGROUP_NTARGETS,
+};
+
struct memcg1_events_percpu {
unsigned long nr_page_events;
unsigned long targets[MEM_CGROUP_NTARGETS];
@@ -499,9 +512,9 @@ static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
{
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
- __count_memcg_events(memcg, PGPGIN, 1);
+ count_memcg_events(memcg, PGPGIN, 1);
else {
- __count_memcg_events(memcg, PGPGOUT, 1);
+ count_memcg_events(memcg, PGPGOUT, 1);
nr_pages = -nr_pages; /* for event */
}
@@ -568,8 +581,59 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
local_irq_restore(flags);
}
-void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg)
+/**
+ * memcg1_swapout - transfer a memsw charge to swap
+ * @folio: folio whose memsw charge to transfer
+ * @entry: swap entry to move the charge to
+ *
+ * Transfer the memsw charge of @folio to @entry.
+ */
+void memcg1_swapout(struct folio *folio, swp_entry_t entry)
{
+ struct mem_cgroup *memcg, *swap_memcg;
+ unsigned int nr_entries;
+
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
+
+ if (mem_cgroup_disabled())
+ return;
+
+ if (!do_memsw_account())
+ return;
+
+ memcg = folio_memcg(folio);
+
+ VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
+ if (!memcg)
+ return;
+
+ /*
+ * In case the memcg owning these pages has been offlined and doesn't
+ * have an ID allocated to it anymore, charge the closest online
+ * ancestor for the swap instead and transfer the memory+swap charge.
+ */
+ swap_memcg = mem_cgroup_id_get_online(memcg);
+ nr_entries = folio_nr_pages(folio);
+ /* Get references for the tail pages, too */
+ if (nr_entries > 1)
+ mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
+ mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
+
+ swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry);
+
+ folio_unqueue_deferred_split(folio);
+ folio->memcg_data = 0;
+
+ if (!mem_cgroup_is_root(memcg))
+ page_counter_uncharge(&memcg->memory, nr_entries);
+
+ if (memcg != swap_memcg) {
+ if (!mem_cgroup_is_root(swap_memcg))
+ page_counter_charge(&swap_memcg->memsw, nr_entries);
+ page_counter_uncharge(&memcg->memsw, nr_entries);
+ }
+
/*
* Interrupts should be disabled here because the caller holds the
* i_pages lock which is taken with interrupts-off. It is
@@ -581,6 +645,42 @@ void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg)
memcg1_charge_statistics(memcg, -folio_nr_pages(folio));
preempt_enable_nested();
memcg1_check_events(memcg, folio_nid(folio));
+
+ css_put(&memcg->css);
+}
+
+/*
+ * memcg1_swapin - uncharge swap slot
+ * @entry: the first swap entry for which the pages are charged
+ * @nr_pages: number of pages which will be uncharged
+ *
+ * Call this function after successfully adding the charged page to swapcache.
+ *
+ * Note: This function assumes the page for which the swap slot is being
+ * uncharged is an order-0 page.
+ */
+void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages)
+{
+ /*
+ * Cgroup1's unified memory+swap counter has been charged with the
+ * new swapcache page, finish the transfer by uncharging the swap
+ * slot. The swap slot would also get uncharged when it dies, but
+ * it can stick around indefinitely and we'd count the page twice
+ * the entire time.
+ *
+ * Cgroup2 has separate resource counters for memory and swap,
+ * so this is a non-issue here. Memory and swap charge lifetimes
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
+ * page to memory here, and uncharge swap when the slot is freed.
+ */
+ if (do_memsw_account()) {
+ /*
+ * The swap entry might not get freed for a long time,
+ * let's not wait for it. The page already received a
+ * memory+swap charge, drop the swap entry duplicate.
+ */
+ mem_cgroup_uncharge_swap(entry, nr_pages);
+ }
}
void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
@@ -589,7 +689,7 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long flags;
local_irq_save(flags);
- __count_memcg_events(memcg, PGPGOUT, pgpgout);
+ count_memcg_events(memcg, PGPGOUT, pgpgout);
__this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
memcg1_check_events(memcg, nid);
local_irq_restore(flags);
@@ -899,7 +999,7 @@ static void memcg_event_remove(struct work_struct *work)
*
* Called with wqh->lock held and interrupts disabled.
*/
-static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
+static int memcg_event_wake(wait_queue_entry_t *wait, unsigned int mode,
int sync, void *key)
{
struct mem_cgroup_event *event =
@@ -1040,13 +1140,13 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
} else if (!strcmp(name, "memory.oom_control")) {
pr_warn_once("oom_control is deprecated and will be removed. "
"Please report your usecase to linux-mm-@kvack.org"
- " if you depend on this functionality. \n");
+ " if you depend on this functionality.\n");
event->register_event = mem_cgroup_oom_register_event;
event->unregister_event = mem_cgroup_oom_unregister_event;
} else if (!strcmp(name, "memory.pressure_level")) {
pr_warn_once("pressure_level is deprecated and will be removed. "
"Please report your usecase to linux-mm-@kvack.org "
- "if you depend on this functionality. \n");
+ "if you depend on this functionality.\n");
event->register_event = vmpressure_register_event;
event->unregister_event = vmpressure_unregister_event;
} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
@@ -1134,8 +1234,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
failed = iter;
mem_cgroup_iter_break(memcg, iter);
break;
- } else
- iter->oom_lock = true;
+ }
+ iter->oom_lock = true;
}
if (failed) {
@@ -1202,7 +1302,7 @@ struct oom_wait_info {
};
static int memcg_oom_wake_function(wait_queue_entry_t *wait,
- unsigned mode, int sync, void *arg)
+ unsigned int mode, int sync, void *arg)
{
struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
struct mem_cgroup *oom_wait_memcg;
@@ -1644,7 +1744,7 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
unsigned long nr = 0;
enum lru_list lru;
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
+ VM_BUG_ON((unsigned int)nid >= nr_node_ids);
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
@@ -1855,9 +1955,11 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
if (val > MAX_SWAPPINESS)
return -EINVAL;
- if (!mem_cgroup_is_root(memcg))
+ if (!mem_cgroup_is_root(memcg)) {
+ pr_info_once("Per memcg swappiness does not exist in cgroup v2. "
+ "See memory.reclaim or memory.swap.max there\n ");
WRITE_ONCE(memcg->swappiness, val);
- else
+ } else
WRITE_ONCE(vm_swappiness, val);
return 0;
@@ -1881,7 +1983,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
pr_warn_once("oom_control is deprecated and will be removed. "
"Please report your usecase to linux-mm-@kvack.org if you "
- "depend on this functionality. \n");
+ "depend on this functionality.\n");
/* cannot set to root cgroup and only 0 and 1 are allowed */
if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
@@ -2096,8 +2198,7 @@ bool memcg1_alloc_events(struct mem_cgroup *memcg)
void memcg1_free_events(struct mem_cgroup *memcg)
{
- if (memcg->events_percpu)
- free_percpu(memcg->events_percpu);
+ free_percpu(memcg->events_percpu);
}
static int __init memcg1_init(void)
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index 144d71b65907..6358464bb416 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -7,21 +7,6 @@
/* Cgroup v1 and v2 common declarations */
-int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages);
-
-static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages)
-{
- if (mem_cgroup_is_root(memcg))
- return 0;
-
- return try_charge_memcg(memcg, gfp_mask, nr_pages);
-}
-
-void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n);
-void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n);
-
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
@@ -37,38 +22,29 @@ void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n);
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-/* Whether legacy memory+swap accounting is active */
-static inline bool do_memsw_account(void)
-{
- return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
-}
-
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremented by the number of pages. This counter is used
- * to trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
- MEM_CGROUP_TARGET_THRESH,
- MEM_CGROUP_TARGET_SOFTLIMIT,
- MEM_CGROUP_NTARGETS,
-};
-
unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
void drain_all_stock(struct mem_cgroup *root_memcg);
unsigned long memcg_events(struct mem_cgroup *memcg, int event);
-unsigned long memcg_events_local(struct mem_cgroup *memcg, int event);
-unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx);
unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item);
-unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item);
int memory_stat_show(struct seq_file *m, void *v);
+void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n);
+struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg);
+
/* Cgroup v1-specific declarations */
#ifdef CONFIG_MEMCG_V1
+/* Whether legacy memory+swap accounting is active */
+static inline bool do_memsw_account(void)
+{
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
+}
+
+unsigned long memcg_events_local(struct mem_cgroup *memcg, int event);
+unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx);
+unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item);
bool memcg1_alloc_events(struct mem_cgroup *memcg);
void memcg1_free_events(struct mem_cgroup *memcg);
@@ -96,7 +72,6 @@ void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked);
void memcg1_oom_recover(struct mem_cgroup *memcg);
void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg);
-void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg);
void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long nr_memory, int nid);
@@ -119,6 +94,7 @@ extern struct cftype mem_cgroup_legacy_files[];
#else /* CONFIG_MEMCG_V1 */
+static inline bool do_memsw_account(void) { return false; }
static inline bool memcg1_alloc_events(struct mem_cgroup *memcg) { return true; }
static inline void memcg1_free_events(struct mem_cgroup *memcg) {}
@@ -134,8 +110,6 @@ static inline void memcg1_oom_recover(struct mem_cgroup *memcg) {}
static inline void memcg1_commit_charge(struct folio *folio,
struct mem_cgroup *memcg) {}
-static inline void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg) {}
-
static inline void memcg1_uncharge_batch(struct mem_cgroup *memcg,
unsigned long pgpgout,
unsigned long nr_memory, int nid) {}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f7c1d9f4f58d..2b9cc91eea11 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
+#include <linux/cpuset.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
@@ -50,7 +51,6 @@
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
-#include <linux/parser.h>
#include <linux/vmpressure.h>
#include <linux/memremap.h>
#include <linux/mm_inline.h>
@@ -95,6 +95,9 @@ static bool cgroup_memory_nokmem __ro_after_init;
/* BPF memory accounting disabled? */
static bool cgroup_memory_nobpf __ro_after_init;
+static struct kmem_cache *memcg_cachep;
+static struct kmem_cache *memcg_pn_cachep;
+
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
@@ -129,8 +132,7 @@ bool mem_cgroup_kmem_disabled(void)
return cgroup_memory_nokmem;
}
-static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
- unsigned int nr_pages);
+static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
static void obj_cgroup_release(struct percpu_ref *ref)
{
@@ -163,8 +165,16 @@ static void obj_cgroup_release(struct percpu_ref *ref)
WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
nr_pages = nr_bytes >> PAGE_SHIFT;
- if (nr_pages)
- obj_cgroup_uncharge_pages(objcg, nr_pages);
+ if (nr_pages) {
+ struct mem_cgroup *memcg;
+
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
+ memcg1_account_kmem(memcg, -nr_pages);
+ if (!mem_cgroup_is_root(memcg))
+ memcg_uncharge(memcg, nr_pages);
+ mem_cgroup_put(memcg);
+ }
spin_lock_irqsave(&objcg_lock, flags);
list_del(&objcg->list);
@@ -315,6 +325,7 @@ static const unsigned int memcg_node_stat_items[] = {
PGDEMOTE_KSWAPD,
PGDEMOTE_DIRECT,
PGDEMOTE_KHUGEPAGED,
+ PGDEMOTE_PROACTIVE,
#ifdef CONFIG_HUGETLB_PAGE
NR_HUGETLB,
#endif
@@ -431,9 +442,11 @@ static const unsigned int memcg_vm_event_stat[] = {
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_KHUGEPAGED,
+ PGSCAN_PROACTIVE,
PGSTEAL_KSWAPD,
PGSTEAL_DIRECT,
PGSTEAL_KHUGEPAGED,
+ PGSTEAL_PROACTIVE,
PGFAULT,
PGMAJFAULT,
PGREFILL,
@@ -489,8 +502,8 @@ struct memcg_vmstats_percpu {
unsigned int stats_updates;
/* Cached pointers for fast iteration in memcg_rstat_updated() */
- struct memcg_vmstats_percpu *parent;
- struct memcg_vmstats *vmstats;
+ struct memcg_vmstats_percpu __percpu *parent_pcpu;
+ struct memcg_vmstats *vmstats;
/* The above should fit a single cacheline for memcg_rstat_updated() */
@@ -517,7 +530,7 @@ struct memcg_vmstats {
unsigned long events_pending[NR_MEMCG_EVENTS];
/* Stats updates since the last flush */
- atomic64_t stats_updates;
+ atomic_t stats_updates;
};
/*
@@ -541,60 +554,41 @@ static u64 flush_last_time;
#define FLUSH_TIME (2UL*HZ)
-/*
- * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
- * not rely on this as part of an acquired spinlock_t lock. These functions are
- * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion
- * is sufficient.
- */
-static void memcg_stats_lock(void)
-{
- preempt_disable_nested();
- VM_WARN_ON_IRQS_ENABLED();
-}
-
-static void __memcg_stats_lock(void)
-{
- preempt_disable_nested();
-}
-
-static void memcg_stats_unlock(void)
-{
- preempt_enable_nested();
-}
-
-
static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
{
- return atomic64_read(&vmstats->stats_updates) >
+ return atomic_read(&vmstats->stats_updates) >
MEMCG_CHARGE_BATCH * num_online_cpus();
}
-static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
+static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
+ int cpu)
{
+ struct memcg_vmstats_percpu __percpu *statc_pcpu;
struct memcg_vmstats_percpu *statc;
- int cpu = smp_processor_id();
unsigned int stats_updates;
if (!val)
return;
- cgroup_rstat_updated(memcg->css.cgroup, cpu);
- statc = this_cpu_ptr(memcg->vmstats_percpu);
- for (; statc; statc = statc->parent) {
- stats_updates = READ_ONCE(statc->stats_updates) + abs(val);
- WRITE_ONCE(statc->stats_updates, stats_updates);
+ css_rstat_updated(&memcg->css, cpu);
+ statc_pcpu = memcg->vmstats_percpu;
+ for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) {
+ statc = this_cpu_ptr(statc_pcpu);
+ /*
+ * If @memcg is already flushable then all its ancestors are
+ * flushable as well and also there is no need to increase
+ * stats_updates.
+ */
+ if (memcg_vmstats_needs_flush(statc->vmstats))
+ break;
+
+ stats_updates = this_cpu_add_return(statc_pcpu->stats_updates,
+ abs(val));
if (stats_updates < MEMCG_CHARGE_BATCH)
continue;
- /*
- * If @memcg is already flush-able, increasing stats_updates is
- * redundant. Avoid the overhead of the atomic update.
- */
- if (!memcg_vmstats_needs_flush(statc->vmstats))
- atomic64_add(stats_updates,
- &statc->vmstats->stats_updates);
- WRITE_ONCE(statc->stats_updates, 0);
+ stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0);
+ atomic_add(stats_updates, &statc->vmstats->stats_updates);
}
}
@@ -602,7 +596,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
{
bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
- trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates),
+ trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates),
force, needs_flush);
if (!force && !needs_flush)
@@ -611,7 +605,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
if (mem_cgroup_is_root(memcg))
WRITE_ONCE(flush_last_time, jiffies_64);
- cgroup_rstat_flush(memcg->css.cgroup);
+ css_rstat_flush(&memcg->css);
}
/*
@@ -684,15 +678,16 @@ static int memcg_state_val_in_pages(int idx, int val)
}
/**
- * __mod_memcg_state - update cgroup memory statistics
+ * mod_memcg_state - update cgroup memory statistics
* @memcg: the memory cgroup
* @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
* @val: delta to add to the counter, can be negative
*/
-void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
+void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
int val)
{
int i = memcg_stats_index(idx);
+ int cpu;
if (mem_cgroup_disabled())
return;
@@ -700,12 +695,17 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
- __this_cpu_add(memcg->vmstats_percpu->state[i], val);
+ cpu = get_cpu();
+
+ this_cpu_add(memcg->vmstats_percpu->state[i], val);
val = memcg_state_val_in_pages(idx, val);
- memcg_rstat_updated(memcg, val);
+ memcg_rstat_updated(memcg, val, cpu);
trace_mod_memcg_state(memcg, idx, val);
+
+ put_cpu();
}
+#ifdef CONFIG_MEMCG_V1
/* idx can be of type enum memcg_stat_item or node_stat_item. */
unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
@@ -722,14 +722,16 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
#endif
return x;
}
+#endif
-static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
+static void mod_memcg_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx,
int val)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
int i = memcg_stats_index(idx);
+ int cpu;
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
@@ -737,35 +739,19 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
- /*
- * The caller from rmap relies on disabled preemption because they never
- * update their counter from in-interrupt context. For these two
- * counters we check that the update is never performed from an
- * interrupt context while other caller need to have disabled interrupt.
- */
- __memcg_stats_lock();
- if (IS_ENABLED(CONFIG_DEBUG_VM)) {
- switch (idx) {
- case NR_ANON_MAPPED:
- case NR_FILE_MAPPED:
- case NR_ANON_THPS:
- WARN_ON_ONCE(!in_task());
- break;
- default:
- VM_WARN_ON_IRQS_ENABLED();
- }
- }
+ cpu = get_cpu();
/* Update memcg */
- __this_cpu_add(memcg->vmstats_percpu->state[i], val);
+ this_cpu_add(memcg->vmstats_percpu->state[i], val);
/* Update lruvec */
- __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
+ this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
val = memcg_state_val_in_pages(idx, val);
- memcg_rstat_updated(memcg, val);
+ memcg_rstat_updated(memcg, val, cpu);
trace_mod_memcg_lruvec_state(memcg, idx, val);
- memcg_stats_unlock();
+
+ put_cpu();
}
/**
@@ -786,7 +772,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update memcg and lruvec */
if (!mem_cgroup_disabled())
- __mod_memcg_lruvec_state(lruvec, idx, val);
+ mod_memcg_lruvec_state(lruvec, idx, val);
}
void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
@@ -836,15 +822,16 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
}
/**
- * __count_memcg_events - account VM events in a cgroup
+ * count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
* @idx: the event item
* @count: the number of events that occurred
*/
-void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
unsigned long count)
{
int i = memcg_events_index(idx);
+ int cpu;
if (mem_cgroup_disabled())
return;
@@ -852,11 +839,13 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
- memcg_stats_lock();
- __this_cpu_add(memcg->vmstats_percpu->events[i], count);
- memcg_rstat_updated(memcg, count);
+ cpu = get_cpu();
+
+ this_cpu_add(memcg->vmstats_percpu->events[i], count);
+ memcg_rstat_updated(memcg, count, cpu);
trace_count_memcg_events(memcg, idx, count);
- memcg_stats_unlock();
+
+ put_cpu();
}
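
A sketch of the call pattern after the rename: callers use the un-prefixed helpers
directly and no longer manage IRQ or preemption state themselves. MEMCG_SOCK and
PGMAJFAULT are just sample stat/event items, not callers introduced by this patch.

mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);   /* hypothetical page delta */
count_memcg_events(memcg, PGMAJFAULT, 1);       /* one major fault in @memcg */
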
unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -869,6 +858,7 @@ unsigned long memcg_events(struct mem_cgroup *memcg, int event)
return READ_ONCE(memcg->vmstats->events[i]);
}
+#ifdef CONFIG_MEMCG_V1
unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
int i = memcg_events_index(event);
@@ -878,6 +868,7 @@ unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
return READ_ONCE(memcg->vmstats->events_local[i]);
}
+#endif
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
@@ -1161,7 +1152,6 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
{
struct mem_cgroup *iter;
int ret = 0;
- int i = 0;
BUG_ON(mem_cgroup_is_root(memcg));
@@ -1171,10 +1161,9 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
while (!ret && (task = css_task_iter_next(&it))) {
- /* Avoid potential softlockup warning */
- if ((++i & 1023) == 0)
- cond_resched();
ret = fn(task, arg);
+ /* Avoid potential softlockup warning */
+ cond_resched();
}
css_task_iter_end(&it);
if (ret) {
@@ -1390,6 +1379,7 @@ static const struct memory_stat memory_stats[] = {
{ "pgdemote_kswapd", PGDEMOTE_KSWAPD },
{ "pgdemote_direct", PGDEMOTE_DIRECT },
{ "pgdemote_khugepaged", PGDEMOTE_KHUGEPAGED },
+ { "pgdemote_proactive", PGDEMOTE_PROACTIVE },
#ifdef CONFIG_NUMA_BALANCING
{ "pgpromote_success", PGPROMOTE_SUCCESS },
#endif
@@ -1432,6 +1422,7 @@ static int memcg_page_state_output_unit(int item)
case PGDEMOTE_KSWAPD:
case PGDEMOTE_DIRECT:
case PGDEMOTE_KHUGEPAGED:
+ case PGDEMOTE_PROACTIVE:
#ifdef CONFIG_NUMA_BALANCING
case PGPROMOTE_SUCCESS:
#endif
@@ -1447,11 +1438,25 @@ unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item)
memcg_page_state_output_unit(item);
}
+#ifdef CONFIG_MEMCG_V1
unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item)
{
return memcg_page_state_local(memcg, item) *
memcg_page_state_output_unit(item);
}
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+static bool memcg_accounts_hugetlb(void)
+{
+ return cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+}
+#else /* CONFIG_HUGETLB_PAGE */
+static bool memcg_accounts_hugetlb(void)
+{
+ return false;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
@@ -1474,7 +1479,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
#ifdef CONFIG_HUGETLB_PAGE
if (unlikely(memory_stats[i].idx == NR_HUGETLB) &&
- !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
+ !memcg_accounts_hugetlb())
continue;
#endif
size = memcg_page_state_output(memcg, memory_stats[i].idx);
@@ -1491,10 +1496,12 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
seq_buf_printf(s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
memcg_events(memcg, PGSCAN_DIRECT) +
+ memcg_events(memcg, PGSCAN_PROACTIVE) +
memcg_events(memcg, PGSCAN_KHUGEPAGED));
seq_buf_printf(s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
memcg_events(memcg, PGSTEAL_DIRECT) +
+ memcg_events(memcg, PGSTEAL_PROACTIVE) +
memcg_events(memcg, PGSTEAL_KHUGEPAGED));
for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
@@ -1554,16 +1561,23 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
/* Use static buffer, for the caller is holding oom_lock. */
static char buf[SEQ_BUF_SIZE];
struct seq_buf s;
+ unsigned long memory_failcnt;
lockdep_assert_held(&oom_lock);
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ memory_failcnt = atomic_long_read(&memcg->memory_events[MEMCG_MAX]);
+ else
+ memory_failcnt = memcg->memory.failcnt;
+
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
- K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
+ K((u64)READ_ONCE(memcg->memory.max)), memory_failcnt);
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->swap)),
- K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
+ K((u64)READ_ONCE(memcg->swap.max)),
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
#ifdef CONFIG_MEMCG_V1
else {
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
@@ -1632,7 +1646,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
- ret = task_is_dying() || out_of_memory(&oc);
+ ret = out_of_memory(&oc);
unlock:
mutex_unlock(&oom_lock);
@@ -1726,28 +1740,45 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
pr_cont(" are going to be killed due to memory.oom.group set\n");
}
+/*
+ * The value of NR_MEMCG_STOCK is selected to keep the cached memcgs and their
+ * nr_pages in a single cacheline. This may change in future.
+ */
+#define NR_MEMCG_STOCK 7
+#define FLUSHING_CACHED_CHARGE 0
struct memcg_stock_pcp {
- local_lock_t stock_lock;
- struct mem_cgroup *cached; /* this never be root cgroup */
- unsigned int nr_pages;
+ local_trylock_t lock;
+ uint8_t nr_pages[NR_MEMCG_STOCK];
+ struct mem_cgroup *cached[NR_MEMCG_STOCK];
+
+ struct work_struct work;
+ unsigned long flags;
+};
+static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
+ .lock = INIT_LOCAL_TRYLOCK(lock),
+};
+
+struct obj_stock_pcp {
+ local_trylock_t lock;
+ unsigned int nr_bytes;
struct obj_cgroup *cached_objcg;
struct pglist_data *cached_pgdat;
- unsigned int nr_bytes;
int nr_slab_reclaimable_b;
int nr_slab_unreclaimable_b;
struct work_struct work;
unsigned long flags;
-#define FLUSHING_CACHED_CHARGE 0
};
-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
- .stock_lock = INIT_LOCAL_LOCK(stock_lock),
+
+static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = {
+ .lock = INIT_LOCAL_TRYLOCK(lock),
};
+
static DEFINE_MUTEX(percpu_charge_mutex);
-static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+static void drain_obj_stock(struct obj_stock_pcp *stock);
+static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
struct mem_cgroup *root_memcg);
/**
@@ -1755,110 +1786,188 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
* @memcg: memcg to consume from.
* @nr_pages: how many pages to charge.
*
- * The charges will only happen if @memcg matches the current cpu's memcg
- * stock, and at least @nr_pages are available in that stock. Failure to
- * service an allocation will refill the stock.
+ * Consume the cached charge if enough nr_pages are present, otherwise return
+ * failure. Also return failure for a charge request larger than
+ * MEMCG_CHARGE_BATCH or if the local lock is already taken.
*
* returns true if successful, false otherwise.
*/
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
- unsigned int stock_pages;
- unsigned long flags;
+ uint8_t stock_pages;
bool ret = false;
+ int i;
- if (nr_pages > MEMCG_CHARGE_BATCH)
+ if (nr_pages > MEMCG_CHARGE_BATCH ||
+ !local_trylock(&memcg_stock.lock))
return ret;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
-
stock = this_cpu_ptr(&memcg_stock);
- stock_pages = READ_ONCE(stock->nr_pages);
- if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {
- WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages);
- ret = true;
+
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ if (memcg != READ_ONCE(stock->cached[i]))
+ continue;
+
+ stock_pages = READ_ONCE(stock->nr_pages[i]);
+ if (stock_pages >= nr_pages) {
+ WRITE_ONCE(stock->nr_pages[i], stock_pages - nr_pages);
+ ret = true;
+ }
+ break;
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock(&memcg_stock.lock);
return ret;
}
+static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+}
+
/*
* Returns stocks cached in percpu and reset cached information.
*/
-static void drain_stock(struct memcg_stock_pcp *stock)
+static void drain_stock(struct memcg_stock_pcp *stock, int i)
{
- unsigned int stock_pages = READ_ONCE(stock->nr_pages);
- struct mem_cgroup *old = READ_ONCE(stock->cached);
+ struct mem_cgroup *old = READ_ONCE(stock->cached[i]);
+ uint8_t stock_pages;
if (!old)
return;
+ stock_pages = READ_ONCE(stock->nr_pages[i]);
if (stock_pages) {
- page_counter_uncharge(&old->memory, stock_pages);
- if (do_memsw_account())
- page_counter_uncharge(&old->memsw, stock_pages);
-
- WRITE_ONCE(stock->nr_pages, 0);
+ memcg_uncharge(old, stock_pages);
+ WRITE_ONCE(stock->nr_pages[i], 0);
}
css_put(&old->css);
- WRITE_ONCE(stock->cached, NULL);
+ WRITE_ONCE(stock->cached[i], NULL);
+}
+
+static void drain_stock_fully(struct memcg_stock_pcp *stock)
+{
+ int i;
+
+ for (i = 0; i < NR_MEMCG_STOCK; ++i)
+ drain_stock(stock, i);
}
-static void drain_local_stock(struct work_struct *dummy)
+static void drain_local_memcg_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
- /*
- * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
- * drain_stock races is that we always operate on local CPU stock
- * here with IRQ disabled
- */
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (WARN_ONCE(!in_task(), "drain in non-task context"))
+ return;
+
+ local_lock(&memcg_stock.lock);
stock = this_cpu_ptr(&memcg_stock);
- old = drain_obj_stock(stock);
- drain_stock(stock);
+ drain_stock_fully(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
+ local_unlock(&memcg_stock.lock);
}
-/*
- * Cache charges(val) to local per_cpu area.
- * This will be consumed by consume_stock() function, later.
- */
-static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void drain_local_obj_stock(struct work_struct *dummy)
+{
+ struct obj_stock_pcp *stock;
+
+ if (WARN_ONCE(!in_task(), "drain in non-task context"))
+ return;
+
+ local_lock(&obj_stock.lock);
+
+ stock = this_cpu_ptr(&obj_stock);
+ drain_obj_stock(stock);
+ clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
+
+ local_unlock(&obj_stock.lock);
+}
+
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
- unsigned int stock_pages;
+ struct mem_cgroup *cached;
+ uint8_t stock_pages;
+ bool success = false;
+ int empty_slot = -1;
+ int i;
+
+ /*
+ * For now limit MEMCG_CHARGE_BATCH to 127 or less. If we ever decide to
+ * increase it beyond 127, we will need more careful handling of
+ * nr_pages[] in struct memcg_stock_pcp.
+ */
+ BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S8_MAX);
+
+ VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));
+
+ if (nr_pages > MEMCG_CHARGE_BATCH ||
+ !local_trylock(&memcg_stock.lock)) {
+ /*
+ * In case of a larger-than-batch refill or an unlikely failure to
+ * lock the percpu memcg_stock.lock, uncharge the memcg directly.
+ */
+ memcg_uncharge(memcg, nr_pages);
+ return;
+ }
stock = this_cpu_ptr(&memcg_stock);
- if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
- drain_stock(stock);
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ cached = READ_ONCE(stock->cached[i]);
+ if (!cached && empty_slot == -1)
+ empty_slot = i;
+ if (memcg == READ_ONCE(stock->cached[i])) {
+ stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages;
+ WRITE_ONCE(stock->nr_pages[i], stock_pages);
+ if (stock_pages > MEMCG_CHARGE_BATCH)
+ drain_stock(stock, i);
+ success = true;
+ break;
+ }
+ }
+
+ if (!success) {
+ i = empty_slot;
+ if (i == -1) {
+ i = get_random_u32_below(NR_MEMCG_STOCK);
+ drain_stock(stock, i);
+ }
css_get(&memcg->css);
- WRITE_ONCE(stock->cached, memcg);
+ WRITE_ONCE(stock->cached[i], memcg);
+ WRITE_ONCE(stock->nr_pages[i], nr_pages);
}
- stock_pages = READ_ONCE(stock->nr_pages) + nr_pages;
- WRITE_ONCE(stock->nr_pages, stock_pages);
- if (stock_pages > MEMCG_CHARGE_BATCH)
- drain_stock(stock);
+ local_unlock(&memcg_stock.lock);
}
-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
{
- unsigned long flags;
+ struct mem_cgroup *memcg;
+ bool flush = false;
+ int i;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
- __refill_stock(memcg, nr_pages);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ rcu_read_lock();
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ memcg = READ_ONCE(stock->cached[i]);
+ if (!memcg)
+ continue;
+
+ if (READ_ONCE(stock->nr_pages[i]) &&
+ mem_cgroup_is_descendant(memcg, root_memcg)) {
+ flush = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return flush;
}
/*
@@ -1881,25 +1990,27 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
migrate_disable();
curcpu = smp_processor_id();
for_each_online_cpu(cpu) {
- struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
- struct mem_cgroup *memcg;
- bool flush = false;
+ struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu);
+ struct obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu);
- rcu_read_lock();
- memcg = READ_ONCE(stock->cached);
- if (memcg && READ_ONCE(stock->nr_pages) &&
- mem_cgroup_is_descendant(memcg, root_memcg))
- flush = true;
- else if (obj_stock_flush_required(stock, root_memcg))
- flush = true;
- rcu_read_unlock();
+ if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) &&
+ is_memcg_drain_needed(memcg_st, root_memcg) &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE,
+ &memcg_st->flags)) {
+ if (cpu == curcpu)
+ drain_local_memcg_stock(&memcg_st->work);
+ else if (!cpu_is_isolated(cpu))
+ schedule_work_on(cpu, &memcg_st->work);
+ }
- if (flush &&
- !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) &&
+ obj_stock_flush_required(obj_st, root_memcg) &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE,
+ &obj_st->flags)) {
if (cpu == curcpu)
- drain_local_stock(&stock->work);
+ drain_local_obj_stock(&obj_st->work);
else if (!cpu_is_isolated(cpu))
- schedule_work_on(cpu, &stock->work);
+ schedule_work_on(cpu, &obj_st->work);
}
}
migrate_enable();
@@ -1908,10 +2019,9 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
- struct memcg_stock_pcp *stock;
-
- stock = &per_cpu(memcg_stock, cpu);
- drain_stock(stock);
+ /* no need for the local lock */
+ drain_obj_stock(&per_cpu(obj_stock, cpu));
+ drain_stock_fully(&per_cpu(memcg_stock, cpu));
return 0;
}
@@ -2186,8 +2296,8 @@ out:
css_put(&memcg->css);
}
-int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages)
+static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages)
{
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MAX_RECLAIM_RETRIES;
@@ -2199,11 +2309,16 @@ int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
bool drained = false;
bool raised_max_event = false;
unsigned long pflags;
+ bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
retry:
if (consume_stock(memcg, nr_pages))
return 0;
+ if (!allow_spinning)
+ /* Avoid the refill and flush of the older stock */
+ batch = nr_pages;
+
if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
@@ -2236,7 +2351,7 @@ retry:
if (!gfpflags_allow_blocking(gfp_mask))
goto nomem;
- memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ __memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
raised_max_event = true;
psi_memstall_enter(&pflags);
@@ -2303,7 +2418,7 @@ force:
* a MEMCG_MAX event.
*/
if (!raised_max_event)
- memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ __memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
/*
* The allocation either can't fail or will lead to more memory
@@ -2376,19 +2491,13 @@ done_restock:
return 0;
}
-/**
- * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call.
- * @memcg: memcg previously charged.
- * @nr_pages: number of pages previously charged.
- */
-void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
+static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages)
{
if (mem_cgroup_is_root(memcg))
- return;
+ return 0;
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
+ return try_charge_memcg(memcg, gfp_mask, nr_pages);
}
static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
@@ -2404,29 +2513,48 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
folio->memcg_data = (unsigned long)memcg;
}
-/**
- * mem_cgroup_commit_charge - commit a previously successful try_charge().
- * @folio: folio to commit the charge to.
- * @memcg: memcg previously charged.
- */
-void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
{
- css_get(&memcg->css);
- commit_charge(folio, memcg);
- memcg1_commit_charge(folio, memcg);
+ struct lruvec *lruvec;
+
+ if (likely(!in_nmi())) {
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+ } else {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id];
+
+ /* preemption is disabled in_nmi(). */
+ css_rstat_updated(&memcg->css, smp_processor_id());
+ if (idx == NR_SLAB_RECLAIMABLE_B)
+ atomic_add(nr, &pn->slab_reclaimable);
+ else
+ atomic_add(nr, &pn->slab_unreclaimable);
+ }
}
+#else
+static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+}
+#endif
-static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
+static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
struct mem_cgroup *memcg;
- struct lruvec *lruvec;
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- __mod_memcg_lruvec_state(lruvec, idx, nr);
+ account_slab_nmi_safe(memcg, pgdat, idx, nr);
rcu_read_unlock();
}
@@ -2551,6 +2679,9 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
+ if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi())
+ return NULL;
+
if (in_task()) {
memcg = current->active_memcg;
if (unlikely(memcg))
@@ -2613,6 +2744,24 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
return objcg;
}
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+{
+ if (likely(!in_nmi())) {
+ mod_memcg_state(memcg, MEMCG_KMEM, val);
+ } else {
+ /* preemption is disabled in_nmi(). */
+ css_rstat_updated(&memcg->css, smp_processor_id());
+ atomic_add(val, &memcg->kmem_stat);
+ }
+}
+#else
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+{
+ mod_memcg_state(memcg, MEMCG_KMEM, val);
+}
+#endif
+
/*
* obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
* @objcg: object cgroup to uncharge
@@ -2625,9 +2774,10 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
memcg = get_mem_cgroup_from_objcg(objcg);
- mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
+ account_kmem_nmi_safe(memcg, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
- refill_stock(memcg, nr_pages);
+ if (!mem_cgroup_is_root(memcg))
+ refill_stock(memcg, nr_pages);
css_put(&memcg->css);
}
@@ -2652,7 +2802,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
if (ret)
goto out;
- mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
+ account_kmem_nmi_safe(memcg, nr_pages);
memcg1_account_kmem(memcg, nr_pages);
out:
css_put(&memcg->css);
@@ -2660,6 +2810,23 @@ out:
return ret;
}
+static struct obj_cgroup *page_objcg(const struct page *page)
+{
+ unsigned long memcg_data = page->memcg_data;
+
+ if (mem_cgroup_disabled() || !memcg_data)
+ return NULL;
+
+ VM_BUG_ON_PAGE((memcg_data & OBJEXTS_FLAGS_MASK) != MEMCG_DATA_KMEM,
+ page);
+ return (struct obj_cgroup *)(memcg_data - MEMCG_DATA_KMEM);
+}
+
+static void page_set_objcg(struct page *page, const struct obj_cgroup *objcg)
+{
+ page->memcg_data = (unsigned long)objcg | MEMCG_DATA_KMEM;
+}
+
/**
* __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
* @page: page to charge
@@ -2678,8 +2845,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
if (!ret) {
obj_cgroup_get(objcg);
- page->memcg_data = (unsigned long)objcg |
- MEMCG_DATA_KMEM;
+ page_set_objcg(page, objcg);
return 0;
}
}
@@ -2693,53 +2859,38 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
*/
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
- struct folio *folio = page_folio(page);
- struct obj_cgroup *objcg;
+ struct obj_cgroup *objcg = page_objcg(page);
unsigned int nr_pages = 1 << order;
- if (!folio_memcg_kmem(folio))
+ if (!objcg)
return;
- objcg = __folio_objcg(folio);
obj_cgroup_uncharge_pages(objcg, nr_pages);
- folio->memcg_data = 0;
+ page->memcg_data = 0;
obj_cgroup_put(objcg);
}
-static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
- enum node_stat_item idx, int nr)
+static void __account_obj_stock(struct obj_cgroup *objcg,
+ struct obj_stock_pcp *stock, int nr,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
int *bytes;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
- stock = this_cpu_ptr(&memcg_stock);
-
/*
* Save vmstat data in stock and skip vmstat array update unless
- * accumulating over a page of vmstat data or when pgdat or idx
- * changes.
+ * accumulating over a page of vmstat data or when pgdat changes.
*/
- if (READ_ONCE(stock->cached_objcg) != objcg) {
- old = drain_obj_stock(stock);
- obj_cgroup_get(objcg);
- stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
- ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
- WRITE_ONCE(stock->cached_objcg, objcg);
- stock->cached_pgdat = pgdat;
- } else if (stock->cached_pgdat != pgdat) {
+ if (stock->cached_pgdat != pgdat) {
/* Flush the existing cached vmstat data */
struct pglist_data *oldpg = stock->cached_pgdat;
if (stock->nr_slab_reclaimable_b) {
- __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
+ mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
+ mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
}
@@ -2765,37 +2916,38 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
}
}
if (nr)
- __mod_objcg_mlstate(objcg, pgdat, idx, nr);
-
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
+ mod_objcg_mlstate(objcg, pgdat, idx, nr);
}
-static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- unsigned long flags;
+ struct obj_stock_pcp *stock;
bool ret = false;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!local_trylock(&obj_stock.lock))
+ return ret;
- stock = this_cpu_ptr(&memcg_stock);
+ stock = this_cpu_ptr(&obj_stock);
if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
+
+ if (pgdat)
+ __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx);
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock(&obj_stock.lock);
return ret;
}
-static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
+static void drain_obj_stock(struct obj_stock_pcp *stock)
{
struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
if (!old)
- return NULL;
+ return;
if (stock->nr_bytes) {
unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
@@ -2808,7 +2960,8 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
- __refill_stock(memcg, nr_pages);
+ if (!mem_cgroup_is_root(memcg))
+ memcg_uncharge(memcg, nr_pages);
css_put(&memcg->css);
}
@@ -2832,13 +2985,13 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
*/
if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
if (stock->nr_slab_reclaimable_b) {
- __mod_objcg_mlstate(old, stock->cached_pgdat,
+ mod_objcg_mlstate(old, stock->cached_pgdat,
NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- __mod_objcg_mlstate(old, stock->cached_pgdat,
+ mod_objcg_mlstate(old, stock->cached_pgdat,
NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
@@ -2847,67 +3000,76 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
}
WRITE_ONCE(stock->cached_objcg, NULL);
- /*
- * The `old' objects needs to be released by the caller via
- * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
- */
- return old;
+ obj_cgroup_put(old);
}
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
struct mem_cgroup *root_memcg)
{
struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
struct mem_cgroup *memcg;
+ bool flush = false;
+ rcu_read_lock();
if (objcg) {
memcg = obj_cgroup_memcg(objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
- return true;
+ flush = true;
}
+ rcu_read_unlock();
- return false;
+ return flush;
}
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
- bool allow_uncharge)
+ bool allow_uncharge, int nr_acct, struct pglist_data *pgdat,
+ enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
+ struct obj_stock_pcp *stock;
unsigned int nr_pages = 0;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!local_trylock(&obj_stock.lock)) {
+ if (pgdat)
+ mod_objcg_mlstate(objcg, pgdat, idx, nr_bytes);
+ nr_pages = nr_bytes >> PAGE_SHIFT;
+ nr_bytes = nr_bytes & (PAGE_SIZE - 1);
+ atomic_add(nr_bytes, &objcg->nr_charged_bytes);
+ goto out;
+ }
- stock = this_cpu_ptr(&memcg_stock);
+ stock = this_cpu_ptr(&obj_stock);
if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
- old = drain_obj_stock(stock);
+ drain_obj_stock(stock);
obj_cgroup_get(objcg);
- WRITE_ONCE(stock->cached_objcg, objcg);
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ WRITE_ONCE(stock->cached_objcg, objcg);
+
allow_uncharge = true; /* Allow uncharge when objcg changes */
}
stock->nr_bytes += nr_bytes;
+ if (pgdat)
+ __account_obj_stock(objcg, stock, nr_acct, pgdat, idx);
+
if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
nr_pages = stock->nr_bytes >> PAGE_SHIFT;
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
-
+ local_unlock(&obj_stock.lock);
+out:
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
}
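
For illustration only (not part of the patch): the local_trylock() failure path above splits the refilled bytes into whole pages, which are uncharged right away, and a sub-page remainder, which is parked in objcg->nr_charged_bytes. A minimal standalone sketch of that arithmetic, assuming the common PAGE_SHIFT of 12 and an arbitrary example value:

#include <stdio.h>

#define PAGE_SHIFT 12                    /* assumption: 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long nr_bytes = 10000;  /* example: bytes being refilled */
	unsigned long nr_pages = nr_bytes >> PAGE_SHIFT;     /* whole pages -> uncharge path */
	unsigned long rest     = nr_bytes & (PAGE_SIZE - 1); /* remainder -> nr_charged_bytes */

	/* prints "2 pages uncharged, 1808 bytes deferred" */
	printf("%lu pages uncharged, %lu bytes deferred\n", nr_pages, rest);
	return 0;
}
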
-int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
unsigned int nr_pages, nr_bytes;
int ret;
- if (consume_obj_stock(objcg, size))
+ if (likely(consume_obj_stock(objcg, size, pgdat, idx)))
return 0;
/*
@@ -2940,15 +3102,21 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
nr_pages += 1;
ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
- if (!ret && nr_bytes)
- refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
+ if (!ret && (nr_bytes || pgdat))
+ refill_obj_stock(objcg, nr_bytes ? PAGE_SIZE - nr_bytes : 0,
+ false, size, pgdat, idx);
return ret;
}
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+{
+ return obj_cgroup_charge_account(objcg, gfp, size, NULL, 0);
+}
+
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
- refill_obj_stock(objcg, size, true);
+ refill_obj_stock(objcg, size, true, 0, NULL, 0);
}
static inline size_t obj_full_size(struct kmem_cache *s)
@@ -3000,23 +3168,32 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
return false;
}
- if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s)))
- return false;
-
for (i = 0; i < size; i++) {
slab = virt_to_slab(p[i]);
if (!slab_obj_exts(slab) &&
alloc_slab_obj_exts(slab, s, flags, false)) {
- obj_cgroup_uncharge(objcg, obj_full_size(s));
continue;
}
+ /*
+ * if we fail and size is 1, memcg_alloc_abort_single() will
+ * just free the object, which is ok as we have not assigned
+ * objcg to its obj_ext yet
+ *
+ * for larger sizes, kmem_cache_free_bulk() will uncharge
+ * any objects that were already charged and obj_ext assigned
+ *
+ * TODO: we could batch this until slab_pgdat(slab) changes
+ * between iterations, with a more complicated undo
+ */
+ if (obj_cgroup_charge_account(objcg, flags, obj_full_size(s),
+ slab_pgdat(slab), cache_vmstat_idx(s)))
+ return false;
+
off = obj_to_index(s, slab, p[i]);
obj_cgroup_get(objcg);
slab_obj_exts(slab)[off].objcg = objcg;
- mod_objcg_state(objcg, slab_pgdat(slab),
- cache_vmstat_idx(s), obj_full_size(s));
}
return true;
@@ -3025,6 +3202,8 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects, struct slabobj_ext *obj_exts)
{
+ size_t obj_size = obj_full_size(s);
+
for (int i = 0; i < objects; i++) {
struct obj_cgroup *objcg;
unsigned int off;
@@ -3035,33 +3214,40 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
continue;
obj_exts[off].objcg = NULL;
- obj_cgroup_uncharge(objcg, obj_full_size(s));
- mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
- -obj_full_size(s));
+ refill_obj_stock(objcg, obj_size, true, -obj_size,
+ slab_pgdat(slab), cache_vmstat_idx(s));
obj_cgroup_put(objcg);
}
}
/*
- * Because folio_memcg(head) is not set on tails, set it now.
+ * The objcg is only set on the first page, so transfer it to all the
+ * other pages.
*/
-void split_page_memcg(struct page *head, int old_order, int new_order)
+void split_page_memcg(struct page *page, unsigned order)
{
- struct folio *folio = page_folio(head);
- int i;
- unsigned int old_nr = 1 << old_order;
- unsigned int new_nr = 1 << new_order;
+ struct obj_cgroup *objcg = page_objcg(page);
+ unsigned int i, nr = 1 << order;
- if (mem_cgroup_disabled() || !folio_memcg_charged(folio))
+ if (!objcg)
return;
- for (i = new_nr; i < old_nr; i += new_nr)
- folio_page(folio, i)->memcg_data = folio->memcg_data;
+ for (i = 1; i < nr; i++)
+ page_set_objcg(&page[i], objcg);
- if (folio_memcg_kmem(folio))
- obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1);
- else
- css_get_many(&folio_memcg(folio)->css, old_nr / new_nr - 1);
+ obj_cgroup_get_many(objcg, nr - 1);
+}
+
+void folio_split_memcg_refs(struct folio *folio, unsigned old_order,
+ unsigned new_order)
+{
+ unsigned new_refs;
+
+ if (mem_cgroup_disabled() || !folio_memcg_charged(folio))
+ return;
+
+ new_refs = (1 << (old_order - new_order)) - 1;
+ css_get_many(&__folio_memcg(folio)->css, new_refs);
}
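
A worked example (not from the patch) of the reference math in folio_split_memcg_refs(): every folio produced by the split needs its own css reference, and the original folio already holds one, hence (1 << (old_order - new_order)) - 1 extra gets. split_extra_refs() below is only an illustrative helper, not a kernel function:

#include <stdio.h>

/* Extra memcg css references taken when splitting one folio of
 * old_order into folios of new_order.
 */
static unsigned int split_extra_refs(unsigned int old_order, unsigned int new_order)
{
	return (1u << (old_order - new_order)) - 1;
}

int main(void)
{
	printf("%u\n", split_extra_refs(9, 0)); /* 511: order-9 folio -> 512 order-0 pages */
	printf("%u\n", split_extra_refs(9, 2)); /* 127: order-9 folio -> 128 order-2 folios */
	return 0;
}
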
unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
@@ -3389,7 +3575,7 @@ void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
refcount_add(n, &memcg->id.ref);
}
-void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
+static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
if (refcount_sub_and_test(n, &memcg->id.ref)) {
mem_cgroup_id_remove(memcg);
@@ -3404,6 +3590,24 @@ static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
mem_cgroup_id_put_many(memcg, 1);
}
+struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
+{
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
+ /*
+ * The root cgroup cannot be destroyed, so its refcount must
+ * always be >= 1.
+ */
+ if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
+ VM_BUG_ON(1);
+ break;
+ }
+ memcg = parent_mem_cgroup(memcg);
+ if (!memcg)
+ memcg = root_mem_cgroup;
+ }
+ return memcg;
+}
+
/**
* mem_cgroup_from_id - look up a memcg from a memcg id
* @id: the memcg id to look up
@@ -3439,11 +3643,22 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
}
#endif
+static void free_mem_cgroup_per_node_info(struct mem_cgroup_per_node *pn)
+{
+ if (!pn)
+ return;
+
+ free_percpu(pn->lruvec_stats_percpu);
+ kfree(pn->lruvec_stats);
+ kfree(pn);
+}
+
static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
- pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
+ pn = kmem_cache_alloc_node(memcg_pn_cachep, GFP_KERNEL | __GFP_ZERO,
+ node);
if (!pn)
return false;
@@ -3463,23 +3678,10 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
memcg->nodeinfo[node] = pn;
return true;
fail:
- kfree(pn->lruvec_stats);
- kfree(pn);
+ free_mem_cgroup_per_node_info(pn);
return false;
}
-static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
-{
- struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
-
- if (!pn)
- return;
-
- free_percpu(pn->lruvec_stats_percpu);
- kfree(pn->lruvec_stats);
- kfree(pn);
-}
-
static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
@@ -3487,7 +3689,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
obj_cgroup_put(memcg->orig_objcg);
for_each_node(node)
- free_mem_cgroup_per_node_info(memcg, node);
+ free_mem_cgroup_per_node_info(memcg->nodeinfo[node]);
memcg1_free_events(memcg);
kfree(memcg->vmstats);
free_percpu(memcg->vmstats_percpu);
@@ -3503,13 +3705,14 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
{
- struct memcg_vmstats_percpu *statc, *pstatc;
+ struct memcg_vmstats_percpu *statc;
+ struct memcg_vmstats_percpu __percpu *pstatc_pcpu;
struct mem_cgroup *memcg;
int node, cpu;
int __maybe_unused i;
long error;
- memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
+ memcg = kmem_cache_zalloc(memcg_cachep, GFP_KERNEL);
if (!memcg)
return ERR_PTR(-ENOMEM);
@@ -3534,9 +3737,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
for_each_possible_cpu(cpu) {
if (parent)
- pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu);
+ pstatc_pcpu = parent->vmstats_percpu;
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
- statc->parent = parent ? pstatc : NULL;
+ statc->parent_pcpu = parent ? pstatc_pcpu : NULL;
statc->vmstats = memcg->vmstats;
}
@@ -3552,7 +3755,10 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
INIT_LIST_HEAD(&memcg->memory_peaks);
INIT_LIST_HEAD(&memcg->swap_peaks);
spin_lock_init(&memcg->peaks_lock);
- memcg->socket_pressure = jiffies;
+ memcg->socket_pressure = get_jiffies_64();
+#if BITS_PER_LONG < 64
+ seqlock_init(&memcg->socket_pressure_seqlock);
+#endif
memcg1_memcg_init(memcg);
memcg->kmemcg_id = -1;
INIT_LIST_HEAD(&memcg->objcg_list);
@@ -3580,6 +3786,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
struct mem_cgroup *memcg, *old_memcg;
+ bool memcg_on_dfl = cgroup_subsys_on_dfl(memory_cgrp_subsys);
old_memcg = set_active_memcg(parent);
memcg = mem_cgroup_alloc(parent);
@@ -3597,9 +3804,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent) {
WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
- page_counter_init(&memcg->memory, &parent->memory, true);
+ page_counter_init(&memcg->memory, &parent->memory, memcg_on_dfl);
page_counter_init(&memcg->swap, &parent->swap, false);
#ifdef CONFIG_MEMCG_V1
+ memcg->memory.track_failcnt = !memcg_on_dfl;
WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
page_counter_init(&memcg->kmem, &parent->kmem, false);
page_counter_init(&memcg->tcpmem, &parent->tcpmem, false);
@@ -3617,7 +3825,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &memcg->css;
}
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
+ if (memcg_on_dfl && !cgroup_memory_nosocket)
static_branch_inc(&memcg_sockets_enabled_key);
if (!cgroup_memory_nobpf)
@@ -3808,6 +4016,53 @@ static void mem_cgroup_stat_aggregate(struct aggregate_control *ac)
}
}
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
+ int cpu)
+{
+ int nid;
+
+ if (atomic_read(&memcg->kmem_stat)) {
+ int kmem = atomic_xchg(&memcg->kmem_stat, 0);
+ int index = memcg_stats_index(MEMCG_KMEM);
+
+ memcg->vmstats->state[index] += kmem;
+ if (parent)
+ parent->vmstats->state_pending[index] += kmem;
+ }
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ struct lruvec_stats *lstats = pn->lruvec_stats;
+ struct lruvec_stats *plstats = NULL;
+
+ if (parent)
+ plstats = parent->nodeinfo[nid]->lruvec_stats;
+
+ if (atomic_read(&pn->slab_reclaimable)) {
+ int slab = atomic_xchg(&pn->slab_reclaimable, 0);
+ int index = memcg_stats_index(NR_SLAB_RECLAIMABLE_B);
+
+ lstats->state[index] += slab;
+ if (plstats)
+ plstats->state_pending[index] += slab;
+ }
+ if (atomic_read(&pn->slab_unreclaimable)) {
+ int slab = atomic_xchg(&pn->slab_unreclaimable, 0);
+ int index = memcg_stats_index(NR_SLAB_UNRECLAIMABLE_B);
+
+ lstats->state[index] += slab;
+ if (plstats)
+ plstats->state_pending[index] += slab;
+ }
+ }
+}
+#else
+static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
+ int cpu)
+{}
+#endif
+
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -3816,6 +4071,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
struct aggregate_control ac;
int nid;
+ flush_nmi_stats(memcg, parent, cpu);
+
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
ac = (struct aggregate_control) {
@@ -3865,8 +4122,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
}
WRITE_ONCE(statc->stats_updates, 0);
/* We are in a per-cpu loop here, only do the atomic write once */
- if (atomic64_read(&memcg->vmstats->stats_updates))
- atomic64_set(&memcg->vmstats->stats_updates, 0);
+ if (atomic_read(&memcg->vmstats->stats_updates))
+ atomic_set(&memcg->vmstats->stats_updates, 0);
}
static void mem_cgroup_fork(struct task_struct *task)
@@ -4019,7 +4276,7 @@ static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
WRITE_ONCE(peer_ctx->value, usage);
/* initial write, register watcher */
- if (ofp->value == -1)
+ if (ofp->value == OFP_PEAK_UNSET)
list_add(&ofp->list, watchers);
WRITE_ONCE(ofp->value, usage);
@@ -4107,6 +4364,9 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
page_counter_set_high(&memcg->memory, high);
+ if (of->file->f_flags & O_NONBLOCK)
+ goto out;
+
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
unsigned long reclaimed;
@@ -4129,7 +4389,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (!reclaimed && !nr_retries--)
break;
}
-
+out:
memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -4156,6 +4416,9 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
xchg(&memcg->memory.max, max);
+ if (of->file->f_flags & O_NONBLOCK)
+ goto out;
+
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
@@ -4181,8 +4444,9 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
memcg_memory_event(memcg, MEMCG_OOM);
if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
break;
+ cond_resched();
}
-
+out:
memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -4303,78 +4567,15 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
return nbytes;
}
-enum {
- MEMORY_RECLAIM_SWAPPINESS = 0,
- MEMORY_RECLAIM_NULL,
-};
-
-static const match_table_t tokens = {
- { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
- { MEMORY_RECLAIM_NULL, NULL },
-};
-
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned int nr_retries = MAX_RECLAIM_RETRIES;
- unsigned long nr_to_reclaim, nr_reclaimed = 0;
- int swappiness = -1;
- unsigned int reclaim_options;
- char *old_buf, *start;
- substring_t args[MAX_OPT_ARGS];
-
- buf = strstrip(buf);
-
- old_buf = buf;
- nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
- if (buf == old_buf)
- return -EINVAL;
-
- buf = strstrip(buf);
-
- while ((start = strsep(&buf, " ")) != NULL) {
- if (!strlen(start))
- continue;
- switch (match_token(start, tokens, args)) {
- case MEMORY_RECLAIM_SWAPPINESS:
- if (match_int(&args[0], &swappiness))
- return -EINVAL;
- if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS)
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
- }
-
- reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
- while (nr_reclaimed < nr_to_reclaim) {
- /* Will converge on zero, but reclaim enforces a minimum */
- unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
- unsigned long reclaimed;
-
- if (signal_pending(current))
- return -EINTR;
-
- /*
- * This is the final attempt, drain percpu lru caches in the
- * hope of introducing more evictable pages for
- * try_to_free_mem_cgroup_pages().
- */
- if (!nr_retries)
- lru_add_drain_all();
-
- reclaimed = try_to_free_mem_cgroup_pages(memcg,
- batch_size, GFP_KERNEL,
- reclaim_options,
- swappiness == -1 ? NULL : &swappiness);
-
- if (!reclaimed && !nr_retries--)
- return -EAGAIN;
+ int ret;
- nr_reclaimed += reclaimed;
- }
+ ret = user_proactive_reclaim(buf, memcg, NULL);
+ if (ret)
+ return ret;
return nbytes;
}
@@ -4503,7 +4704,9 @@ static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
if (ret)
goto out;
- mem_cgroup_commit_charge(folio, memcg);
+ css_get(&memcg->css);
+ commit_charge(folio, memcg);
+ memcg1_commit_charge(folio, memcg);
out:
return ret;
}
@@ -4521,38 +4724,37 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
}
/**
- * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio
- * @memcg: memcg to charge.
- * @gfp: reclaim mode.
- * @nr_pages: number of pages to charge.
- *
- * This function is called when allocating a huge page folio to determine if
- * the memcg has the capacity for it. It does not commit the charge yet,
- * as the hugetlb folio itself has not been obtained from the hugetlb pool.
+ * mem_cgroup_charge_hugetlb - charge the memcg for a hugetlb folio
+ * @folio: folio being charged
+ * @gfp: reclaim mode
*
- * Once we have obtained the hugetlb folio, we can call
- * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the
- * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect
- * of try_charge().
+ * This function is called when allocating a huge page folio, after the page has
+ * already been obtained and charged to the appropriate hugetlb cgroup
+ * controller (if it is enabled).
*
- * Returns 0 on success. Otherwise, an error code is returned.
+ * Returns -ENOMEM if the memcg is already full.
+ * Returns 0 if the charge was successful, or if charging was skipped.
*/
-int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
- long nr_pages)
+int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp)
{
+ struct mem_cgroup *memcg = get_mem_cgroup_from_current();
+ int ret = 0;
+
/*
- * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation,
- * but do not attempt to commit charge later (or cancel on error) either.
+ * Even if memcg does not account for hugetlb, we still want to update
+ * system-level stats via lruvec_stat_mod_folio. Return 0, and skip
+ * charging the memcg.
*/
- if (mem_cgroup_disabled() || !memcg ||
- !cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
- !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
- return -EOPNOTSUPP;
+ if (mem_cgroup_disabled() || !memcg_accounts_hugetlb() ||
+ !memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ goto out;
- if (try_charge(memcg, gfp, nr_pages))
- return -ENOMEM;
+ if (charge_memcg(folio, memcg, gfp))
+ ret = -ENOMEM;
- return 0;
+out:
+ mem_cgroup_put(memcg);
+ return ret;
}
/**
@@ -4590,40 +4792,6 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
return ret;
}
-/*
- * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
- * @entry: the first swap entry for which the pages are charged
- * @nr_pages: number of pages which will be uncharged
- *
- * Call this function after successfully adding the charged page to swapcache.
- *
- * Note: This function assumes the page for which swap slot is being uncharged
- * is order 0 page.
- */
-void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
-{
- /*
- * Cgroup1's unified memory+swap counter has been charged with the
- * new swapcache page, finish the transfer by uncharging the swap
- * slot. The swap slot would also get uncharged when it dies, but
- * it can stick around indefinitely and we'd count the page twice
- * the entire time.
- *
- * Cgroup2 has separate resource counters for memory and swap,
- * so this is a non-issue here. Memory and swap charge lifetimes
- * correspond 1:1 to page and swap slot lifetimes: we charge the
- * page to memory here, and uncharge swap when the slot is freed.
- */
- if (!mem_cgroup_disabled() && do_memsw_account()) {
- /*
- * The swap entry might not get freed for a long time,
- * let's not wait for it. The page already received a
- * memory+swap charge, drop the swap entry duplicate.
- */
- mem_cgroup_uncharge_swap(entry, nr_pages);
- }
-}
-
struct uncharge_gather {
struct mem_cgroup *memcg;
unsigned long nr_memory;
@@ -4640,9 +4808,7 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
static void uncharge_batch(const struct uncharge_gather *ug)
{
if (ug->nr_memory) {
- page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
- if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
+ memcg_uncharge(ug->memcg, ug->nr_memory);
if (ug->nr_kmem) {
mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem);
memcg1_account_kmem(ug->memcg, -ug->nr_kmem);
@@ -4859,6 +5025,19 @@ void mem_cgroup_sk_free(struct sock *sk)
css_put(&sk->sk_memcg->css);
}
+void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk)
+{
+ if (sk->sk_memcg == newsk->sk_memcg)
+ return;
+
+ mem_cgroup_sk_free(newsk);
+
+ if (sk->sk_memcg)
+ css_get(&sk->sk_memcg->css);
+
+ newsk->sk_memcg = sk->sk_memcg;
+}
+
/**
* mem_cgroup_charge_skmem - charge socket memory
* @memcg: memcg to charge
@@ -4874,7 +5053,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return memcg1_charge_skmem(memcg, nr_pages, gfp_mask);
- if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
+ if (try_charge_memcg(memcg, gfp_mask, nr_pages) == 0) {
mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
return true;
}
@@ -4918,15 +5097,16 @@ static int __init cgroup_memory(char *s)
__setup("cgroup.memory=", cgroup_memory);
/*
- * subsys_initcall() for memory controller.
+ * Memory controller init before cgroup_init() initializes root_mem_cgroup.
*
* Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
* context because of lock dependencies (cgroup_lock -> cpu hotplug) but
* basically everything that doesn't depend on a specific mem_cgroup structure
* should be initialized from here.
*/
-static int __init mem_cgroup_init(void)
+int __init mem_cgroup_init(void)
{
+ unsigned int memcg_size;
int cpu;
/*
@@ -4940,92 +5120,24 @@ static int __init mem_cgroup_init(void)
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
- drain_local_stock);
-
- return 0;
-}
-subsys_initcall(mem_cgroup_init);
-
-#ifdef CONFIG_SWAP
-static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
-{
- while (!refcount_inc_not_zero(&memcg->id.ref)) {
- /*
- * The root cgroup cannot be destroyed, so it's refcount must
- * always be >= 1.
- */
- if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
- VM_BUG_ON(1);
- break;
- }
- memcg = parent_mem_cgroup(memcg);
- if (!memcg)
- memcg = root_mem_cgroup;
+ drain_local_memcg_stock);
+ INIT_WORK(&per_cpu_ptr(&obj_stock, cpu)->work,
+ drain_local_obj_stock);
}
- return memcg;
-}
-/**
- * mem_cgroup_swapout - transfer a memsw charge to swap
- * @folio: folio whose memsw charge to transfer
- * @entry: swap entry to move the charge to
- *
- * Transfer the memsw charge of @folio to @entry.
- */
-void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
-{
- struct mem_cgroup *memcg, *swap_memcg;
- unsigned int nr_entries;
- unsigned short oldid;
+ memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids);
+ memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0,
+ SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
- VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
-
- if (mem_cgroup_disabled())
- return;
-
- if (!do_memsw_account())
- return;
-
- memcg = folio_memcg(folio);
-
- VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
- if (!memcg)
- return;
-
- /*
- * In case the memcg owning these pages has been offlined and doesn't
- * have an ID allocated to it anymore, charge the closest online
- * ancestor for the swap instead and transfer the memory+swap charge.
- */
- swap_memcg = mem_cgroup_id_get_online(memcg);
- nr_entries = folio_nr_pages(folio);
- /* Get references for the tail pages, too */
- if (nr_entries > 1)
- mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
- oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
- nr_entries);
- VM_BUG_ON_FOLIO(oldid, folio);
- mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
-
- folio_unqueue_deferred_split(folio);
- folio->memcg_data = 0;
-
- if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memory, nr_entries);
-
- if (memcg != swap_memcg) {
- if (!mem_cgroup_is_root(swap_memcg))
- page_counter_charge(&swap_memcg->memsw, nr_entries);
- page_counter_uncharge(&memcg->memsw, nr_entries);
- }
+ memcg_pn_cachep = KMEM_CACHE(mem_cgroup_per_node,
+ SLAB_PANIC | SLAB_HWCACHE_ALIGN);
- memcg1_swapout(folio, memcg);
- css_put(&memcg->css);
+ return 0;
}
+#ifdef CONFIG_SWAP
/**
* __mem_cgroup_try_charge_swap - try charging swap space for a folio
* @folio: folio being added to swap
@@ -5040,7 +5152,6 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
unsigned int nr_pages = folio_nr_pages(folio);
struct page_counter *counter;
struct mem_cgroup *memcg;
- unsigned short oldid;
if (do_memsw_account())
return 0;
@@ -5069,10 +5180,10 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
/* Get references for the tail pages, too */
if (nr_pages > 1)
mem_cgroup_id_get_many(memcg, nr_pages - 1);
- oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
- VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
+ swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
+
return 0;
}
@@ -5086,7 +5197,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
struct mem_cgroup *memcg;
unsigned short id;
- id = swap_cgroup_record(entry, 0, nr_pages);
+ id = swap_cgroup_clear(entry, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
@@ -5478,3 +5589,8 @@ static int __init mem_cgroup_swap_init(void)
subsys_initcall(mem_cgroup_swap_init);
#endif /* CONFIG_SWAP */
+
+bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+{
+ return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
+}
diff --git a/mm/memfd.c b/mm/memfd.c
index 35a370d75c9a..de3115b0dd09 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -20,6 +20,7 @@
#include <linux/memfd.h>
#include <linux/pid_namespace.h>
#include <uapi/linux/memfd.h>
+#include "swap.h"
/*
* We need a tag: a new tag would expand every xa_node by 8 bytes,
@@ -31,8 +32,7 @@
static bool memfd_folio_has_extra_refs(struct folio *folio)
{
- return folio_ref_count(folio) - folio_mapcount(folio) !=
- folio_nr_pages(folio);
+ return folio_ref_count(folio) != folio_expected_ref_count(folio);
}
static void memfd_tag_pins(struct xa_state *xas)
@@ -70,7 +70,6 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
#ifdef CONFIG_HUGETLB_PAGE
struct folio *folio;
gfp_t gfp_mask;
- int err;
if (is_file_hugepages(memfd)) {
/*
@@ -79,28 +78,67 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
* alloc from. Also, the folio will be pinned for an indefinite
* amount of time, so it is not expected to be migrated away.
*/
+ struct inode *inode = file_inode(memfd);
struct hstate *h = hstate_file(memfd);
+ int err = -ENOMEM;
+ long nr_resv;
gfp_mask = htlb_alloc_mask(h);
gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE);
idx >>= huge_page_order(h);
+ nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, 0);
+ if (nr_resv < 0)
+ return ERR_PTR(nr_resv);
+
folio = alloc_hugetlb_folio_reserve(h,
numa_node_id(),
NULL,
gfp_mask);
if (folio) {
+ u32 hash;
+
+ /*
+ * Zero the folio to prevent information leaks to userspace.
+ * Use folio_zero_user() which is optimized for huge/gigantic
+ * pages. Pass 0 as addr_hint since this is not a faulting path
+ * and we don't have a user virtual address yet.
+ */
+ folio_zero_user(folio, 0);
+
+ /*
+ * Mark the folio uptodate before adding to page cache,
+ * as required by filemap.c and other hugetlb paths.
+ */
+ __folio_mark_uptodate(folio);
+
+ /*
+ * Serialize hugepage allocation and instantiation to prevent
+ * races with concurrent allocations, as required by all other
+ * callers of hugetlb_add_to_page_cache().
+ */
+ hash = hugetlb_fault_mutex_hash(memfd->f_mapping, idx);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
err = hugetlb_add_to_page_cache(folio,
memfd->f_mapping,
idx);
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
if (err) {
folio_put(folio);
- return ERR_PTR(err);
+ goto err_unresv;
}
+
+ hugetlb_set_folio_subpool(folio, subpool_inode(inode));
folio_unlock(folio);
return folio;
}
- return ERR_PTR(-ENOMEM);
+err_unresv:
+ if (nr_resv > 0)
+ hugetlb_unreserve_pages(inode, idx, idx + 1, 0);
+ return ERR_PTR(err);
}
#endif
return shmem_read_folio(memfd->f_mapping, idx);
@@ -170,7 +208,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
return error;
}
-unsigned int *memfd_file_seals_ptr(struct file *file)
+static unsigned int *memfd_file_seals_ptr(struct file *file)
{
if (shmem_file(file))
return &SHMEM_I(file_inode(file))->seals;
@@ -259,7 +297,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals)
}
/*
- * SEAL_EXEC implys SEAL_WRITE, making W^X from the start.
+ * SEAL_EXEC implies SEAL_WRITE, making W^X from the start.
*/
if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;
@@ -327,15 +365,51 @@ static int check_sysctl_memfd_noexec(unsigned int *flags)
return 0;
}
-SYSCALL_DEFINE2(memfd_create,
- const char __user *, uname,
- unsigned int, flags)
+static inline bool is_write_sealed(unsigned int seals)
{
- unsigned int *file_seals;
- struct file *file;
- int fd, error;
- char *name;
- long len;
+ return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
+}
+
+static int check_write_seal(vm_flags_t *vm_flags_ptr)
+{
+ vm_flags_t vm_flags = *vm_flags_ptr;
+ vm_flags_t mask = vm_flags & (VM_SHARED | VM_WRITE);
+
+ /* If a private mapping then writability is irrelevant. */
+ if (!(mask & VM_SHARED))
+ return 0;
+
+ /*
+ * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+ * write seals are active.
+ */
+ if (mask & VM_WRITE)
+ return -EPERM;
+
+ /*
+ * This is a read-only mapping, disallow mprotect() from making a
+ * write-sealed mapping writable in future.
+ */
+ *vm_flags_ptr &= ~VM_MAYWRITE;
+
+ return 0;
+}
+
+int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr)
+{
+ int err = 0;
+ unsigned int *seals_ptr = memfd_file_seals_ptr(file);
+ unsigned int seals = seals_ptr ? *seals_ptr : 0;
+
+ if (is_write_sealed(seals))
+ err = check_write_seal(vm_flags_ptr);
+
+ return err;
+}
+
+static int sanitize_flags(unsigned int *flags_ptr)
+{
+ unsigned int flags = *flags_ptr;
if (!(flags & MFD_HUGETLB)) {
if (flags & ~(unsigned int)MFD_ALL_FLAGS)
@@ -351,50 +425,52 @@ SYSCALL_DEFINE2(memfd_create,
if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
return -EINVAL;
- error = check_sysctl_memfd_noexec(&flags);
- if (error < 0)
- return error;
+ return check_sysctl_memfd_noexec(flags_ptr);
+}
- /* length includes terminating zero */
- len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
- if (len <= 0)
- return -EFAULT;
- if (len > MFD_NAME_MAX_LEN + 1)
- return -EINVAL;
+static char *alloc_name(const char __user *uname)
+{
+ int error;
+ char *name;
+ long len;
- name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
+ name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
if (!name)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- strcpy(name, MFD_NAME_PREFIX);
- if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
+ memcpy(name, MFD_NAME_PREFIX, MFD_NAME_PREFIX_LEN);
+ /* returned length does not include terminating zero */
+ len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1);
+ if (len < 0) {
error = -EFAULT;
goto err_name;
- }
-
- /* terminating-zero may have changed after strnlen_user() returned */
- if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
- error = -EFAULT;
+ } else if (len > MFD_NAME_MAX_LEN) {
+ error = -EINVAL;
goto err_name;
}
- fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
- if (fd < 0) {
- error = fd;
- goto err_name;
- }
+ return name;
+
+err_name:
+ kfree(name);
+ return ERR_PTR(error);
+}
+
+static struct file *alloc_file(const char *name, unsigned int flags)
+{
+ unsigned int *file_seals;
+ struct file *file;
if (flags & MFD_HUGETLB) {
file = hugetlb_file_setup(name, 0, VM_NORESERVE,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
- } else
+ } else {
file = shmem_file_setup(name, 0, VM_NORESERVE);
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto err_fd;
}
+ if (IS_ERR(file))
+ return file;
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_LARGEFILE;
@@ -414,13 +490,44 @@ SYSCALL_DEFINE2(memfd_create,
*file_seals &= ~F_SEAL_SEAL;
}
+ return file;
+}
+
+SYSCALL_DEFINE2(memfd_create,
+ const char __user *, uname,
+ unsigned int, flags)
+{
+ struct file *file;
+ int fd, error;
+ char *name;
+
+ error = sanitize_flags(&flags);
+ if (error < 0)
+ return error;
+
+ name = alloc_name(uname);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
+ if (fd < 0) {
+ error = fd;
+ goto err_free_name;
+ }
+
+ file = alloc_file(name, flags);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto err_free_fd;
+ }
+
fd_install(fd, file);
kfree(name);
return fd;
-err_fd:
+err_free_fd:
put_unused_fd(fd);
-err_name:
+err_free_name:
kfree(name);
return error;
}
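
As a usage note, here is a small userspace demo (not part of the patch) of the behaviour that is_write_sealed()/check_write_seal() above enforce: once F_SEAL_WRITE is set, a new MAP_SHARED writable mapping is refused with EPERM, while a read-only shared mapping still succeeds. The name "seal-demo" is arbitrary; the APIs used (memfd_create(2), F_ADD_SEALS) are the standard ones:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = memfd_create("seal-demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);

	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}
	ftruncate(fd, 4096);
	fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE);

	void *rw = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	printf("shared rw : %s\n", rw == MAP_FAILED ? strerror(errno) : "mapped");

	void *ro = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	printf("shared ro : %s\n", ro == MAP_FAILED ? strerror(errno) : "mapped");
	return 0;
}
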
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 38bdffb45648..df6ee59527dd 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -124,7 +124,7 @@ const struct attribute_group memory_failure_attr_group = {
.attrs = memory_failure_attr,
};
-static struct ctl_table memory_failure_table[] = {
+static const struct ctl_table memory_failure_table[] = {
{
.procname = "memory_failure_early_kill",
.data = &sysctl_memory_failure_early_kill,
@@ -419,18 +419,18 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
return 0;
- if (pud_devmap(*pud))
+ if (pud_trans_huge(*pud))
return PUD_SHIFT;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return 0;
- if (pmd_devmap(*pmd))
+ if (pmd_trans_huge(*pmd))
return PMD_SHIFT;
pte = pte_offset_map(pmd, address);
if (!pte)
return 0;
ptent = ptep_get(pte);
- if (pte_present(ptent) && pte_devmap(ptent))
+ if (pte_present(ptent))
ret = PAGE_SHIFT;
pte_unmap(pte);
return ret;
@@ -837,19 +837,33 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
struct mm_walk *walk)
{
struct hwpoison_walk *hwp = walk->private;
- pte_t pte = huge_ptep_get(walk->mm, addr, ptep);
struct hstate *h = hstate_vma(walk->vma);
+ spinlock_t *ptl;
+ pte_t pte;
+ int ret;
- return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
- hwp->pfn, &hwp->tk);
+ ptl = huge_pte_lock(h, walk->mm, ptep);
+ pte = huge_ptep_get(walk->mm, addr, ptep);
+ ret = check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
+ hwp->pfn, &hwp->tk);
+ spin_unlock(ptl);
+ return ret;
}
#else
#define hwpoison_hugetlb_range NULL
#endif
+static int hwpoison_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ /* We also want to consider pages mapped into VM_PFNMAP. */
+ return 0;
+}
+
static const struct mm_walk_ops hwpoison_walk_ops = {
.pmd_entry = hwpoison_pte_range,
.hugetlb_entry = hwpoison_hugetlb_range,
+ .test_walk = hwpoison_test_walk,
.walk_lock = PGWALK_RDLOCK,
};
@@ -881,12 +895,17 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
mmap_read_lock(p->mm);
ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops,
(void *)&priv);
+ /*
+ * ret = 1 when CMCI wins: kill the process with SIGBUS, regardless
+ * of whether try_to_unmap() succeeded or failed.
+ * ret = 0 when the poisoned page is a clean page and was dropped;
+ * no SIGBUS is needed.
+ */
if (ret == 1 && priv.tk.addr)
kill_proc(&priv.tk, pfn, flags);
- else
- ret = 0;
mmap_read_unlock(p->mm);
- return ret > 0 ? -EHWPOISON : -EFAULT;
+
+ return ret > 0 ? -EHWPOISON : 0;
}
/*
@@ -937,7 +956,7 @@ static const char * const action_page_types[] = {
[MF_MSG_BUDDY] = "free buddy page",
[MF_MSG_DAX] = "dax page",
[MF_MSG_UNSPLIT_THP] = "unsplit thp",
- [MF_MSG_ALREADY_POISONED] = "already poisoned",
+ [MF_MSG_ALREADY_POISONED] = "already poisoned page",
[MF_MSG_UNKNOWN] = "unknown page",
};
@@ -1330,9 +1349,10 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
{
trace_memory_failure_event(pfn, type, result);
- num_poisoned_pages_inc(pfn);
-
- update_per_node_mf_stats(pfn, result);
+ if (type != MF_MSG_ALREADY_POISONED) {
+ num_poisoned_pages_inc(pfn);
+ update_per_node_mf_stats(pfn, result);
+ }
pr_err("%#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
@@ -1383,8 +1403,8 @@ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
if (PageSlab(page))
return false;
- /* Soft offline could migrate non-LRU movable pages */
- if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
+ /* Soft offline could migrate movable_ops pages */
+ if ((flags & MF_SOFT_OFFLINE) && page_has_movable_ops(page))
return true;
return PageLRU(page) || is_free_buddy_page(page);
@@ -1556,6 +1576,10 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
return ret;
}
+/*
+ * The caller must guarantee the folio isn't a large folio, except for
+ * hugetlb; try_to_unmap() can't handle large folios.
+ */
int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
@@ -2071,12 +2095,11 @@ retry:
*hugetlb = 0;
return 0;
} else if (res == -EHWPOISON) {
- pr_err("%#lx: already hardware poisoned\n", pfn);
if (flags & MF_ACTION_REQUIRED) {
folio = page_folio(p);
res = kill_accessing_process(current, folio_pfn(folio), flags);
- action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
}
+ action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
return res;
} else if (res == -EBUSY) {
if (!(flags & MF_NO_RETRY)) {
@@ -2210,9 +2233,13 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
* Must run in process context (e.g. a work queue) with interrupts
* enabled and no spinlocks held.
*
- * Return: 0 for successfully handled the memory error,
- * -EOPNOTSUPP for hwpoison_filter() filtered the error event,
- * < 0(except -EOPNOTSUPP) on failure.
+ * Return:
+ * 0 - success,
+ * -ENXIO - memory not managed by the kernel,
+ * -EOPNOTSUPP - hwpoison_filter() filtered the error event,
+ * -EHWPOISON - the page was already poisoned; the process may
+ * have been killed,
+ * other negative values - failure.
*/
int memory_failure(unsigned long pfn, int flags)
{
@@ -2258,7 +2285,6 @@ try_again:
goto unlock_mutex;
if (TestSetPageHWPoison(p)) {
- pr_err("%#lx: already hardware poisoned\n", pfn);
res = -EHWPOISON;
if (flags & MF_ACTION_REQUIRED)
res = kill_accessing_process(current, pfn, flags);
@@ -2494,19 +2520,6 @@ static void memory_failure_work_func(struct work_struct *work)
}
}
-/*
- * Process memory_failure work queued on the specified CPU.
- * Used to avoid return-to-userspace racing with the memory_failure workqueue.
- */
-void memory_failure_queue_kick(int cpu)
-{
- struct memory_failure_cpu *mf_cpu;
-
- mf_cpu = &per_cpu(memory_failure_cpu, cpu);
- cancel_work_sync(&mf_cpu->work);
- memory_failure_work_func(&mf_cpu->work);
-}
-
static int __init memory_failure_init(void)
{
struct memory_failure_cpu *mf_cpu;
@@ -2555,10 +2568,9 @@ int unpoison_memory(unsigned long pfn)
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- if (!pfn_valid(pfn))
- return -ENXIO;
-
- p = pfn_to_page(pfn);
+ p = pfn_to_online_page(pfn);
+ if (!p)
+ return -EIO;
folio = page_folio(p);
mutex_lock(&mf_mutex);
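
For context, the usual way to exercise memory_failure() and the return codes documented above from userspace is madvise(MADV_HWPOISON). A hedged sketch follows; it requires CAP_SYS_ADMIN and CONFIG_MEMORY_FAILURE, and it really does poison the page, so it is only suitable for a test machine:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 1;	/* fault the page in so there is something to poison */

	/* Privileged and destructive: injects a hard poison on this page. */
	if (madvise(p, pagesz, MADV_HWPOISON))
		perror("madvise(MADV_HWPOISON)");
	else
		puts("page poisoned; a later access would raise SIGBUS");
	return 0;
}
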
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index fc14fe53e9b7..0382b6942b8b 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -872,25 +872,18 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
unsigned long action, void *_arg)
{
struct memory_tier *memtier;
- struct memory_notify *arg = _arg;
-
- /*
- * Only update the node migration order when a node is
- * changing status, like online->offline.
- */
- if (arg->status_change_nid < 0)
- return notifier_from_errno(0);
+ struct node_notify *nn = _arg;
switch (action) {
- case MEM_OFFLINE:
+ case NODE_REMOVED_LAST_MEMORY:
mutex_lock(&memory_tier_lock);
- if (clear_node_memory_tier(arg->status_change_nid))
+ if (clear_node_memory_tier(nn->nid))
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
break;
- case MEM_ONLINE:
+ case NODE_ADDED_FIRST_MEMORY:
mutex_lock(&memory_tier_lock);
- memtier = set_node_memory_tier(arg->status_change_nid);
+ memtier = set_node_memory_tier(nn->nid);
if (!IS_ERR(memtier))
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
@@ -929,7 +922,7 @@ static int __init memory_tier_init(void)
nodes_and(default_dram_nodes, node_states[N_MEMORY],
node_states[N_CPU]);
- hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
+ hotplug_node_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
return 0;
}
subsys_initcall(memory_tier_init);
diff --git a/mm/memory.c b/mm/memory.c
index b6015e230822..75f3b66be1d3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,7 +57,6 @@
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
-#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
@@ -66,6 +65,7 @@
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
+#include <linux/shmem_fs.h>
#include <linux/memory-tiers.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
@@ -94,14 +94,6 @@
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif
-#ifndef CONFIG_NUMA
-unsigned long max_mapnr;
-EXPORT_SYMBOL(max_mapnr);
-
-struct page *mem_map;
-EXPORT_SYMBOL(mem_map);
-#endif
-
static vm_fault_t do_fault(struct vm_fault *vmf);
static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
static bool vmf_pte_changed(struct vm_fault *vmf);
@@ -121,14 +113,6 @@ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
}
/*
- * A number of key systems in x86 including ioremap() rely on the assumption
- * that high_memory defines the upper bound on direct map memory, then end
- * of ZONE_NORMAL.
- */
-void *high_memory;
-EXPORT_SYMBOL(high_memory);
-
-/*
* Randomize the address space (stacks, mmaps, brk, etc.).
*
* ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
@@ -141,6 +125,24 @@ int randomize_va_space __read_mostly =
2;
#endif
+static const struct ctl_table mmu_sysctl_table[] = {
+ {
+ .procname = "randomize_va_space",
+ .data = &randomize_va_space,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
+
+static int __init init_mm_sysctl(void)
+{
+ register_sysctl_init("kernel", mmu_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_mm_sysctl);
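
A trivial userspace check (not part of the patch) that the table registered above keeps the long-standing /proc/sys/kernel/randomize_va_space knob reachable:

#include <stdio.h>

int main(void)
{
	char buf[8] = "";
	FILE *f = fopen("/proc/sys/kernel/randomize_va_space", "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("randomize_va_space = %s", buf);
	if (f)
		fclose(f);
	return 0;
}
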
+
#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
@@ -294,8 +296,17 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
p4d_free_tlb(tlb, p4d, start);
}
-/*
- * This function frees user-level page tables of a process.
+/**
+ * free_pgd_range - Unmap and free page tables in the range
+ * @tlb: the mmu_gather containing pending TLB flush info
+ * @addr: virtual address start
+ * @end: virtual address end
+ * @floor: lowest address boundary
+ * @ceiling: highest address boundary
+ *
+ * This function tears down all user-level page tables in the
+ * specified virtual address range [@addr..@end). It is part of
+ * the memory unmap flow.
*/
void free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
@@ -365,6 +376,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
{
struct unlink_vma_file_batch vb;
+ tlb_free_vmas(tlb);
+
do {
unsigned long addr = vma->vm_start;
struct vm_area_struct *next;
@@ -385,32 +398,26 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
vma_start_write(vma);
unlink_anon_vmas(vma);
- if (is_vm_hugetlb_page(vma)) {
- unlink_file_vma(vma);
- hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
- floor, next ? next->vm_start : ceiling);
- } else {
- unlink_file_vma_batch_init(&vb);
- unlink_file_vma_batch_add(&vb, vma);
+ unlink_file_vma_batch_init(&vb);
+ unlink_file_vma_batch_add(&vb, vma);
- /*
- * Optimization: gather nearby vmas into one call down
- */
- while (next && next->vm_start <= vma->vm_end + PMD_SIZE
- && !is_vm_hugetlb_page(next)) {
- vma = next;
- next = mas_find(mas, ceiling - 1);
- if (unlikely(xa_is_zero(next)))
- next = NULL;
- if (mm_wr_locked)
- vma_start_write(vma);
- unlink_anon_vmas(vma);
- unlink_file_vma_batch_add(&vb, vma);
- }
- unlink_file_vma_batch_final(&vb);
- free_pgd_range(tlb, addr, vma->vm_end,
- floor, next ? next->vm_start : ceiling);
+ /*
+ * Optimization: gather nearby vmas into one call down
+ */
+ while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
+ vma = next;
+ next = mas_find(mas, ceiling - 1);
+ if (unlikely(xa_is_zero(next)))
+ next = NULL;
+ if (mm_wr_locked)
+ vma_start_write(vma);
+ unlink_anon_vmas(vma);
+ unlink_file_vma_batch_add(&vb, vma);
}
+ unlink_file_vma_batch_final(&vb);
+
+ free_pgd_range(tlb, addr, vma->vm_end,
+ floor, next ? next->vm_start : ceiling);
vma = next;
} while (vma);
}
@@ -534,10 +541,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
dump_page(page, "bad pte");
pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
- pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
+ pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n",
vma->vm_file,
vma->vm_ops ? vma->vm_ops->fault : NULL,
vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
+ vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL,
mapping ? mapping->a_ops->read_folio : NULL);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
@@ -602,16 +610,6 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return NULL;
if (is_zero_pfn(pfn))
return NULL;
- if (pte_devmap(pte))
- /*
- * NOTE: New users of ZONE_DEVICE will not set pte_devmap()
- * and will have refcounts incremented on their struct pages
- * when they are inserted into PTEs, thus they are safe to
- * return here. Legacy ZONE_DEVICE pages that set pte_devmap()
- * do not have refcounts. Example of legacy ZONE_DEVICE is
- * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
- */
- return NULL;
print_bad_pte(vma, addr, pte, NULL);
return NULL;
@@ -689,9 +687,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
}
}
- if (pmd_devmap(pmd))
- return NULL;
- if (is_huge_zero_pmd(pmd))
+ if (is_huge_zero_pfn(pfn))
return NULL;
if (unlikely(pfn > highest_memmap_pfn))
return NULL;
@@ -715,42 +711,53 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
}
#endif
+/**
+ * restore_exclusive_pte - Restore a device-exclusive entry
+ * @vma: VMA covering @address
+ * @folio: the mapped folio
+ * @page: the mapped folio page
+ * @address: the virtual address
+ * @ptep: pte pointer into the locked page table mapping the folio page
+ * @orig_pte: pte value at @ptep
+ *
+ * Restore a device-exclusive non-swap entry to an ordinary present pte.
+ *
+ * The folio and the page table must be locked, and MMU notifiers must have
+ * been called to invalidate any (exclusive) device mappings.
+ *
+ * Locking the folio makes sure that anybody who just converted the pte to
+ * a device-exclusive entry can map it into the device to make forward
+ * progress without others converting it back until the folio was unlocked.
+ *
+ * If the folio lock ever becomes an issue, we can stop relying on the folio
+ * lock; it might make some scenarios with heavy thrashing less likely to
+ * make forward progress, but these scenarios might not be valid use cases.
+ *
+ * Note that the folio lock does not protect against all cases of concurrent
+ * page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers
+ * must use MMU notifiers to sync against any concurrent changes.
+ */
static void restore_exclusive_pte(struct vm_area_struct *vma,
- struct page *page, unsigned long address,
- pte_t *ptep)
+ struct folio *folio, struct page *page, unsigned long address,
+ pte_t *ptep, pte_t orig_pte)
{
- struct folio *folio = page_folio(page);
- pte_t orig_pte;
pte_t pte;
- swp_entry_t entry;
- orig_pte = ptep_get(ptep);
+ VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+
pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
if (pte_swp_soft_dirty(orig_pte))
pte = pte_mksoft_dirty(pte);
- entry = pte_to_swp_entry(orig_pte);
if (pte_swp_uffd_wp(orig_pte))
pte = pte_mkuffd_wp(pte);
- else if (is_writable_device_exclusive_entry(entry))
- pte = maybe_mkwrite(pte_mkdirty(pte), vma);
-
- VM_BUG_ON_FOLIO(pte_write(pte) && (!folio_test_anon(folio) &&
- PageAnonExclusive(page)), folio);
-
- /*
- * No need to take a page reference as one was already
- * created when the swap entry was made.
- */
- if (folio_test_anon(folio))
- folio_add_anon_rmap_pte(folio, page, vma, address, RMAP_NONE);
- else
- /*
- * Currently device exclusive access only supports anonymous
- * memory so the entry shouldn't point to a filebacked page.
- */
- WARN_ON_ONCE(1);
+ if ((vma->vm_flags & VM_WRITE) &&
+ can_change_pte_writable(vma, address, pte)) {
+ if (folio_test_dirty(folio))
+ pte = pte_mkdirty(pte);
+ pte = pte_mkwrite(pte, vma);
+ }
set_pte_at(vma->vm_mm, address, ptep, pte);
/*
@@ -764,16 +771,15 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
* Tries to restore an exclusive pte if the page lock can be acquired without
* sleeping.
*/
-static int
-try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
- unsigned long addr)
+static int try_restore_exclusive_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, pte_t orig_pte)
{
- swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte));
- struct page *page = pfn_swap_entry_to_page(entry);
+ struct page *page = pfn_swap_entry_to_page(pte_to_swp_entry(orig_pte));
+ struct folio *folio = page_folio(page);
- if (trylock_page(page)) {
- restore_exclusive_pte(vma, page, addr, src_pte);
- unlock_page(page);
+ if (folio_trylock(folio)) {
+ restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte);
+ folio_unlock(folio);
return 0;
}
@@ -791,7 +797,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
- unsigned long vm_flags = dst_vma->vm_flags;
+ vm_flags_t vm_flags = dst_vma->vm_flags;
pte_t orig_pte = ptep_get(src_pte);
pte_t pte = orig_pte;
struct folio *folio;
@@ -853,7 +859,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
folio_get(folio);
rss[mm_counter(folio)]++;
/* Cannot fail as these pages cannot get pinned. */
- folio_try_dup_anon_rmap_pte(folio, page, src_vma);
+ folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma);
/*
* We do not preserve soft-dirty information, because so
@@ -879,7 +885,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* (ie. COW) mappings.
*/
VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
- if (try_restore_exclusive_pte(src_pte, src_vma, addr))
+ if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte))
return -EBUSY;
return -ENOENT;
} else if (is_pte_marker_entry(entry)) {
@@ -935,7 +941,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
rss[MM_ANONPAGES]++;
/* All done, just insert the new page copy in the child */
- pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot);
+ pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot);
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
/* Uffd-wp needs to be delivered to dest pte as well */
@@ -979,10 +985,9 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
int max_nr, int *rss, struct folio **prealloc)
{
+ fpb_t flags = FPB_MERGE_WRITE;
struct page *page;
struct folio *folio;
- bool any_writable;
- fpb_t flags = 0;
int err, nr;
page = vm_normal_page(src_vma, addr, pte);
@@ -997,28 +1002,25 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* by keeping the batching logic separate.
*/
if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
- if (src_vma->vm_flags & VM_SHARED)
- flags |= FPB_IGNORE_DIRTY;
- if (!vma_soft_dirty_enabled(src_vma))
- flags |= FPB_IGNORE_SOFT_DIRTY;
+ if (!(src_vma->vm_flags & VM_SHARED))
+ flags |= FPB_RESPECT_DIRTY;
+ if (vma_soft_dirty_enabled(src_vma))
+ flags |= FPB_RESPECT_SOFT_DIRTY;
- nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
- &any_writable, NULL, NULL);
+ nr = folio_pte_batch_flags(folio, src_vma, src_pte, &pte, max_nr, flags);
folio_ref_add(folio, nr);
if (folio_test_anon(folio)) {
if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
- nr, src_vma))) {
+ nr, dst_vma, src_vma))) {
folio_ref_sub(folio, nr);
return -EAGAIN;
}
rss[MM_ANONPAGES] += nr;
VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
} else {
- folio_dup_file_rmap_ptes(folio, page, nr);
+ folio_dup_file_rmap_ptes(folio, page, nr, dst_vma);
rss[mm_counter_file(folio)] += nr;
}
- if (any_writable)
- pte = pte_mkwrite(pte, src_vma);
__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
addr, nr);
return nr;
@@ -1032,7 +1034,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* guarantee the pinned page won't be randomly replaced in the
* future.
*/
- if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
+ if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))) {
/* Page may be pinned, we have to copy. */
folio_put(folio);
err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
@@ -1042,7 +1044,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
rss[MM_ANONPAGES]++;
VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
} else {
- folio_dup_file_rmap_pte(folio, page);
+ folio_dup_file_rmap_pte(folio, page, dst_vma);
rss[mm_counter_file(folio)]++;
}
@@ -1244,8 +1246,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
- || pmd_devmap(*src_pmd)) {
+ if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -1281,7 +1282,7 @@ copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
src_pud = pud_offset(src_p4d, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
+ if (pud_trans_huge(*src_pud)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
@@ -1362,12 +1363,12 @@ int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
pgd_t *src_pgd, *dst_pgd;
- unsigned long next;
unsigned long addr = src_vma->vm_start;
unsigned long end = src_vma->vm_end;
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
+ unsigned long next;
bool is_cow;
int ret;
@@ -1377,16 +1378,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
if (is_vm_hugetlb_page(src_vma))
return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
- if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
- /*
- * We do not free on error cases below as remove_vma
- * gets called on error from higher level routine
- */
- ret = track_pfn_copy(src_vma);
- if (ret)
- return ret;
- }
-
/*
* We need to invalidate the secondary MMU mappings only when
* there could be a permission downgrade on the ptes of the
@@ -1419,7 +1410,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
continue;
if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
addr, next))) {
- untrack_pfn_clear(dst_vma);
ret = -ENOMEM;
break;
}
@@ -1436,7 +1426,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
static inline bool should_zap_cows(struct zap_details *details)
{
/* By default, zap all pages */
- if (!details)
+ if (!details || details->reclaim_pt)
return true;
/* Or, we zap COWed pages only if the caller wants to */
@@ -1466,34 +1456,42 @@ static inline bool zap_drop_markers(struct zap_details *details)
/*
* This function makes sure that we'll replace the none pte with an uffd-wp
* swap special pte marker when necessary. Must be with the pgtable lock held.
+ *
+ * Returns true if any uffd-wp pte marker was installed, false otherwise.
*/
-static inline void
+static inline bool
zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte, int nr,
struct zap_details *details, pte_t pteval)
{
+ bool was_installed = false;
+
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
/* Zap on anonymous always means dropping everything */
if (vma_is_anonymous(vma))
- return;
+ return false;
if (zap_drop_markers(details))
- return;
+ return false;
for (;;) {
/* the PFN in the PTE is irrelevant. */
- pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+ if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval))
+ was_installed = true;
if (--nr == 0)
break;
pte++;
addr += PAGE_SIZE;
}
+#endif
+ return was_installed;
}
static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
struct vm_area_struct *vma, struct folio *folio,
struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
unsigned long addr, struct zap_details *details, int *rss,
- bool *force_flush, bool *force_break)
+ bool *force_flush, bool *force_break, bool *any_skipped)
{
struct mm_struct *mm = tlb->mm;
bool delay_rmap = false;
@@ -1519,8 +1517,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
arch_check_zapped_pte(vma, ptent);
tlb_remove_tlb_entries(tlb, pte, nr, addr);
if (unlikely(userfaultfd_pte_wp(vma, ptent)))
- zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details,
- ptent);
+ *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte,
+ nr, details, ptent);
if (!delay_rmap) {
folio_remove_rmap_ptes(folio, page, nr, vma);
@@ -1544,9 +1542,8 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
unsigned int max_nr, unsigned long addr,
struct zap_details *details, int *rss, bool *force_flush,
- bool *force_break)
+ bool *force_break, bool *any_skipped)
{
- const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
struct mm_struct *mm = tlb->mm;
struct folio *folio;
struct page *page;
@@ -1559,34 +1556,140 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
arch_check_zapped_pte(vma, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
if (userfaultfd_pte_wp(vma, ptent))
- zap_install_uffd_wp_if_needed(vma, addr, pte, 1,
- details, ptent);
+ *any_skipped = zap_install_uffd_wp_if_needed(vma, addr,
+ pte, 1, details, ptent);
ksm_might_unmap_zero_page(mm, ptent);
return 1;
}
folio = page_folio(page);
- if (unlikely(!should_zap_folio(details, folio)))
+ if (unlikely(!should_zap_folio(details, folio))) {
+ *any_skipped = true;
return 1;
+ }
/*
* Make sure that the common "small folio" case is as fast as possible
* by keeping the batching logic separate.
*/
if (unlikely(folio_test_large(folio) && max_nr != 1)) {
- nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
- NULL, NULL, NULL);
-
+ nr = folio_pte_batch(folio, pte, ptent, max_nr);
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
addr, details, rss, force_flush,
- force_break);
+ force_break, any_skipped);
return nr;
}
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr,
- details, rss, force_flush, force_break);
+ details, rss, force_flush, force_break, any_skipped);
return 1;
}
+static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
+ unsigned int max_nr, unsigned long addr,
+ struct zap_details *details, int *rss, bool *any_skipped)
+{
+ swp_entry_t entry;
+ int nr = 1;
+
+ *any_skipped = true;
+ entry = pte_to_swp_entry(ptent);
+ if (is_device_private_entry(entry) ||
+ is_device_exclusive_entry(entry)) {
+ struct page *page = pfn_swap_entry_to_page(entry);
+ struct folio *folio = page_folio(page);
+
+ if (unlikely(!should_zap_folio(details, folio)))
+ return 1;
+ /*
+ * Both device private/exclusive mappings should only
+ * work with anonymous pages so far, so we don't need to
+ * consider the uffd-wp bit when zapping. For more information,
+ * see zap_install_uffd_wp_if_needed().
+ */
+ WARN_ON_ONCE(!vma_is_anonymous(vma));
+ rss[mm_counter(folio)]--;
+ folio_remove_rmap_pte(folio, page, vma);
+ folio_put(folio);
+ } else if (!non_swap_entry(entry)) {
+ /* Genuine swap entries, hence private anon pages */
+ if (!should_zap_cows(details))
+ return 1;
+
+ nr = swap_pte_batch(pte, max_nr, ptent);
+ rss[MM_SWAPENTS] -= nr;
+ free_swap_and_cache_nr(entry, nr);
+ } else if (is_migration_entry(entry)) {
+ struct folio *folio = pfn_swap_entry_folio(entry);
+
+ if (!should_zap_folio(details, folio))
+ return 1;
+ rss[mm_counter(folio)]--;
+ } else if (pte_marker_entry_uffd_wp(entry)) {
+ /*
+ * For anon: always drop the marker; for file: only
+ * drop the marker if explicitly requested.
+ */
+ if (!vma_is_anonymous(vma) && !zap_drop_markers(details))
+ return 1;
+ } else if (is_guard_swp_entry(entry)) {
+ /*
+ * Ordinary zapping should not remove guard PTE
+ * markers. Only do so if we should remove PTE markers
+ * in general.
+ */
+ if (!zap_drop_markers(details))
+ return 1;
+ } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) {
+ if (!should_zap_cows(details))
+ return 1;
+ } else {
+ /* We should have covered all the swap entry types */
+ pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
+ WARN_ON_ONCE(1);
+ }
+ clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm);
+ *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
+
+ return nr;
+}
+
+static inline int do_zap_pte_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pte_t *pte,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details, int *rss,
+ bool *force_flush, bool *force_break,
+ bool *any_skipped)
+{
+ pte_t ptent = ptep_get(pte);
+ int max_nr = (end - addr) / PAGE_SIZE;
+ int nr = 0;
+
+ /* Skip all consecutive none ptes */
+ if (pte_none(ptent)) {
+ for (nr = 1; nr < max_nr; nr++) {
+ ptent = ptep_get(pte + nr);
+ if (!pte_none(ptent))
+ break;
+ }
+ max_nr -= nr;
+ if (!max_nr)
+ return nr;
+ pte += nr;
+ addr += nr * PAGE_SIZE;
+ }
+
+ if (pte_present(ptent))
+ nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr,
+ details, rss, force_flush, force_break,
+ any_skipped);
+ else
+ nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr,
+ details, rss, any_skipped);
+
+ return nr;
+}
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
@@ -1598,9 +1701,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
- swp_entry_t entry;
+ pmd_t pmdval;
+ unsigned long start = addr;
+ bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
+ bool direct_reclaim = true;
int nr;
+retry:
tlb_change_page_size(tlb, PAGE_SIZE);
init_rss_vec(rss);
start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
@@ -1610,90 +1717,35 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
- pte_t ptent = ptep_get(pte);
- struct folio *folio;
- struct page *page;
- int max_nr;
-
- nr = 1;
- if (pte_none(ptent))
- continue;
+ bool any_skipped = false;
- if (need_resched())
+ if (need_resched()) {
+ direct_reclaim = false;
break;
-
- if (pte_present(ptent)) {
- max_nr = (end - addr) / PAGE_SIZE;
- nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
- addr, details, rss, &force_flush,
- &force_break);
- if (unlikely(force_break)) {
- addr += nr * PAGE_SIZE;
- break;
- }
- continue;
}
- entry = pte_to_swp_entry(ptent);
- if (is_device_private_entry(entry) ||
- is_device_exclusive_entry(entry)) {
- page = pfn_swap_entry_to_page(entry);
- folio = page_folio(page);
- if (unlikely(!should_zap_folio(details, folio)))
- continue;
- /*
- * Both device private/exclusive mappings should only
- * work with anonymous page so far, so we don't need to
- * consider uffd-wp bit when zap. For more information,
- * see zap_install_uffd_wp_if_needed().
- */
- WARN_ON_ONCE(!vma_is_anonymous(vma));
- rss[mm_counter(folio)]--;
- if (is_device_private_entry(entry))
- folio_remove_rmap_pte(folio, page, vma);
- folio_put(folio);
- } else if (!non_swap_entry(entry)) {
- max_nr = (end - addr) / PAGE_SIZE;
- nr = swap_pte_batch(pte, max_nr, ptent);
- /* Genuine swap entries, hence a private anon pages */
- if (!should_zap_cows(details))
- continue;
- rss[MM_SWAPENTS] -= nr;
- free_swap_and_cache_nr(entry, nr);
- } else if (is_migration_entry(entry)) {
- folio = pfn_swap_entry_folio(entry);
- if (!should_zap_folio(details, folio))
- continue;
- rss[mm_counter(folio)]--;
- } else if (pte_marker_entry_uffd_wp(entry)) {
- /*
- * For anon: always drop the marker; for file: only
- * drop the marker if explicitly requested.
- */
- if (!vma_is_anonymous(vma) &&
- !zap_drop_markers(details))
- continue;
- } else if (is_guard_swp_entry(entry)) {
- /*
- * Ordinary zapping should not remove guard PTE
- * markers. Only do so if we should remove PTE markers
- * in general.
- */
- if (!zap_drop_markers(details))
- continue;
- } else if (is_hwpoison_entry(entry) ||
- is_poisoned_swp_entry(entry)) {
- if (!should_zap_cows(details))
- continue;
- } else {
- /* We should have covered all the swap entry types */
- pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
- WARN_ON_ONCE(1);
+ nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss,
+ &force_flush, &force_break, &any_skipped);
+ if (any_skipped)
+ can_reclaim_pt = false;
+ if (unlikely(force_break)) {
+ addr += nr * PAGE_SIZE;
+ direct_reclaim = false;
+ break;
}
- clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
- zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
+ /*
+ * Fast path: try to hold the pmd lock and unmap the PTE page.
+ *
+ * If the pte lock was released midway (retry case), or if the attempt
+ * to hold the pmd lock failed, then we need to recheck all pte entries
+ * to ensure they are still none, thereby preventing the pte entries
+ * from being repopulated by another thread.
+ */
+ if (can_reclaim_pt && direct_reclaim && addr == end)
+ direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);
+
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
@@ -1713,6 +1765,20 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
if (force_flush)
tlb_flush_mmu(tlb);
+ if (addr != end) {
+ cond_resched();
+ force_flush = false;
+ force_break = false;
+ goto retry;
+ }
+
+ if (can_reclaim_pt) {
+ if (direct_reclaim)
+ free_pte(mm, start, tlb, pmdval);
+ else
+ try_to_free_pte(mm, pmd, start, tlb);
+ }
+
return addr;
}
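
Note on the restructured zap_pte_range() above: when the walk breaks out early (need_resched() or force_break), the PTE mapping, lazy MMU mode and lock are dropped, any pending TLB flush is issued, and the walk resumes from addr via the retry label. The now-empty page table itself is only freed when PT reclaim is enabled for this zap and no entry in [start, end) was skipped (can_reclaim_pt stays true): directly via free_pte() when the pmd could be cleared on the fast path, otherwise via try_to_free_pte(), which rechecks the entries under the pmd lock as described in the comment above.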
@@ -1727,9 +1793,9 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+ if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
- __split_huge_pmd(vma, pmd, addr, false, NULL);
+ __split_huge_pmd(vma, pmd, addr, false);
else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
addr = next;
continue;
@@ -1769,7 +1835,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
+ if (pud_trans_huge(*pud)) {
if (next - addr != HPAGE_PUD_SIZE) {
mmap_assert_locked(tlb->mm);
split_huge_pud(vma, pud, addr);
@@ -1844,9 +1910,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
if (vma->vm_file)
uprobe_munmap(vma, start, end);
- if (unlikely(vma->vm_flags & VM_PFNMAP))
- untrack_pfn(vma, 0, 0, mm_wr_locked);
-
if (start != end) {
if (unlikely(is_vm_hugetlb_page(vma))) {
/*
@@ -1920,36 +1983,64 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
}
/**
- * zap_page_range_single - remove user pages in a given range
+ * zap_page_range_single_batched - remove user pages in a given range
+ * @tlb: pointer to the caller's struct mmu_gather
* @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
- * @size: number of bytes to zap
+ * @address: starting address of pages to remove
+ * @size: number of bytes to remove
* @details: details of shared cache invalidation
*
- * The range must fit into one VMA.
+ * @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for
+ * hugetlb, @tlb is flushed and re-initialized by this function.
*/
-void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range_single_batched(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
const unsigned long end = address + size;
struct mmu_notifier_range range;
- struct mmu_gather tlb;
- lru_add_drain();
+ VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm);
+
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
hugetlb_zap_begin(vma, &range.start, &range.end);
- tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
/*
* unmap 'address-end' not 'range.start-range.end' as range
* could have been expanded for hugetlb pmd sharing.
*/
- unmap_single_vma(&tlb, vma, address, end, details, false);
+ unmap_single_vma(tlb, vma, address, end, details, false);
mmu_notifier_invalidate_range_end(&range);
+ if (is_vm_hugetlb_page(vma)) {
+ /*
+ * Flush the TLB and free resources before hugetlb_zap_end(), to
+ * avoid allocation failures in concurrent page faults.
+ */
+ tlb_finish_mmu(tlb);
+ hugetlb_zap_end(vma, details);
+ tlb_gather_mmu(tlb, vma->vm_mm);
+ }
+}
+
+/**
+ * zap_page_range_single - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ * @details: details of shared cache invalidation
+ *
+ * The range must fit into one VMA.
+ */
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size, struct zap_details *details)
+{
+ struct mmu_gather tlb;
+
+ tlb_gather_mmu(&tlb, vma->vm_mm);
+ zap_page_range_single_batched(&tlb, vma, address, size, details);
tlb_finish_mmu(&tlb);
- hugetlb_zap_end(vma, details);
}
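
Splitting out zap_page_range_single_batched() lets a caller amortize a single mmu_gather over several zap operations instead of paying tlb_gather_mmu()/tlb_finish_mmu() per call, which zap_page_range_single() still does for the one-shot case. A minimal sketch of such a caller follows; the helper name and struct (zap_many_ranges, struct zap_req) are invented for illustration, each sub-range must still fit inside the VMA, and only the tlb_gather_mmu()/zap_page_range_single_batched()/tlb_finish_mmu() calls come from the code above.

/* Illustrative sketch only, not part of the patch. */
struct zap_req {
	unsigned long addr;
	unsigned long size;
};

static void zap_many_ranges(struct vm_area_struct *vma,
			    const struct zap_req *reqs, int nr_reqs)
{
	struct mmu_gather tlb;
	int i;

	/* One mmu_gather for the whole batch. */
	tlb_gather_mmu(&tlb, vma->vm_mm);
	for (i = 0; i < nr_reqs; i++)
		zap_page_range_single_batched(&tlb, vma, reqs[i].addr,
					      reqs[i].size, NULL);
	tlb_finish_mmu(&tlb);
}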
/**
@@ -2056,19 +2147,39 @@ static int validate_page_before_insert(struct vm_area_struct *vma,
}
static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
- unsigned long addr, struct page *page, pgprot_t prot)
+ unsigned long addr, struct page *page,
+ pgprot_t prot, bool mkwrite)
{
struct folio *folio = page_folio(page);
- pte_t pteval;
+ pte_t pteval = ptep_get(pte);
+
+ if (!pte_none(pteval)) {
+ if (!mkwrite)
+ return -EBUSY;
+
+ /* see insert_pfn(). */
+ if (pte_pfn(pteval) != page_to_pfn(page)) {
+ WARN_ON_ONCE(!is_zero_pfn(pte_pfn(pteval)));
+ return -EFAULT;
+ }
+ pteval = maybe_mkwrite(pteval, vma);
+ pteval = pte_mkyoung(pteval);
+ if (ptep_set_access_flags(vma, addr, pte, pteval, 1))
+ update_mmu_cache(vma, addr, pte);
+ return 0;
+ }
- if (!pte_none(ptep_get(pte)))
- return -EBUSY;
/* Ok, finally just insert the thing.. */
pteval = mk_pte(page, prot);
if (unlikely(is_zero_folio(folio))) {
pteval = pte_mkspecial(pteval);
} else {
folio_get(folio);
+ pteval = mk_pte(page, prot);
+ if (mkwrite) {
+ pteval = pte_mkyoung(pteval);
+ pteval = maybe_mkwrite(pte_mkdirty(pteval), vma);
+ }
inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
folio_add_file_rmap_pte(folio, page, vma);
}
@@ -2077,7 +2188,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
}
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
- struct page *page, pgprot_t prot)
+ struct page *page, pgprot_t prot, bool mkwrite)
{
int retval;
pte_t *pte;
@@ -2090,7 +2201,8 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
pte = get_locked_pte(vma->vm_mm, addr, &ptl);
if (!pte)
goto out;
- retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
+ retval = insert_page_into_pte_locked(vma, pte, addr, page, prot,
+ mkwrite);
pte_unmap_unlock(pte, ptl);
out:
return retval;
@@ -2104,7 +2216,7 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
err = validate_page_before_insert(vma, page);
if (err)
return err;
- return insert_page_into_pte_locked(vma, pte, addr, page, prot);
+ return insert_page_into_pte_locked(vma, pte, addr, page, prot, false);
}
/* insert_pages() amortizes the cost of spinlock operations
@@ -2240,7 +2352,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
BUG_ON(vma->vm_flags & VM_PFNMAP);
vm_flags_set(vma, VM_MIXEDMAP);
}
- return insert_page(vma, addr, page, vma->vm_page_prot);
+ return insert_page(vma, addr, page, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vm_insert_page);
@@ -2328,7 +2440,7 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
EXPORT_SYMBOL(vm_map_pages_zero);
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, pgprot_t prot, bool mkwrite)
+ unsigned long pfn, pgprot_t prot, bool mkwrite)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte, entry;
@@ -2350,7 +2462,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
* allocation and mapping invalidation so just skip the
* update.
*/
- if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
+ if (pte_pfn(entry) != pfn) {
WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
goto out_unlock;
}
@@ -2363,10 +2475,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
}
/* Ok, finally just insert the thing.. */
- if (pfn_t_devmap(pfn))
- entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
- else
- entry = pte_mkspecial(pfn_t_pte(pfn, prot));
+ entry = pte_mkspecial(pfn_pte(pfn, prot));
if (mkwrite) {
entry = pte_mkyoung(entry);
@@ -2435,10 +2544,9 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
if (!pfn_modify_allowed(pfn, pgprot))
return VM_FAULT_SIGBUS;
- track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
+ pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
- false);
+ return insert_pfn(vma, addr, pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);
@@ -2469,25 +2577,22 @@ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vmf_insert_pfn);
-static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite)
+static bool vm_mixed_ok(struct vm_area_struct *vma, unsigned long pfn,
+ bool mkwrite)
{
- if (unlikely(is_zero_pfn(pfn_t_to_pfn(pfn))) &&
+ if (unlikely(is_zero_pfn(pfn)) &&
(mkwrite || !vm_mixed_zeropage_allowed(vma)))
return false;
/* these checks mirror the abort conditions in vm_normal_page */
if (vma->vm_flags & VM_MIXEDMAP)
return true;
- if (pfn_t_devmap(pfn))
- return true;
- if (pfn_t_special(pfn))
- return true;
- if (is_zero_pfn(pfn_t_to_pfn(pfn)))
+ if (is_zero_pfn(pfn))
return true;
return false;
}
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn, bool mkwrite)
+ unsigned long addr, unsigned long pfn, bool mkwrite)
{
pgprot_t pgprot = vma->vm_page_prot;
int err;
@@ -2498,9 +2603,9 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
- track_pfn_insert(vma, &pgprot, pfn);
+ pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
+ if (!pfn_modify_allowed(pfn, pgprot))
return VM_FAULT_SIGBUS;
/*
@@ -2510,8 +2615,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
* without pte special, it would there be refcounted as a normal page.
*/
- if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
- !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pfn_valid(pfn)) {
struct page *page;
/*
@@ -2519,8 +2623,8 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* regardless of whether the caller specified flags that
* result in pfn_t_has_page() == false.
*/
- page = pfn_to_page(pfn_t_to_pfn(pfn));
- err = insert_page(vma, addr, page, pgprot);
+ page = pfn_to_page(pfn);
+ err = insert_page(vma, addr, page, pgprot, mkwrite);
} else {
return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
}
@@ -2533,8 +2637,28 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
return VM_FAULT_NOPAGE;
}
+vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
+ bool write)
+{
+ pgprot_t pgprot = vmf->vma->vm_page_prot;
+ unsigned long addr = vmf->address;
+ int err;
+
+ if (addr < vmf->vma->vm_start || addr >= vmf->vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ err = insert_page(vmf->vma, addr, page, pgprot, write);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (err < 0 && err != -EBUSY)
+ return VM_FAULT_SIGBUS;
+
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite);
+
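vmf_insert_page_mkwrite() gives fault handlers a way to map a refcounted page (rather than a bare PFN) and, on write faults, to install it writable and dirty, matching the mkwrite handling added to insert_page_into_pte_locked() above. A hedged sketch of a fault handler using it follows; my_get_page() and the assumption that the VMA was prepared appropriately at mmap time (e.g. VM_MIXEDMAP) are illustrative and not taken from this patch.

/* Illustrative sketch only, not part of the patch. */
static vm_fault_t my_fault(struct vm_fault *vmf)
{
	/* my_get_page() is a made-up lookup of the backing page. */
	struct page *page = my_get_page(vmf->vma, vmf->pgoff);

	if (!page)
		return VM_FAULT_SIGBUS;

	return vmf_insert_page_mkwrite(vmf, page,
				       vmf->flags & FAULT_FLAG_WRITE);
}

Passing write=true makes insert_page_into_pte_locked() mark the PTE young and dirty, and upgrade an already-present mapping of the same page instead of returning -EBUSY.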
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
+ unsigned long pfn)
{
return __vm_insert_mixed(vma, addr, pfn, false);
}
@@ -2546,7 +2670,7 @@ EXPORT_SYMBOL(vmf_insert_mixed);
* the same entry was actually inserted.
*/
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn)
+ unsigned long addr, unsigned long pfn)
{
return __vm_insert_mixed(vma, addr, pfn, true);
}
@@ -2723,6 +2847,36 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
return error;
}
+#ifdef __HAVE_PFNMAP_TRACKING
+static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn,
+ unsigned long size, pgprot_t *prot)
+{
+ struct pfnmap_track_ctx *ctx;
+
+ if (pfnmap_track(pfn, size, prot))
+ return ERR_PTR(-EINVAL);
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (unlikely(!ctx)) {
+ pfnmap_untrack(pfn, size);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ctx->pfn = pfn;
+ ctx->size = size;
+ kref_init(&ctx->kref);
+ return ctx;
+}
+
+void pfnmap_track_ctx_release(struct kref *ref)
+{
+ struct pfnmap_track_ctx *ctx = container_of(ref, struct pfnmap_track_ctx, kref);
+
+ pfnmap_untrack(ctx->pfn, ctx->size);
+ kfree(ctx);
+}
+#endif /* __HAVE_PFNMAP_TRACKING */
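
The kref on the tracking context is what lets it outlive VMA splits: the comment in remap_pfn_range() below spells out that the whole pfn range stays tracked across splits and partial unmaps, and the expectation (handled in the VMA copy/teardown paths elsewhere, not in this hunk) is that duplicating a VMA takes another reference on vma->pfnmap_track_ctx while unmapping drops one, so pfnmap_untrack() in pfnmap_track_ctx_release() runs exactly once, when the last VMA referencing the range goes away.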
+
/**
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
@@ -2735,20 +2889,51 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
*
* Return: %0 on success, negative error code otherwise.
*/
+#ifdef __HAVE_PFNMAP_TRACKING
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
+ struct pfnmap_track_ctx *ctx = NULL;
int err;
- err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
- if (err)
+ size = PAGE_ALIGN(size);
+
+ /*
+ * If we cover the full VMA, we'll perform actual tracking, and
+ * remember to untrack when the last reference to our tracking
+ * context from a VMA goes away. We'll keep tracking the whole pfn
+ * range even during VMA splits and partial unmapping.
+ *
+ * If we only cover parts of the VMA, we'll only setup the cachemode
+ * in the pgprot for the pfn range.
+ */
+ if (addr == vma->vm_start && addr + size == vma->vm_end) {
+ if (vma->pfnmap_track_ctx)
+ return -EINVAL;
+ ctx = pfnmap_track_ctx_alloc(pfn, size, &prot);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ } else if (pfnmap_setup_cachemode(pfn, size, &prot)) {
return -EINVAL;
+ }
err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
- if (err)
- untrack_pfn(vma, pfn, PAGE_ALIGN(size), true);
+ if (ctx) {
+ if (err)
+ kref_put(&ctx->kref, pfnmap_track_ctx_release);
+ else
+ vma->pfnmap_track_ctx = ctx;
+ }
return err;
}
+
+#else
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ return remap_pfn_range_notrack(vma, addr, pfn, size, prot);
+}
+#endif
EXPORT_SYMBOL(remap_pfn_range);
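
With the rework above, a driver that maps its aperture over the whole VMA gets the cachemode tracking plus automatic untracking for free; a partial mapping only gets the cachemode fixup. The usual caller pattern, sketched below with a made-up device structure (struct my_dev, phys_base) and without bounds checking, is unchanged by this patch.

/* Illustrative sketch only, not part of the patch. */
static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct my_dev *dev = file->private_data;
	unsigned long size = vma->vm_end - vma->vm_start;
	unsigned long pfn = (dev->phys_base >> PAGE_SHIFT) + vma->vm_pgoff;

	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

	/* Covers the full VMA, so the tracking context is attached to it. */
	return remap_pfn_range(vma, vma->vm_start, pfn, size,
			       vma->vm_page_prot);
}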
/**
@@ -2828,11 +3013,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
if (fn) {
do {
if (create || !pte_none(ptep_get(pte))) {
- err = fn(pte++, addr, data);
+ err = fn(pte, addr, data);
if (err)
break;
}
- } while (addr += PAGE_SIZE, addr != end);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
}
*mask |= PGTBL_PTE_MODIFIED;
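
Note on the apply_to_pte_range() change above: the PTE pointer previously advanced only inside fn(pte++), so an entry skipped by the create || !pte_none() test left pte pointing at the same slot while addr moved on; folding the increment into the loop update keeps pte and addr in lockstep on every iteration.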
@@ -3015,7 +3200,6 @@ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
{
return __apply_to_page_range(mm, addr, size, fn, data, false);
}
-EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
/*
* handle_pte_fault chooses page fault handler according to an entry which was
@@ -3414,7 +3598,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
- entry = mk_pte(&new_folio->page, vma->vm_page_prot);
+ entry = folio_mk_pte(new_folio, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (unlikely(unshare)) {
if (pte_soft_dirty(vmf->orig_pte))
@@ -3598,19 +3782,86 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
return ret;
}
-static bool wp_can_reuse_anon_folio(struct folio *folio,
- struct vm_area_struct *vma)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
+ struct vm_area_struct *vma)
{
+ bool exclusive = false;
+
+ /* Let's just free up a large folio if only a single page is mapped. */
+ if (folio_large_mapcount(folio) <= 1)
+ return false;
+
/*
- * We could currently only reuse a subpage of a large folio if no
- * other subpages of the large folios are still mapped. However,
- * let's just consistently not reuse subpages even if we could
- * reuse in that scenario, and give back a large folio a bit
- * sooner.
+ * The assumption for anonymous folios is that each page can only get
+ * mapped once into each MM. The only exceptions are KSM folios, which
+ * are always small.
+ *
+ * Each taken mapcount must be paired with exactly one taken reference,
+ * whereby the refcount must be incremented before the mapcount when
+ * mapping a page, and the refcount must be decremented after the
+ * mapcount when unmapping a page.
+ *
+ * If all folio references are from mappings, and all mappings are in
+ * the page tables of this MM, then this folio is exclusive to this MM.
*/
- if (folio_test_large(folio))
+ if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
+ return false;
+
+ VM_WARN_ON_ONCE(folio_test_ksm(folio));
+
+ if (unlikely(folio_test_swapcache(folio))) {
+ /*
+ * Note: freeing up the swapcache will fail if some PTEs are
+ * still swap entries.
+ */
+ if (!folio_trylock(folio))
+ return false;
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ }
+
+ if (folio_large_mapcount(folio) != folio_ref_count(folio))
return false;
+ /* Stabilize the mapcount vs. refcount and recheck. */
+ folio_lock_large_mapcount(folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio);
+
+ if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
+ goto unlock;
+ if (folio_large_mapcount(folio) != folio_ref_count(folio))
+ goto unlock;
+
+ VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio);
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id &&
+ folio_mm_id(folio, 1) != vma->vm_mm->mm_id);
+
+ /*
+ * Do we need the folio lock? Likely not. If there would have been
+ * references from page migration/swapout, we would have detected
+ * an additional folio reference and never ended up here.
+ */
+ exclusive = true;
+unlock:
+ folio_unlock_large_mapcount(folio);
+ return exclusive;
+}
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
+ struct vm_area_struct *vma)
+{
+ BUILD_BUG();
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
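
A concrete illustration of the exclusivity check above: a 4-page anonymous folio mapped only by this process has folio_large_mapcount() == 4; if folio_ref_count() is also 4 and the shared mm-id bit is clear, every reference is accounted for by this MM's page tables and the write fault may reuse the folio in place. Any extra reference, such as a GUP pin, the swapcache, or a mapping in another MM, makes the counts diverge (or sets the shared bit) and the fault falls back to copying.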
+
+static bool wp_can_reuse_anon_folio(struct folio *folio,
+ struct vm_area_struct *vma)
+{
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && folio_test_large(folio))
+ return __wp_can_reuse_large_anon_folio(folio, vma);
+
/*
* We have to verify under folio lock: these early checks are
* just an optimization to avoid locking the folio and freeing
@@ -3719,13 +3970,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
- * VM_PFNMAP VMA.
+ * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called.
*
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable and/or call ops->pfn_mkwrite.
*/
- if (!vmf->page)
+ if (!vmf->page || is_fsdax_page(vmf->page)) {
+ vmf->page = NULL;
return wp_pfn_shared(vmf);
+ }
return wp_page_shared(vmf, folio);
}
@@ -3915,7 +4168,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
folio_put(folio);
return ret;
}
- mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_CLEAR, 0,
vma->vm_mm, vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
mmu_notifier_invalidate_range_start(&range);
@@ -3923,7 +4176,8 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
- restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
+ restore_exclusive_pte(vma, folio, vmf->page, vmf->address,
+ vmf->pte, vmf->orig_pte);
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -4045,26 +4299,6 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
- struct swap_info_struct *si = swp_swap_info(entry);
- pgoff_t offset = swp_offset(entry);
- int i;
-
- /*
- * While allocating a large folio and doing swap_read_folio, which is
- * the case the being faulted pte doesn't have swapcache. We need to
- * ensure all PTEs have no cache as well, otherwise, we might go to
- * swap devices while the content is in swapcache.
- */
- for (i = 0; i < max_nr; i++) {
- if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
- return i;
- }
-
- return i;
-}
-
/*
* Check if the PTEs within a range are contiguous swap entries
* and have consistent swapcache, zeromap.
@@ -4191,8 +4425,10 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
gfp, entry))
return folio;
+ count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
folio_put(folio);
}
+ count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
order = next_order(&orders, order);
}
@@ -4269,10 +4505,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Get a page reference while we know the page can't be
* freed.
*/
- get_page(vmf->page);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
- put_page(vmf->page);
+ if (trylock_page(vmf->page)) {
+ struct dev_pagemap *pgmap;
+
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ pgmap = page_pgmap(vmf->page);
+ ret = pgmap->ops->migrate_to_ram(vmf);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ } else {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ }
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else if (is_pte_marker_entry(entry)) {
@@ -4326,7 +4570,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
}
need_clear_cache = true;
- mem_cgroup_swapin_uncharge_swap(entry, nr_pages);
+ memcg1_swapin(entry, nr_pages);
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
@@ -4390,8 +4634,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/*
* KSM sometimes has to copy on read faults, for example, if
- * page->index of !PageKSM() pages would be nonlinear inside the
- * anon VMA -- PageKSM() is lost on actual swapout.
+ * folio->index of non-ksm folios would be nonlinear inside the
+ * anon VMA -- the ksm flag is lost on actual swapout.
*/
folio = ksm_might_need_to_copy(folio, vma, vmf->address);
if (unlikely(!folio)) {
@@ -4824,7 +5068,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
*/
__folio_mark_uptodate(folio);
- entry = mk_pte(&folio->page, vma->vm_page_prot);
+ entry = folio_mk_pte(folio, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry), vma);
@@ -4949,9 +5193,8 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
vmf->prealloc_pte = NULL;
}
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page)
{
- struct folio *folio = page_folio(page);
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
@@ -4999,7 +5242,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
flush_icache_pages(vma, page, HPAGE_PMD_NR);
- entry = mk_huge_pmd(page, vma->vm_page_prot);
+ entry = folio_mk_pmd(folio, vma->vm_page_prot);
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -5024,7 +5267,7 @@ out:
return ret;
}
#else
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page)
{
return VM_FAULT_FALLBACK;
}
@@ -5056,6 +5299,8 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ else if (pte_write(entry) && folio_test_dirty(folio))
+ entry = pte_mkdirty(entry);
if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
entry = pte_mkuffd_wp(entry);
/* copy-on-write page */
@@ -5116,6 +5361,7 @@ fallback:
else
page = vmf->page;
+ folio = page_folio(page);
/*
* check even for read faults because we might have lost our CoWed
* page
@@ -5126,9 +5372,26 @@ fallback:
return ret;
}
+ if (!needs_fallback && vma->vm_file) {
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t file_end;
+
+ file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+
+ /*
+ * Do not allow to map with PTEs beyond i_size and with PMD
+ * across i_size to preserve SIGBUS semantics.
+ *
+ * Make an exception for shmem/tmpfs, which has long been
+ * intentionally mapped with PMDs across i_size.
+ */
+ needs_fallback = !shmem_mapping(mapping) &&
+ file_end < folio_next_index(folio);
+ }
+
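A worked example for the i_size check above: with i_size = 9 MiB, file_end = DIV_ROUND_UP(9 MiB, 4 KiB) = 2304. A 2 MiB (512-page) folio covering indices 2048-2559 has folio_next_index() == 2560, so file_end < folio_next_index() and mapping it with a PMD would expose pages beyond EOF without SIGBUS; the fault falls back to PTE mappings instead. shmem/tmpfs is exempted because it has long mapped PMDs across i_size.
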
if (pmd_none(*vmf->pmd)) {
- if (PageTransCompound(page)) {
- ret = do_set_pmd(vmf, page);
+ if (!needs_fallback && folio_test_pmd_mappable(folio)) {
+ ret = do_set_pmd(vmf, folio, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
}
@@ -5139,15 +5402,14 @@ fallback:
return VM_FAULT_OOM;
}
- folio = page_folio(page);
nr_pages = folio_nr_pages(folio);
/*
* Using per-page fault to maintain the uffd semantics, and same
- * approach also applies to non-anonymous-shmem faults to avoid
+ * approach also applies to non-shmem/tmpfs faults to avoid
* inflating the RSS of the process.
*/
- if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
+ if (!vma_is_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
unlikely(needs_fallback)) {
nr_pages = 1;
} else if (nr_pages > 1) {
@@ -5495,7 +5757,7 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
* Flag if the folio is shared between multiple address spaces. This
* is later used when determining whether to group tasks together
*/
- if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
+ if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
*flags |= TNF_SHARED;
/*
* For memory tiering mode, cpupid of slow memory page is used
@@ -5632,7 +5894,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
ignore_writable = true;
/* Migrate to the requested node */
- if (!migrate_misplaced_folio(folio, vma, target_nid)) {
+ if (!migrate_misplaced_folio(folio, target_nid)) {
nid = target_nid;
flags |= TNF_MIGRATED;
task_numa_fault(last_cpupid, nid, nr_pages, flags);
@@ -5703,7 +5965,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
split:
/* COW or write-notify handled on pte level: split pmd. */
- __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false);
return VM_FAULT_FALLBACK;
}
@@ -5867,7 +6129,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
.gfp_mask = __get_fault_gfp_mask(vma),
};
struct mm_struct *mm = vma->vm_mm;
- unsigned long vm_flags = vma->vm_flags;
+ vm_flags_t vm_flags = vma->vm_flags;
pgd_t *pgd;
p4d_t *p4d;
vm_fault_t ret;
@@ -5891,7 +6153,7 @@ retry_pud:
pud_t orig_pud = *vmf.pud;
barrier();
- if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
+ if (pud_trans_huge(orig_pud)) {
/*
* TODO once we support anonymous PUDs: NUMA case and
@@ -5932,7 +6194,7 @@ retry_pud:
pmd_migration_entry_wait(mm, vmf.pmd);
return 0;
}
- if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
+ if (pmd_trans_huge(vmf.orig_pmd)) {
if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf);
@@ -6076,7 +6338,8 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
}
/*
- * By the time we get here, we already hold the mm semaphore
+ * By the time we get here, we already hold either the VMA lock or the
+ * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which).
*
* The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __folio_lock_or_retry().
@@ -6148,173 +6411,6 @@ out:
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
-#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
-#include <linux/extable.h>
-
-static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
-{
- if (likely(mmap_read_trylock(mm)))
- return true;
-
- if (regs && !user_mode(regs)) {
- unsigned long ip = exception_ip(regs);
- if (!search_exception_tables(ip))
- return false;
- }
-
- return !mmap_read_lock_killable(mm);
-}
-
-static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
-{
- /*
- * We don't have this operation yet.
- *
- * It should be easy enough to do: it's basically a
- * atomic_long_try_cmpxchg_acquire()
- * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
- * it also needs the proper lockdep magic etc.
- */
- return false;
-}
-
-static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
-{
- mmap_read_unlock(mm);
- if (regs && !user_mode(regs)) {
- unsigned long ip = exception_ip(regs);
- if (!search_exception_tables(ip))
- return false;
- }
- return !mmap_write_lock_killable(mm);
-}
-
-/*
- * Helper for page fault handling.
- *
- * This is kind of equivalend to "mmap_read_lock()" followed
- * by "find_extend_vma()", except it's a lot more careful about
- * the locking (and will drop the lock on failure).
- *
- * For example, if we have a kernel bug that causes a page
- * fault, we don't want to just use mmap_read_lock() to get
- * the mm lock, because that would deadlock if the bug were
- * to happen while we're holding the mm lock for writing.
- *
- * So this checks the exception tables on kernel faults in
- * order to only do this all for instructions that are actually
- * expected to fault.
- *
- * We can also actually take the mm lock for writing if we
- * need to extend the vma, which helps the VM layer a lot.
- */
-struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
- unsigned long addr, struct pt_regs *regs)
-{
- struct vm_area_struct *vma;
-
- if (!get_mmap_lock_carefully(mm, regs))
- return NULL;
-
- vma = find_vma(mm, addr);
- if (likely(vma && (vma->vm_start <= addr)))
- return vma;
-
- /*
- * Well, dang. We might still be successful, but only
- * if we can extend a vma to do so.
- */
- if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
- mmap_read_unlock(mm);
- return NULL;
- }
-
- /*
- * We can try to upgrade the mmap lock atomically,
- * in which case we can continue to use the vma
- * we already looked up.
- *
- * Otherwise we'll have to drop the mmap lock and
- * re-take it, and also look up the vma again,
- * re-checking it.
- */
- if (!mmap_upgrade_trylock(mm)) {
- if (!upgrade_mmap_lock_carefully(mm, regs))
- return NULL;
-
- vma = find_vma(mm, addr);
- if (!vma)
- goto fail;
- if (vma->vm_start <= addr)
- goto success;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto fail;
- }
-
- if (expand_stack_locked(vma, addr))
- goto fail;
-
-success:
- mmap_write_downgrade(mm);
- return vma;
-
-fail:
- mmap_write_unlock(mm);
- return NULL;
-}
-#endif
-
-#ifdef CONFIG_PER_VMA_LOCK
-/*
- * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
- * stable and not isolated. If the VMA is not found or is being modified the
- * function returns NULL.
- */
-struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
- unsigned long address)
-{
- MA_STATE(mas, &mm->mm_mt, address, address);
- struct vm_area_struct *vma;
-
- rcu_read_lock();
-retry:
- vma = mas_walk(&mas);
- if (!vma)
- goto inval;
-
- if (!vma_start_read(vma))
- goto inval;
-
- /* Check if the VMA got isolated after we found it */
- if (vma->detached) {
- vma_end_read(vma);
- count_vm_vma_lock_event(VMA_LOCK_MISS);
- /* The area was replaced with another one */
- goto retry;
- }
- /*
- * At this point, we have a stable reference to a VMA: The VMA is
- * locked and we know it hasn't already been isolated.
- * From here on, we can access the VMA without worrying about which
- * fields are accessible for RCU readers.
- */
-
- /* Check since vm_start/vm_end might change before we lock the VMA */
- if (unlikely(address < vma->vm_start || address >= vma->vm_end))
- goto inval_end_read;
-
- rcu_read_unlock();
- return vma;
-
-inval_end_read:
- vma_end_read(vma);
-inval:
- rcu_read_unlock();
- count_vm_vma_lock_event(VMA_LOCK_ABORT);
- return NULL;
-}
-#endif /* CONFIG_PER_VMA_LOCK */
-
#ifndef __PAGETABLE_P4D_FOLDED
/*
* Allocate p4d page table.
@@ -6395,6 +6491,7 @@ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
args->lock = lock;
args->ptep = ptep;
args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT);
+ args->addr_mask = addr_mask;
args->pgprot = pgprot;
args->writable = writable;
args->special = special;
@@ -6554,7 +6651,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write)
{
resource_size_t phys_addr;
- unsigned long prot = 0;
+ pgprot_t prot = __pgprot(0);
void __iomem *maddr;
int offset = offset_in_page(addr);
int ret = -EINVAL;
@@ -6564,7 +6661,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
retry:
if (follow_pfnmap_start(&args))
return -EINVAL;
- prot = pgprot_val(args.pgprot);
+ prot = args.pgprot;
phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT;
writable = args.writable;
follow_pfnmap_end(&args);
@@ -6579,7 +6676,7 @@ retry:
if (follow_pfnmap_start(&args))
goto out_unmap;
- if ((prot != pgprot_val(args.pgprot)) ||
+ if ((pgprot_val(prot) != pgprot_val(args.pgprot)) ||
(phys_addr != (args.pfn << PAGE_SHIFT)) ||
(writable != args.writable)) {
follow_pfnmap_end(&args);
@@ -6624,6 +6721,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
while (len) {
int bytes, offset;
void *maddr;
+ struct folio *folio;
struct vm_area_struct *vma = NULL;
struct page *page = get_user_page_vma_remote(mm, addr,
gup_flags, &vma);
@@ -6655,21 +6753,22 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
if (bytes <= 0)
break;
} else {
+ folio = page_folio(page);
bytes = len;
offset = addr & (PAGE_SIZE-1);
if (bytes > PAGE_SIZE-offset)
bytes = PAGE_SIZE-offset;
- maddr = kmap_local_page(page);
+ maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE);
if (write) {
copy_to_user_page(vma, page, addr,
maddr + offset, buf, bytes);
- set_page_dirty_lock(page);
+ folio_mark_dirty_lock(folio);
} else {
copy_from_user_page(vma, page, addr,
buf, maddr + offset, bytes);
}
- unmap_and_put_page(page, maddr);
+ folio_release_kmap(folio, maddr);
}
len -= bytes;
buf += bytes;
@@ -6721,6 +6820,126 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr,
}
EXPORT_SYMBOL_GPL(access_process_vm);
+#ifdef CONFIG_BPF_SYSCALL
+/*
+ * Copy a string from another process's address space as given in mm.
+ * If there is any error return -EFAULT.
+ */
+static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ void *old_buf = buf;
+ int err = 0;
+
+ *(char *)buf = '\0';
+
+ if (mmap_read_lock_killable(mm))
+ return -EFAULT;
+
+ addr = untagged_addr_remote(mm, addr);
+
+ /* Avoid triggering the temporary warning in __get_user_pages */
+ if (!vma_lookup(mm, addr)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ while (len) {
+ int bytes, offset, retval;
+ void *maddr;
+ struct folio *folio;
+ struct page *page;
+ struct vm_area_struct *vma = NULL;
+
+ page = get_user_page_vma_remote(mm, addr, gup_flags, &vma);
+ if (IS_ERR(page)) {
+ /*
+ * Treat as a total failure for now until we decide how
+ * to handle the CONFIG_HAVE_IOREMAP_PROT case and
+ * stack expansion.
+ */
+ *(char *)buf = '\0';
+ err = -EFAULT;
+ goto out;
+ }
+
+ folio = page_folio(page);
+ bytes = len;
+ offset = addr & (PAGE_SIZE - 1);
+ if (bytes > PAGE_SIZE - offset)
+ bytes = PAGE_SIZE - offset;
+
+ maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE);
+ retval = strscpy(buf, maddr + offset, bytes);
+ if (retval >= 0) {
+ /* Found the end of the string */
+ buf += retval;
+ folio_release_kmap(folio, maddr);
+ break;
+ }
+
+ buf += bytes - 1;
+ /*
+ * Because strscpy() always NUL-terminates, we need to
+ * copy the last byte in the page if we are going to
+ * load more pages.
+ */
+ if (bytes != len) {
+ addr += bytes - 1;
+ copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1);
+ buf += 1;
+ addr += 1;
+ }
+ len -= bytes;
+
+ folio_release_kmap(folio, maddr);
+ }
+
+out:
+ mmap_read_unlock(mm);
+ if (err)
+ return err;
+ return buf - old_buf;
+}
+
+/**
+ * copy_remote_vm_str - copy a string from another process's address space.
+ * @tsk: the task of the target address space
+ * @addr: start address to read from
+ * @buf: destination buffer
+ * @len: number of bytes to copy
+ * @gup_flags: flags modifying lookup behaviour
+ *
+ * The caller must hold a reference on @mm.
+ *
+ * Return: number of bytes copied from @addr (source) to @buf (destination);
+ * not including the trailing NUL. Always guaranteed to leave NUL-terminated
+ * buffer. On any error, return -EFAULT.
+ */
+int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ if (unlikely(len == 0))
+ return 0;
+
+ mm = get_task_mm(tsk);
+ if (!mm) {
+ *(char *)buf = '\0';
+ return -EFAULT;
+ }
+
+ ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags);
+
+ mmput(mm);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(copy_remote_vm_str);
+#endif /* CONFIG_BPF_SYSCALL */
+
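copy_remote_vm_str() is the exported entry point for reading a NUL-terminated string out of another task's address space, as used by the BPF side. A hedged sketch of a caller follows; everything except the copy_remote_vm_str() call itself is invented for illustration.

/* Illustrative sketch only, not part of the patch. */
static int dump_remote_string(struct task_struct *tsk, unsigned long addr)
{
	char buf[256];
	int len;

	len = copy_remote_vm_str(tsk, addr, buf, sizeof(buf), 0);
	if (len < 0)
		return len;	/* -EFAULT on any failure */

	/* buf is NUL-terminated; len excludes the trailing NUL. */
	pr_info("remote string (%d bytes): %s\n", len, buf);
	return 0;
}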
/*
* Print the name of a VMA.
*/
@@ -6753,10 +6972,8 @@ void __might_fault(const char *file, int line)
if (pagefault_disabled())
return;
__might_sleep(file, line);
-#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
might_lock_read(&current->mm->mmap_lock);
-#endif
}
EXPORT_SYMBOL(__might_fault);
#endif
@@ -6968,7 +7185,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc)
void ptlock_free(struct ptdesc *ptdesc)
{
- kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
+ if (ptdesc->ptl)
+ kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
}
#endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c3de35389269..74318c787715 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -35,6 +35,7 @@
#include <linux/compaction.h>
#include <linux/rmap.h>
#include <linux/module.h>
+#include <linux/node.h>
#include <asm/tlbflush.h>
@@ -219,11 +220,30 @@ void put_online_mems(void)
bool movable_node_enabled = false;
-#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-int mhp_default_online_type = MMOP_OFFLINE;
-#else
-int mhp_default_online_type = MMOP_ONLINE;
-#endif
+static int mhp_default_online_type = -1;
+int mhp_get_default_online_type(void)
+{
+ if (mhp_default_online_type >= 0)
+ return mhp_default_online_type;
+
+ if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE))
+ mhp_default_online_type = MMOP_OFFLINE;
+ else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO))
+ mhp_default_online_type = MMOP_ONLINE;
+ else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL))
+ mhp_default_online_type = MMOP_ONLINE_KERNEL;
+ else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE))
+ mhp_default_online_type = MMOP_ONLINE_MOVABLE;
+ else
+ mhp_default_online_type = MMOP_OFFLINE;
+
+ return mhp_default_online_type;
+}
+
+void mhp_set_default_online_type(int online_type)
+{
+ mhp_default_online_type = online_type;
+}
static int __init setup_memhp_default_state(char *str)
{
@@ -650,6 +670,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
* this and the first chunk to online will be pageblock_nr_pages.
*/
for (pfn = start_pfn; pfn < end_pfn;) {
+ struct page *page = pfn_to_page(pfn);
int order;
/*
@@ -664,7 +685,14 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
else
order = MAX_PAGE_ORDER;
- (*online_page_callback)(pfn_to_page(pfn), order);
+ /*
+ * Exposing the page to the buddy by freeing can cause
+ * issues with debug_pagealloc enabled: some archs don't
+ * like double-unmappings. So treat them like any pages that
+ * were allocated from the buddy.
+ */
+ debug_pagealloc_map_pages(page, 1 << order);
+ (*online_page_callback)(page, order);
pfn += (1UL << order);
}
@@ -672,30 +700,6 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
online_mem_sections(start_pfn, end_pfn);
}
-/* check which state of node_states will be changed when online memory */
-static void node_states_check_changes_online(unsigned long nr_pages,
- struct zone *zone, struct memory_notify *arg)
-{
- int nid = zone_to_nid(zone);
-
- arg->status_change_nid = NUMA_NO_NODE;
- arg->status_change_nid_normal = NUMA_NO_NODE;
-
- if (!node_state(nid, N_MEMORY))
- arg->status_change_nid = nid;
- if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
- arg->status_change_nid_normal = nid;
-}
-
-static void node_states_set_node(int node, struct memory_notify *arg)
-{
- if (arg->status_change_nid_normal >= 0)
- node_set_state(node, N_NORMAL_MEMORY);
-
- if (arg->status_change_nid >= 0)
- node_set_state(node, N_MEMORY);
-}
-
static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages)
{
@@ -743,7 +747,8 @@ static inline void section_taint_zone_device(unsigned long pfn)
*/
void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages,
- struct vmem_altmap *altmap, int migratetype)
+ struct vmem_altmap *altmap, int migratetype,
+ bool isolate_pageblock)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nid = pgdat->node_id;
@@ -770,12 +775,13 @@ void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
/*
* TODO now we have a visible range of pages which are not associated
- * with their zone properly. Not nice but set_pfnblock_flags_mask
+ * with their zone properly. Not nice but set_pfnblock_migratetype()
* expects the zone spans the pfn range. All the pages in the range
* are reserved so nobody should be touching them so we should be safe
*/
memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
- MEMINIT_HOTPLUG, altmap, migratetype);
+ MEMINIT_HOTPLUG, altmap, migratetype,
+ isolate_pageblock);
set_zone_contiguous(zone);
}
@@ -1100,7 +1106,8 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
if (mhp_off_inaccessible)
page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages);
- move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE,
+ false);
for (i = 0; i < nr_pages; i++) {
struct page *page = pfn_to_page(pfn + i);
@@ -1146,11 +1153,17 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
int online_pages(unsigned long pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group)
{
- unsigned long flags;
- int need_zonelists_rebuild = 0;
+ struct memory_notify mem_arg = {
+ .start_pfn = pfn,
+ .nr_pages = nr_pages,
+ };
+ struct node_notify node_arg = {
+ .nid = NUMA_NO_NODE,
+ };
const int nid = zone_to_nid(zone);
+ int need_zonelists_rebuild = 0;
+ unsigned long flags;
int ret;
- struct memory_notify arg;
/*
* {on,off}lining is constrained to full memory sections (or more
@@ -1165,13 +1178,19 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
/* associate pfn range with the zone */
- move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
-
- arg.start_pfn = pfn;
- arg.nr_pages = nr_pages;
- node_states_check_changes_online(nr_pages, zone, &arg);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE,
+ true);
+
+ if (!node_state(nid, N_MEMORY)) {
+ /* Adding memory to the node for the first time */
+ node_arg.nid = nid;
+ ret = node_notify(NODE_ADDING_FIRST_MEMORY, &node_arg);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto failed_addition;
+ }
- ret = memory_notify(MEM_GOING_ONLINE, &arg);
+ ret = memory_notify(MEM_GOING_ONLINE, &mem_arg);
ret = notifier_to_errno(ret);
if (ret)
goto failed_addition;
@@ -1197,12 +1216,13 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
online_pages_range(pfn, nr_pages);
adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
- node_states_set_node(nid, &arg);
+ if (node_arg.nid >= 0)
+ node_set_state(nid, N_MEMORY);
if (need_zonelists_rebuild)
build_all_zonelists(NULL);
/* Basic onlining is complete, allow allocation of onlined pages. */
- undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
+ undo_isolate_page_range(pfn, pfn + nr_pages);
/*
* Freshly onlined pages aren't shuffled (e.g., all pages are placed to
@@ -1218,16 +1238,22 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
kswapd_run(nid);
kcompactd_run(nid);
+ if (node_arg.nid >= 0)
+ /* First memory added successfully. Notify consumers. */
+ node_notify(NODE_ADDED_FIRST_MEMORY, &node_arg);
+
writeback_set_ratelimit();
- memory_notify(MEM_ONLINE, &arg);
+ memory_notify(MEM_ONLINE, &mem_arg);
return 0;
failed_addition:
pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
(unsigned long long) pfn << PAGE_SHIFT,
(((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
- memory_notify(MEM_CANCEL_ONLINE, &arg);
+ memory_notify(MEM_CANCEL_ONLINE, &mem_arg);
+ if (node_arg.nid != NUMA_NO_NODE)
+ node_notify(NODE_CANCEL_ADDING_FIRST_MEMORY, &node_arg);
remove_pfn_range_from_zone(zone, pfn, nr_pages);
return ret;
}
@@ -1320,7 +1346,7 @@ static int check_hotplug_memory_range(u64 start, u64 size)
static int online_memory_block(struct memory_block *mem, void *arg)
{
- mem->online_type = mhp_default_online_type;
+ mem->online_type = mhp_get_default_online_type();
return device_online(&mem->dev);
}
@@ -1544,13 +1570,12 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
* We online node here. We can't roll back from here.
*/
node_set_online(nid);
- ret = __register_one_node(nid);
+ ret = register_one_node(nid);
BUG_ON(ret);
}
- register_memory_blocks_under_node(nid, PFN_DOWN(start),
- PFN_UP(start + size - 1),
- MEMINIT_HOTPLUG);
+ register_memory_blocks_under_node_hotplug(nid, PFN_DOWN(start),
+ PFN_UP(start + size - 1));
/* create new memmap entry */
if (!strcmp(res->name, "System RAM"))
@@ -1567,7 +1592,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
merge_system_ram_resource(res);
/* online pages if requested */
- if (mhp_default_online_type != MMOP_OFFLINE)
+ if (mhp_get_default_online_type() != MMOP_OFFLINE)
walk_memory_blocks(start, size, NULL, online_memory_block);
return ret;
@@ -1714,8 +1739,8 @@ bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
- * non-lru movable pages and hugepages). Will skip over most unmovable
+ * Scan pfn range [start,end) to find movable/migratable pages (LRU and
+ * hugetlb folios, movable_ops pages). Will skip over most unmovable
* pages (esp., pages that can be skipped when offlining), but bail out on
* definitely unmovable pages.
*
@@ -1729,20 +1754,16 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
{
unsigned long pfn;
- for (pfn = start; pfn < end; pfn++) {
+ for_each_valid_pfn(pfn, start, end) {
struct page *page;
struct folio *folio;
- if (!pfn_valid(pfn))
- continue;
page = pfn_to_page(pfn);
- if (PageLRU(page))
- goto found;
- if (__PageMovable(page))
+ if (PageLRU(page) || page_has_movable_ops(page))
goto found;
/*
- * PageOffline() pages that are not marked __PageMovable() and
+ * PageOffline() pages that do not have movable_ops and
* have a reference count > 0 (after MEM_GOING_OFFLINE) are
* definitely unmovable. If their reference count would be 0,
* they could at least be skipped when offlining memory.
@@ -1778,33 +1799,30 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ for_each_valid_pfn(pfn, start_pfn, end_pfn) {
struct page *page;
- if (!pfn_valid(pfn))
- continue;
page = pfn_to_page(pfn);
folio = page_folio(page);
- /*
- * No reference or lock is held on the folio, so it might
- * be modified concurrently (e.g. split). As such,
- * folio_nr_pages() may read garbage. This is fine as the outer
- * loop will revisit the split folio later.
- */
- if (folio_test_large(folio))
- pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
-
if (!folio_try_get(folio))
continue;
if (unlikely(page_folio(page) != folio))
goto put_folio;
- if (folio_test_hwpoison(folio) ||
- (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
- if (WARN_ON(folio_test_lru(folio)))
- folio_isolate_lru(folio);
+ if (folio_test_large(folio))
+ pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
+
+ if (folio_contain_hwpoisoned_page(folio)) {
+ /*
+ * unmap_poisoned_folio() cannot handle large folios
+ * in all cases yet.
+ */
+ if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+ goto put_folio;
+ if (folio_test_lru(folio) && !folio_isolate_lru(folio))
+ goto put_folio;
if (folio_mapped(folio)) {
folio_lock(folio);
unmap_poisoned_folio(folio, pfn, false);
@@ -1828,7 +1846,7 @@ put_folio:
nodemask_t nmask = node_states[N_MEMORY];
struct migration_target_control mtc = {
.nmask = &nmask,
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .gfp_mask = GFP_KERNEL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
.reason = MR_MEMORY_HOTPLUG,
};
int ret;
@@ -1870,54 +1888,6 @@ static int __init cmdline_parse_movable_node(char *p)
}
early_param("movable_node", cmdline_parse_movable_node);
-/* check which state of node_states will be changed when offline memory */
-static void node_states_check_changes_offline(unsigned long nr_pages,
- struct zone *zone, struct memory_notify *arg)
-{
- struct pglist_data *pgdat = zone->zone_pgdat;
- unsigned long present_pages = 0;
- enum zone_type zt;
-
- arg->status_change_nid = NUMA_NO_NODE;
- arg->status_change_nid_normal = NUMA_NO_NODE;
-
- /*
- * Check whether node_states[N_NORMAL_MEMORY] will be changed.
- * If the memory to be offline is within the range
- * [0..ZONE_NORMAL], and it is the last present memory there,
- * the zones in that range will become empty after the offlining,
- * thus we can determine that we need to clear the node from
- * node_states[N_NORMAL_MEMORY].
- */
- for (zt = 0; zt <= ZONE_NORMAL; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
- if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
- arg->status_change_nid_normal = zone_to_nid(zone);
-
- /*
- * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
- * does not apply as we don't support 32bit.
- * Here we count the possible pages from ZONE_MOVABLE.
- * If after having accounted all the pages, we see that the nr_pages
- * to be offlined is over or equal to the accounted pages,
- * we know that the node will become empty, and so, we can clear
- * it for N_MEMORY as well.
- */
- present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
-
- if (nr_pages >= present_pages)
- arg->status_change_nid = zone_to_nid(zone);
-}
-
-static void node_states_clear_node(int node, struct memory_notify *arg)
-{
- if (arg->status_change_nid_normal >= 0)
- node_clear_state(node, N_NORMAL_MEMORY);
-
- if (arg->status_change_nid >= 0)
- node_clear_state(node, N_MEMORY);
-}
-
static int count_system_ram_pages_cb(unsigned long start_pfn,
unsigned long nr_pages, void *data)
{
@@ -1933,11 +1903,18 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group)
{
- const unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn, managed_pages, system_ram_pages = 0;
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ struct pglist_data *pgdat = zone->zone_pgdat;
const int node = zone_to_nid(zone);
+ struct memory_notify mem_arg = {
+ .start_pfn = start_pfn,
+ .nr_pages = nr_pages,
+ };
+ struct node_notify node_arg = {
+ .nid = NUMA_NO_NODE,
+ };
unsigned long flags;
- struct memory_notify arg;
char *reason;
int ret;
@@ -1989,19 +1966,28 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
- MIGRATE_MOVABLE,
- MEMORY_OFFLINE | REPORT_FAILURE,
- GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
+ PB_ISOLATE_MODE_MEM_OFFLINE);
if (ret) {
reason = "failure to isolate range";
goto failed_removal_pcplists_disabled;
}
- arg.start_pfn = start_pfn;
- arg.nr_pages = nr_pages;
- node_states_check_changes_offline(nr_pages, zone, &arg);
+ /*
+ * Check whether the node will have no present pages after we offline
+ * 'nr_pages' more. If so, we know that the node will become empty, and
+ * so we will clear N_MEMORY for it.
+ */
+ if (nr_pages >= pgdat->node_present_pages) {
+ node_arg.nid = node;
+ ret = node_notify(NODE_REMOVING_LAST_MEMORY, &node_arg);
+ ret = notifier_to_errno(ret);
+ if (ret) {
+ reason = "node notifier failure";
+ goto failed_removal_isolated;
+ }
+ }
- ret = memory_notify(MEM_GOING_OFFLINE, &arg);
+ ret = memory_notify(MEM_GOING_OFFLINE, &mem_arg);
ret = notifier_to_errno(ret);
if (ret) {
reason = "notifier failure";
@@ -2050,7 +2036,8 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
goto failed_removal_isolated;
}
- ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
+ ret = test_pages_isolated(start_pfn, end_pfn,
+ PB_ISOLATE_MODE_MEM_OFFLINE);
} while (ret);
@@ -2081,27 +2068,32 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
* Make sure to mark the node as memory-less before rebuilding the zone
* list. Otherwise this node would still appear in the fallback lists.
*/
- node_states_clear_node(node, &arg);
+ if (node_arg.nid >= 0)
+ node_clear_state(node, N_MEMORY);
if (!populated_zone(zone)) {
zone_pcp_reset(zone);
build_all_zonelists(NULL);
}
- if (arg.status_change_nid >= 0) {
+ if (node_arg.nid >= 0) {
kcompactd_stop(node);
kswapd_stop(node);
+ /* Node went memoryless. Notify consumers */
+ node_notify(NODE_REMOVED_LAST_MEMORY, &node_arg);
}
writeback_set_ratelimit();
- memory_notify(MEM_OFFLINE, &arg);
+ memory_notify(MEM_OFFLINE, &mem_arg);
remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
return 0;
failed_removal_isolated:
/* pushback to free area */
- undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
- memory_notify(MEM_CANCEL_OFFLINE, &arg);
+ undo_isolate_page_range(start_pfn, end_pfn);
+ memory_notify(MEM_CANCEL_OFFLINE, &mem_arg);
+ if (node_arg.nid != NUMA_NO_NODE)
+ node_notify(NODE_CANCEL_REMOVING_LAST_MEMORY, &node_arg);
failed_removal_pcplists_disabled:
lru_cache_enable();
zone_pcp_enable(zone);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 162407fbf2bc..eb83cff7db8c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -109,10 +109,12 @@
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>
+#include <linux/gcd.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>
+#include <linux/memory.h>
#include "internal.h"
@@ -139,31 +141,138 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
/*
- * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
- * system-default value should be used. A NULL iw_table also denotes that
- * system-default values should be used. Until the system-default table
- * is implemented, the system-default is always 1.
- *
- * iw_table is RCU protected
+ * weightiness balances the tradeoff between small weights (cycles through nodes
+ * faster, more fair/even distribution) and large weights (smaller errors
+ * between actual bandwidth ratios and weight ratios). 32 has been found to
+ * strike a reasonable compromise between the two goals.
*/
-static u8 __rcu *iw_table;
-static DEFINE_MUTEX(iw_table_lock);
+static const int weightiness = 32;
+
+/*
+ * A null weighted_interleave_state is interpreted as having .mode="auto",
+ * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
+ */
+struct weighted_interleave_state {
+ bool mode_auto;
+ u8 iw_table[];
+};
+static struct weighted_interleave_state __rcu *wi_state;
+static unsigned int *node_bw_table;
+
+/*
+ * wi_state_lock protects both wi_state and node_bw_table.
+ * node_bw_table is only used by writers to update wi_state.
+ */
+static DEFINE_MUTEX(wi_state_lock);
static u8 get_il_weight(int node)
{
- u8 *table;
- u8 weight;
+ struct weighted_interleave_state *state;
+ u8 weight = 1;
rcu_read_lock();
- table = rcu_dereference(iw_table);
- /* if no iw_table, use system default */
- weight = table ? table[node] : 1;
- /* if value in iw_table is 0, use system default */
- weight = weight ? weight : 1;
+ state = rcu_dereference(wi_state);
+ if (state)
+ weight = state->iw_table[node];
rcu_read_unlock();
return weight;
}
+/*
+ * Convert bandwidth values into weighted interleave weights.
+ * Call with wi_state_lock held.
+ */
+static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
+{
+ u64 sum_bw = 0;
+ unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ sum_bw += bw[nid];
+
+ /* Scale bandwidths to whole numbers in the range [1, weightiness] */
+ for_each_node_state(nid, N_MEMORY) {
+ /*
+ * Try not to perform 64-bit division.
+ * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
+ * If sum_bw > scaling_factor, then round the weight up to 1.
+ */
+ scaling_factor = weightiness * bw[nid];
+ if (bw[nid] && sum_bw < scaling_factor) {
+ cast_sum_bw = (unsigned int)sum_bw;
+ new_iw[nid] = scaling_factor / cast_sum_bw;
+ } else {
+ new_iw[nid] = 1;
+ }
+ if (!iw_gcd)
+ iw_gcd = new_iw[nid];
+ iw_gcd = gcd(iw_gcd, new_iw[nid]);
+ }
+
+ /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
+ for_each_node_state(nid, N_MEMORY)
+ new_iw[nid] /= iw_gcd;
+}
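
To make the reduction above concrete, here is a userspace-only sketch that mirrors the same arithmetic on made-up bandwidths for two nodes; with a 2:1 bandwidth ratio and weightiness = 32, the scaled weights 21 and 10 share no common factor, so the final interleave weights are 21:10:

#include <stdio.h>

static unsigned int gcd_u(unsigned int a, unsigned int b)
{
	while (b) {
		unsigned int t = a % b;

		a = b;
		b = t;
	}
	return a;
}

int main(void)
{
	const int weightiness = 32;		/* same constant as above */
	unsigned int bw[2] = { 100, 50 };	/* made-up bandwidths, node0:node1 = 2:1 */
	unsigned int iw[2], i, g = 0;
	unsigned long long sum = 0;

	for (i = 0; i < 2; i++)
		sum += bw[i];
	for (i = 0; i < 2; i++) {
		unsigned long long sf = (unsigned long long)weightiness * bw[i];

		/* same rule as reduce_interleave_weights(): floor, minimum 1 */
		iw[i] = (bw[i] && sum < sf) ? sf / sum : 1;
		g = g ? gcd_u(g, iw[i]) : iw[i];
	}
	for (i = 0; i < 2; i++)
		printf("node%u: weight %u\n", i, iw[i] / g);	/* prints 21 and 10 */
	return 0;
}
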
+
+int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
+{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ unsigned int *old_bw, *new_bw;
+ unsigned int bw_val;
+ int i;
+
+ bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
+ new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
+ if (!new_bw)
+ return -ENOMEM;
+
+ new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state) {
+ kfree(new_bw);
+ return -ENOMEM;
+ }
+ new_wi_state->mode_auto = true;
+ for (i = 0; i < nr_node_ids; i++)
+ new_wi_state->iw_table[i] = 1;
+
+ /*
+ * Update bandwidth info, even in manual mode. That way, when switching
+ * to auto mode in the future, iw_table can be overwritten using
+ * accurate bw data.
+ */
+ mutex_lock(&wi_state_lock);
+
+ old_bw = node_bw_table;
+ if (old_bw)
+ memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
+ new_bw[node] = bw_val;
+ node_bw_table = new_bw;
+
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (old_wi_state && !old_wi_state->mode_auto) {
+ /* Manual mode; skip reducing weights and updating wi_state */
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ goto out;
+ }
+
+ /* NULL wi_state assumes auto=true; reduce weights and update wi_state */
+ reduce_interleave_weights(new_bw, new_wi_state->iw_table);
+ rcu_assign_pointer(wi_state, new_wi_state);
+
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+out:
+ kfree(old_bw);
+ return 0;
+}
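
For context, a hedged sketch of the reporting side that would drive auto mode; only mempolicy_set_node_perf() and struct access_coordinate come from the code above, while the helper name, the bandwidth figures, and the assumption that the prototype is exposed via <linux/mempolicy.h> are illustrative:

#include <linux/mempolicy.h>
#include <linux/node.h>

/* Hypothetical: report a node's measured bandwidth (values are made up). */
static int demo_report_node_bw(unsigned int nid)
{
	struct access_coordinate coord = {
		.read_bandwidth  = 200000,
		.write_bandwidth = 100000,
	};

	/* mempolicy uses min(read, write) as the node's effective bandwidth */
	return mempolicy_set_node_perf(nid, &coord);
}
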
+
/**
* numa_nearest_node - Find nearest node by state
* @node: Node id to start the search
@@ -196,6 +305,37 @@ int numa_nearest_node(int node, unsigned int state)
}
EXPORT_SYMBOL_GPL(numa_nearest_node);
+/**
+ * nearest_node_nodemask - Find the node in @mask at the nearest distance
+ * from @node.
+ *
+ * @node: a valid node ID to start the search from.
+ * @mask: a pointer to a nodemask representing the allowed nodes.
+ *
+ * This function iterates over all nodes in @mask and calculates the
+ * distance from the starting @node, then it returns the node ID that is
+ * the closest to @node, or MAX_NUMNODES if no node is found.
+ *
+ * Note that @node must be a valid node ID usable with node_distance(),
+ * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
+ * or unexpected behavior.
+ */
+int nearest_node_nodemask(int node, nodemask_t *mask)
+{
+ int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
+
+ for_each_node_mask(n, *mask) {
+ dist = node_distance(node, n);
+ if (dist < min_dist) {
+ min_dist = dist;
+ min_node = n;
+ }
+ }
+
+ return min_node;
+}
+EXPORT_SYMBOL_GPL(nearest_node_nodemask);
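
A minimal illustrative wrapper (the function name is hypothetical, and the headers are an assumption) that uses the new helper to pick the closest node that currently has memory, translating the MAX_NUMNODES "not found" result back to NUMA_NO_NODE:

#include <linux/nodemask.h>
#include <linux/numa.h>

static int demo_nearest_memory_node(int nid)
{
	int target = nearest_node_nodemask(nid, &node_states[N_MEMORY]);

	return target == MAX_NUMNODES ? NUMA_NO_NODE : target;
}
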
+
struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
@@ -542,6 +682,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
pte_t *pte, *mapped_pte;
pte_t ptent;
spinlock_t *ptl;
+ int max_nr, nr;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -555,7 +696,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
walk->action = ACTION_AGAIN;
return 0;
}
- for (; addr != end; pte++, addr += PAGE_SIZE) {
+ for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
+ max_nr = (end - addr) >> PAGE_SHIFT;
+ nr = 1;
ptent = ptep_get(pte);
if (pte_none(ptent))
continue;
@@ -567,6 +710,8 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
folio = vm_normal_folio(vma, addr, ptent);
if (!folio || folio_is_zone_device(folio))
continue;
+ if (folio_test_large(folio) && max_nr != 1)
+ nr = folio_pte_batch(folio, pte, ptent, max_nr);
/*
* vm_normal_folio() filters out zero pages, but there might
* still be reserved folios to skip, perhaps in a VDSO.
@@ -599,7 +744,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
!vma_migratable(vma) ||
!migrate_folio_add(folio, qp->pagelist, flags)) {
- qp->nr_failed++;
+ qp->nr_failed += nr;
if (strictly_unmovable(flags))
break;
}
@@ -642,12 +787,12 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
* Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
* Choosing not to migrate a shared folio is not counted as a failure.
*
- * See folio_likely_mapped_shared() on possible imprecision when we
+ * See folio_maybe_mapped_shared() on possible imprecision when we
* cannot easily detect if a folio is shared.
*/
if ((flags & MPOL_MF_MOVE_ALL) ||
- (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
- if (!isolate_hugetlb(folio, qp->pagelist))
+ (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
+ if (!folio_isolate_hugetlb(folio, qp->pagelist))
qp->nr_failed++;
unlock:
spin_unlock(ptl);
@@ -1033,10 +1178,10 @@ static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
* Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
* Choosing not to migrate a shared folio is not counted as a failure.
*
- * See folio_likely_mapped_shared() on possible imprecision when we
+ * See folio_maybe_mapped_shared() on possible imprecision when we
* cannot easily detect if a folio is shared.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || !folio_likely_mapped_shared(folio)) {
+ if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
if (folio_isolate_lru(folio)) {
list_add_tail(&folio->lru, foliolist);
node_stat_mod_folio(folio,
@@ -1983,26 +2128,28 @@ static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
+ struct weighted_interleave_state *state;
nodemask_t nodemask;
unsigned int target, nr_nodes;
- u8 *table;
+ u8 *table = NULL;
unsigned int weight_total = 0;
u8 weight;
- int nid;
+ int nid = 0;
nr_nodes = read_once_policy_nodemask(pol, &nodemask);
if (!nr_nodes)
return numa_node_id();
rcu_read_lock();
- table = rcu_dereference(iw_table);
+
+ state = rcu_dereference(wi_state);
+ /* Uninitialized wi_state means we should assume all weights are 1 */
+ if (state)
+ table = state->iw_table;
+
/* calculate the total weight */
- for_each_node_mask(nid, nodemask) {
- /* detect system default usage */
- weight = table ? table[nid] : 1;
- weight = weight ? weight : 1;
- weight_total += weight;
- }
+ for_each_node_mask(nid, nodemask)
+ weight_total += table ? table[nid] : 1;
/* Calculate the node offset based on totals */
target = ilx % weight_total;
@@ -2010,7 +2157,6 @@ static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
while (target) {
/* detect system default usage */
weight = table ? table[nid] : 1;
- weight = weight ? weight : 1;
if (target < weight)
break;
target -= weight;
@@ -2205,9 +2351,9 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
*/
preferred_gfp = gfp | __GFP_NOWARN;
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
- page = __alloc_pages_noprof(preferred_gfp, order, nid, nodemask);
+ page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
if (!page)
- page = __alloc_pages_noprof(gfp, order, nid, NULL);
+ page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
return page;
}
@@ -2222,7 +2368,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
*
* Return: The page on success or NULL if allocation fails.
*/
-struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
+static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
struct mempolicy *pol, pgoff_t ilx, int nid)
{
nodemask_t *nodemask;
@@ -2253,8 +2399,9 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
* First, try to allocate THP only on local node, but
* don't reclaim unnecessarily, just compact.
*/
- page = __alloc_pages_node_noprof(nid,
- gfp | __GFP_THISNODE | __GFP_NORETRY, order);
+ page = __alloc_frozen_pages_noprof(
+ gfp | __GFP_THISNODE | __GFP_NORETRY, order,
+ nid, NULL);
if (page || !(gfp & __GFP_DIRECT_RECLAIM))
return page;
/*
@@ -2266,7 +2413,7 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
}
}
- page = __alloc_pages_noprof(gfp, order, nid, nodemask);
+ page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
if (unlikely(pol->mode == MPOL_INTERLEAVE ||
pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
@@ -2285,8 +2432,13 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
struct mempolicy *pol, pgoff_t ilx, int nid)
{
- return page_rmappable_folio(alloc_pages_mpol_noprof(gfp | __GFP_COMP,
- order, pol, ilx, nid));
+ struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
+ ilx, nid);
+ if (!page)
+ return NULL;
+
+ set_page_refcounted(page);
+ return page_rmappable_folio(page);
}
/**
@@ -2300,7 +2452,7 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
* NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
* VMA to prevent it from going away. Should be used for all allocations
* for folios that will be mapped into user space, excepting hugetlbfs, and
- * excepting where direct use of alloc_pages_mpol() is more appropriate.
+ * excepting where direct use of folio_alloc_mpol() is more appropriate.
*
* Return: The folio on success or NULL if allocation fails.
*/
@@ -2321,6 +2473,21 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct
}
EXPORT_SYMBOL(vma_alloc_folio_noprof);
+struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
+{
+ struct mempolicy *pol = &default_policy;
+
+ /*
+ * No reference counting needed for current->mempolicy
+ * nor system default_policy
+ */
+ if (!in_interrupt() && !(gfp & __GFP_THISNODE))
+ pol = get_task_policy(current);
+
+ return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
+ numa_node_id());
+}
+
/**
* alloc_pages - Allocate pages.
* @gfp: GFP flags.
@@ -2337,17 +2504,11 @@ EXPORT_SYMBOL(vma_alloc_folio_noprof);
*/
struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
{
- struct mempolicy *pol = &default_policy;
+ struct page *page = alloc_frozen_pages_noprof(gfp, order);
- /*
- * No reference counting needed for current->mempolicy
- * nor system default_policy
- */
- if (!in_interrupt() && !(gfp & __GFP_THISNODE))
- pol = get_task_policy(current);
-
- return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX,
- numa_node_id());
+ if (page)
+ set_page_refcounted(page);
+ return page;
}
EXPORT_SYMBOL(alloc_pages_noprof);
@@ -2357,7 +2518,7 @@ struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
}
EXPORT_SYMBOL(folio_alloc_noprof);
-static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
+static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
@@ -2376,13 +2537,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
if (delta) {
nr_allocated = alloc_pages_bulk_noprof(gfp,
interleave_nodes(pol), NULL,
- nr_pages_per_node + 1, NULL,
+ nr_pages_per_node + 1,
page_array);
delta--;
} else {
nr_allocated = alloc_pages_bulk_noprof(gfp,
interleave_nodes(pol), NULL,
- nr_pages_per_node, NULL, page_array);
+ nr_pages_per_node, page_array);
}
page_array += nr_allocated;
@@ -2392,17 +2553,18 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
return total_allocated;
}
-static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
+static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
+ struct weighted_interleave_state *state;
struct task_struct *me = current;
unsigned int cpuset_mems_cookie;
unsigned long total_allocated = 0;
unsigned long nr_allocated = 0;
unsigned long rounds;
unsigned long node_pages, delta;
- u8 *table, *weights, weight;
+ u8 *weights, weight;
unsigned int weight_total = 0;
unsigned long rem_pages = nr_pages;
nodemask_t nodes;
@@ -2431,7 +2593,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
if (weight && node_isset(node, nodes)) {
node_pages = min(rem_pages, weight);
nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
- NULL, page_array);
+ page_array);
page_array += nr_allocated;
total_allocated += nr_allocated;
/* if that's all the pages, no need to interleave */
@@ -2452,17 +2614,19 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
return total_allocated;
rcu_read_lock();
- table = rcu_dereference(iw_table);
- if (table)
- memcpy(weights, table, nr_node_ids);
- rcu_read_unlock();
+ state = rcu_dereference(wi_state);
+ if (state) {
+ memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
+ rcu_read_unlock();
+ } else {
+ rcu_read_unlock();
+ for (i = 0; i < nr_node_ids; i++)
+ weights[i] = 1;
+ }
/* calculate total, detect system default usage */
- for_each_node_mask(node, nodes) {
- if (!weights[node])
- weights[node] = 1;
+ for_each_node_mask(node, nodes)
weight_total += weights[node];
- }
/*
* Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
@@ -2494,7 +2658,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
if (!node_pages)
break;
nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
- NULL, page_array);
+ page_array);
page_array += nr_allocated;
total_allocated += nr_allocated;
if (total_allocated == nr_pages)
@@ -2507,7 +2671,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
return total_allocated;
}
-static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
+static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
@@ -2518,11 +2682,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
- nr_pages, NULL, page_array);
+ nr_pages, page_array);
if (nr_allocated < nr_pages)
nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
- nr_pages - nr_allocated, NULL,
+ nr_pages - nr_allocated,
page_array + nr_allocated);
return nr_allocated;
}
@@ -2533,7 +2697,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
* It can accelerate memory allocation especially interleaving
* allocate memory.
*/
-unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
+unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
unsigned long nr_pages, struct page **page_array)
{
struct mempolicy *pol = &default_policy;
@@ -2544,21 +2708,21 @@ unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
pol = get_task_policy(current);
if (pol->mode == MPOL_INTERLEAVE)
- return alloc_pages_bulk_array_interleave(gfp, pol,
+ return alloc_pages_bulk_interleave(gfp, pol,
nr_pages, page_array);
if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
- return alloc_pages_bulk_array_weighted_interleave(
+ return alloc_pages_bulk_weighted_interleave(
gfp, pol, nr_pages, page_array);
if (pol->mode == MPOL_PREFERRED_MANY)
- return alloc_pages_bulk_array_preferred_many(gfp,
+ return alloc_pages_bulk_preferred_many(gfp,
numa_node_id(), pol, nr_pages, page_array);
nid = numa_node_id();
nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
return alloc_pages_bulk_noprof(gfp, nid, nodemask,
- nr_pages, NULL, page_array);
+ nr_pages, page_array);
}
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
@@ -3373,6 +3537,14 @@ struct iw_node_attr {
int nid;
};
+struct sysfs_wi_group {
+ struct kobject wi_kobj;
+ struct mutex kobj_lock;
+ struct iw_node_attr *nattrs[];
+};
+
+static struct sysfs_wi_group *wi_group;
+
static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -3387,177 +3559,310 @@ static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
struct iw_node_attr *node_attr;
- u8 *new;
- u8 *old;
u8 weight = 0;
+ int i;
node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
- if (count == 0 || sysfs_streq(buf, ""))
- weight = 0;
- else if (kstrtou8(buf, 0, &weight))
+ if (count == 0 || sysfs_streq(buf, "") ||
+ kstrtou8(buf, 0, &weight) || weight == 0)
return -EINVAL;
- new = kzalloc(nr_node_ids, GFP_KERNEL);
- if (!new)
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state)
return -ENOMEM;
- mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
- lockdep_is_held(&iw_table_lock));
- if (old)
- memcpy(new, old, nr_node_ids);
- new[node_attr->nid] = weight;
- rcu_assign_pointer(iw_table, new);
- mutex_unlock(&iw_table_lock);
- synchronize_rcu();
- kfree(old);
+ mutex_lock(&wi_state_lock);
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (old_wi_state) {
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
+ } else {
+ for (i = 0; i < nr_node_ids; i++)
+ new_wi_state->iw_table[i] = 1;
+ }
+ new_wi_state->iw_table[node_attr->nid] = weight;
+ new_wi_state->mode_auto = false;
+
+ rcu_assign_pointer(wi_state, new_wi_state);
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
return count;
}
-static struct iw_node_attr **node_attrs;
-
-static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
- struct kobject *parent)
+static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
- if (!node_attr)
- return;
- sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
- kfree(node_attr->kobj_attr.attr.name);
- kfree(node_attr);
+ struct weighted_interleave_state *state;
+ bool wi_auto = true;
+
+ rcu_read_lock();
+ state = rcu_dereference(wi_state);
+ if (state)
+ wi_auto = state->mode_auto;
+ rcu_read_unlock();
+
+ return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
}
-static void sysfs_wi_release(struct kobject *wi_kobj)
+static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ unsigned int *bw;
+ bool input;
int i;
+ if (kstrtobool(buf, &input))
+ return -EINVAL;
+
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state)
+ return -ENOMEM;
for (i = 0; i < nr_node_ids; i++)
- sysfs_wi_node_release(node_attrs[i], wi_kobj);
- kobject_put(wi_kobj);
+ new_wi_state->iw_table[i] = 1;
+
+ mutex_lock(&wi_state_lock);
+ if (!input) {
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (!old_wi_state)
+ goto update_wi_state;
+ if (input == old_wi_state->mode_auto) {
+ mutex_unlock(&wi_state_lock);
+ return count;
+ }
+
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
+ goto update_wi_state;
+ }
+
+ bw = node_bw_table;
+ if (!bw) {
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ return -ENODEV;
+ }
+
+ new_wi_state->mode_auto = true;
+ reduce_interleave_weights(bw, new_wi_state->iw_table);
+
+update_wi_state:
+ rcu_assign_pointer(wi_state, new_wi_state);
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+ return count;
+}
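
From userspace the new attribute appears as /sys/kernel/mm/mempolicy/weighted_interleave/auto next to the per-node weight files. A small illustrative C program (a shell echo works just as well) that switches back to automatic weights:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/mm/mempolicy/weighted_interleave/auto", "w");

	if (!f)
		return 1;
	/* writing "false" here, or any nodeN weight file, switches to manual mode */
	fputs("true\n", f);
	return fclose(f) ? 1 : 0;
}
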
+
+static void sysfs_wi_node_delete(int nid)
+{
+ struct iw_node_attr *attr;
+
+ if (nid < 0 || nid >= nr_node_ids)
+ return;
+
+ mutex_lock(&wi_group->kobj_lock);
+ attr = wi_group->nattrs[nid];
+ if (!attr) {
+ mutex_unlock(&wi_group->kobj_lock);
+ return;
+ }
+
+ wi_group->nattrs[nid] = NULL;
+ mutex_unlock(&wi_group->kobj_lock);
+
+ sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
+ kfree(attr->kobj_attr.attr.name);
+ kfree(attr);
+}
+
+static void sysfs_wi_node_delete_all(void)
+{
+ int nid;
+
+ for (nid = 0; nid < nr_node_ids; nid++)
+ sysfs_wi_node_delete(nid);
+}
+
+static void wi_state_free(void)
+{
+ struct weighted_interleave_state *old_wi_state;
+
+ mutex_lock(&wi_state_lock);
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ rcu_assign_pointer(wi_state, NULL);
+ mutex_unlock(&wi_state_lock);
+
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+}
+
+static struct kobj_attribute wi_auto_attr =
+ __ATTR(auto, 0664, weighted_interleave_auto_show,
+ weighted_interleave_auto_store);
+
+static void wi_cleanup(void)
+{
+ sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
+ sysfs_wi_node_delete_all();
+ wi_state_free();
+}
+
+static void wi_kobj_release(struct kobject *wi_kobj)
+{
+ kfree(wi_group);
}
static const struct kobj_type wi_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
- .release = sysfs_wi_release,
+ .release = wi_kobj_release,
};
-static int add_weight_node(int nid, struct kobject *wi_kobj)
+static int sysfs_wi_node_add(int nid)
{
- struct iw_node_attr *node_attr;
+ int ret;
char *name;
+ struct iw_node_attr *new_attr;
- node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
- if (!node_attr)
+ if (nid < 0 || nid >= nr_node_ids) {
+ pr_err("invalid node id: %d\n", nid);
+ return -EINVAL;
+ }
+
+ new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
+ if (!new_attr)
return -ENOMEM;
name = kasprintf(GFP_KERNEL, "node%d", nid);
if (!name) {
- kfree(node_attr);
+ kfree(new_attr);
return -ENOMEM;
}
- sysfs_attr_init(&node_attr->kobj_attr.attr);
- node_attr->kobj_attr.attr.name = name;
- node_attr->kobj_attr.attr.mode = 0644;
- node_attr->kobj_attr.show = node_show;
- node_attr->kobj_attr.store = node_store;
- node_attr->nid = nid;
+ sysfs_attr_init(&new_attr->kobj_attr.attr);
+ new_attr->kobj_attr.attr.name = name;
+ new_attr->kobj_attr.attr.mode = 0644;
+ new_attr->kobj_attr.show = node_show;
+ new_attr->kobj_attr.store = node_store;
+ new_attr->nid = nid;
- if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
- kfree(node_attr->kobj_attr.attr.name);
- kfree(node_attr);
- pr_err("failed to add attribute to weighted_interleave\n");
- return -ENOMEM;
+ mutex_lock(&wi_group->kobj_lock);
+ if (wi_group->nattrs[nid]) {
+ mutex_unlock(&wi_group->kobj_lock);
+ ret = -EEXIST;
+ goto out;
}
- node_attrs[nid] = node_attr;
+ ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
+ if (ret) {
+ mutex_unlock(&wi_group->kobj_lock);
+ goto out;
+ }
+ wi_group->nattrs[nid] = new_attr;
+ mutex_unlock(&wi_group->kobj_lock);
return 0;
+
+out:
+ kfree(new_attr->kobj_attr.attr.name);
+ kfree(new_attr);
+ return ret;
}
-static int add_weighted_interleave_group(struct kobject *root_kobj)
+static int wi_node_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ int err;
+ struct node_notify *nn = data;
+ int nid = nn->nid;
+
+ switch (action) {
+ case NODE_ADDED_FIRST_MEMORY:
+ err = sysfs_wi_node_add(nid);
+ if (err)
+ pr_err("failed to add sysfs for node%d during hotplug: %d\n",
+ nid, err);
+ break;
+ case NODE_REMOVED_LAST_MEMORY:
+ sysfs_wi_node_delete(nid);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
{
- struct kobject *wi_kobj;
int nid, err;
- wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (!wi_kobj)
+ wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
+ GFP_KERNEL);
+ if (!wi_group)
return -ENOMEM;
+ mutex_init(&wi_group->kobj_lock);
- err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
+ err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
"weighted_interleave");
- if (err) {
- kfree(wi_kobj);
- return err;
- }
+ if (err)
+ goto err_put_kobj;
- for_each_node_state(nid, N_POSSIBLE) {
- err = add_weight_node(nid, wi_kobj);
+ err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
+ if (err)
+ goto err_put_kobj;
+
+ for_each_online_node(nid) {
+ if (!node_state(nid, N_MEMORY))
+ continue;
+
+ err = sysfs_wi_node_add(nid);
if (err) {
- pr_err("failed to add sysfs [node%d]\n", nid);
- break;
+ pr_err("failed to add sysfs for node%d during init: %d\n",
+ nid, err);
+ goto err_cleanup_kobj;
}
}
- if (err)
- kobject_put(wi_kobj);
- return 0;
-}
-static void mempolicy_kobj_release(struct kobject *kobj)
-{
- u8 *old;
+ hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
+ return 0;
- mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
- lockdep_is_held(&iw_table_lock));
- rcu_assign_pointer(iw_table, NULL);
- mutex_unlock(&iw_table_lock);
- synchronize_rcu();
- kfree(old);
- kfree(node_attrs);
- kfree(kobj);
+err_cleanup_kobj:
+ wi_cleanup();
+ kobject_del(&wi_group->wi_kobj);
+err_put_kobj:
+ kobject_put(&wi_group->wi_kobj);
+ return err;
}
-static const struct kobj_type mempolicy_ktype = {
- .release = mempolicy_kobj_release
-};
-
static int __init mempolicy_sysfs_init(void)
{
int err;
static struct kobject *mempolicy_kobj;
- mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
- if (!mempolicy_kobj) {
- err = -ENOMEM;
- goto err_out;
- }
-
- node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
- GFP_KERNEL);
- if (!node_attrs) {
- err = -ENOMEM;
- goto mempol_out;
- }
+ mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
+ if (!mempolicy_kobj)
+ return -ENOMEM;
- err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
- "mempolicy");
+ err = add_weighted_interleave_group(mempolicy_kobj);
if (err)
- goto node_out;
+ goto err_kobj;
- err = add_weighted_interleave_group(mempolicy_kobj);
- if (err) {
- pr_err("mempolicy sysfs structure failed to initialize\n");
- kobject_put(mempolicy_kobj);
- return err;
- }
+ return 0;
- return err;
-node_out:
- kfree(node_attrs);
-mempol_out:
- kfree(mempolicy_kobj);
-err_out:
- pr_err("failed to add mempolicy kobject to the system\n");
+err_kobj:
+ kobject_del(mempolicy_kobj);
+ kobject_put(mempolicy_kobj);
return err;
}
diff --git a/mm/mempool.c b/mm/mempool.c
index 3223337135d0..d7bbf1189db9 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -68,10 +68,20 @@ static void check_element(mempool_t *pool, void *element)
} else if (pool->free == mempool_free_pages) {
/* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
- void *addr = kmap_local_page((struct page *)element);
- __check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
- kunmap_local(addr);
+#ifdef CONFIG_HIGHMEM
+ for (int i = 0; i < (1 << order); i++) {
+ struct page *page = (struct page *)element;
+ void *addr = kmap_local_page(page + i);
+
+ __check_element(pool, addr, PAGE_SIZE);
+ kunmap_local(addr);
+ }
+#else
+ void *addr = page_address((struct page *)element);
+
+ __check_element(pool, addr, PAGE_SIZE << order);
+#endif
}
}
@@ -97,10 +107,20 @@ static void poison_element(mempool_t *pool, void *element)
} else if (pool->alloc == mempool_alloc_pages) {
/* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
- void *addr = kmap_local_page((struct page *)element);
- __poison_element(addr, 1UL << (PAGE_SHIFT + order));
- kunmap_local(addr);
+#ifdef CONFIG_HIGHMEM
+ for (int i = 0; i < (1 << order); i++) {
+ struct page *page = (struct page *)element;
+ void *addr = kmap_local_page(page + i);
+
+ __poison_element(addr, PAGE_SIZE);
+ kunmap_local(addr);
+ }
+#else
+ void *addr = page_address((struct page *)element);
+
+ __poison_element(addr, PAGE_SIZE << order);
+#endif
}
}
#else /* CONFIG_SLUB_DEBUG_ON */
@@ -136,7 +156,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
static __always_inline void add_element(mempool_t *pool, void *element)
{
- BUG_ON(pool->curr_nr >= pool->min_nr);
+ BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr);
poison_element(pool, element);
if (kasan_poison_element(pool, element))
pool->elements[pool->curr_nr++] = element;
@@ -202,16 +222,20 @@ int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
pool->alloc = alloc_fn;
pool->free = free_fn;
init_waitqueue_head(&pool->wait);
-
- pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
+ /*
+ * max() is used here to ensure storage for at least one element, to
+ * support a zero-minimum pool.
+ */
+ pool->elements = kmalloc_array_node(max(1, min_nr), sizeof(void *),
gfp_mask, node_id);
if (!pool->elements)
return -ENOMEM;
/*
- * First pre-allocate the guaranteed number of buffers.
+ * First pre-allocate the guaranteed number of buffers;
+ * also pre-allocate one element for a zero-minimum pool.
*/
- while (pool->curr_nr < pool->min_nr) {
+ while (pool->curr_nr < max(1, pool->min_nr)) {
void *element;
element = pool->alloc(gfp_mask, pool->pool_data);
@@ -540,11 +564,35 @@ void mempool_free(void *element, mempool_t *pool)
if (likely(pool->curr_nr < pool->min_nr)) {
add_element(pool, element);
spin_unlock_irqrestore(&pool->lock, flags);
- wake_up(&pool->wait);
+ if (wq_has_sleeper(&pool->wait))
+ wake_up(&pool->wait);
+ return;
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+
+ /*
+ * Handle the min_nr = 0 edge case:
+ *
+ * For zero-minimum pools, curr_nr < min_nr (0 < 0) never succeeds,
+ * so waiters sleeping on pool->wait would never be woken by the
+ * wake-up path of the previous test. This explicit check ensures the
+ * element is added back when both min_nr and curr_nr are 0, and
+ * any active waiters are properly awakened.
+ */
+ if (unlikely(pool->min_nr == 0 &&
+ READ_ONCE(pool->curr_nr) == 0)) {
+ spin_lock_irqsave(&pool->lock, flags);
+ if (likely(pool->curr_nr == 0)) {
+ add_element(pool, element);
+ spin_unlock_irqrestore(&pool->lock, flags);
+ if (wq_has_sleeper(&pool->wait))
+ wake_up(&pool->wait);
return;
}
spin_unlock_irqrestore(&pool->lock, flags);
}
+
pool->free(element, pool->pool_data);
}
EXPORT_SYMBOL(mempool_free);
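
A hedged sketch of what the zero-minimum changes above enable: a pool created with min_nr == 0 reserves no guaranteed elements, yet mempool_init_node() still pre-allocates one spare and the new branch in mempool_free() refills an empty pool and wakes waiters. The kmem_cache below is assumed to be created elsewhere:

#include <linux/mempool.h>
#include <linux/slab.h>

static struct kmem_cache *demo_cache;	/* assumed to be set up elsewhere */
static mempool_t *demo_pool;

static int demo_pool_init(void)
{
	/* min_nr == 0: no guaranteed reserve, just best-effort recycling */
	demo_pool = mempool_create(0, mempool_alloc_slab, mempool_free_slab,
				   demo_cache);
	return demo_pool ? 0 : -ENOMEM;
}
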
diff --git a/mm/memremap.c b/mm/memremap.c
index 40d4547ce514..b0ce0d8254bd 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -5,7 +5,6 @@
#include <linux/kasan.h>
#include <linux/memory_hotplug.h>
#include <linux/memremap.h>
-#include <linux/pfn_t.h>
#include <linux/swap.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
@@ -39,30 +38,6 @@ unsigned long memremap_compat_align(void)
EXPORT_SYMBOL_GPL(memremap_compat_align);
#endif
-#ifdef CONFIG_FS_DAX
-DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
-EXPORT_SYMBOL(devmap_managed_key);
-
-static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
-{
- if (pgmap->type == MEMORY_DEVICE_FS_DAX)
- static_branch_dec(&devmap_managed_key);
-}
-
-static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
-{
- if (pgmap->type == MEMORY_DEVICE_FS_DAX)
- static_branch_inc(&devmap_managed_key);
-}
-#else
-static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
-{
-}
-static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
-{
-}
-#endif /* CONFIG_FS_DAX */
-
static void pgmap_array_delete(struct range *range)
{
xa_store_range(&pgmap_array, PHYS_PFN(range->start), PHYS_PFN(range->end),
@@ -130,7 +105,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
}
mem_hotplug_done();
- untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true);
+ pfnmap_untrack(PHYS_PFN(range->start), range_len(range));
pgmap_array_delete(range);
}
@@ -151,7 +126,6 @@ void memunmap_pages(struct dev_pagemap *pgmap)
percpu_ref_exit(&pgmap->ref);
WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
- devmap_managed_enable_put(pgmap);
}
EXPORT_SYMBOL_GPL(memunmap_pages);
@@ -211,8 +185,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
if (nid < 0)
nid = numa_mem_id();
- error = track_pfn_remap(NULL, &params->pgprot, PHYS_PFN(range->start), 0,
- range_len(range));
+ error = pfnmap_track(PHYS_PFN(range->start), range_len(range),
+ &params->pgprot);
if (error)
goto err_pfn_remap;
@@ -254,7 +228,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
move_pfn_range_to_zone(zone, PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), params->altmap,
- MIGRATE_MOVABLE);
+ MIGRATE_MOVABLE, false);
}
mem_hotplug_done();
@@ -277,7 +251,7 @@ err_add_memory:
if (!is_private)
kasan_remove_zero_shadow(__va(range->start), range_len(range));
err_kasan:
- untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true);
+ pfnmap_untrack(PHYS_PFN(range->start), range_len(range));
err_pfn_remap:
pgmap_array_delete(range);
return error;
@@ -332,10 +306,6 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
}
break;
case MEMORY_DEVICE_FS_DAX:
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
- WARN(1, "File system DAX not supported\n");
- return ERR_PTR(-EINVAL);
- }
params.pgprot = pgprot_decrypted(params.pgprot);
break;
case MEMORY_DEVICE_GENERIC:
@@ -354,8 +324,6 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
if (error)
return ERR_PTR(error);
- devmap_managed_enable_get(pgmap);
-
/*
* Clear the pgmap nr_range as it will be incremented for each
* successfully processed range. This communicates how many
@@ -458,8 +426,9 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
void free_zone_device_folio(struct folio *folio)
{
- if (WARN_ON_ONCE(!folio->page.pgmap->ops ||
- !folio->page.pgmap->ops->page_free))
+ struct dev_pagemap *pgmap = folio->pgmap;
+
+ if (WARN_ON_ONCE(!pgmap))
return;
mem_cgroup_uncharge(folio);
@@ -484,19 +453,42 @@ void free_zone_device_folio(struct folio *folio)
* For other types of ZONE_DEVICE pages, migration is either
* handled differently or not done at all, so there is no need
* to clear folio->mapping.
+ *
+ * FS DAX pages clear the mapping when the folio->share count hits
+ * zero, which indicates the page has been removed from the file
+ * system mapping.
*/
- folio->mapping = NULL;
- folio->page.pgmap->ops->page_free(folio_page(folio, 0));
+ if (pgmap->type != MEMORY_DEVICE_FS_DAX &&
+ pgmap->type != MEMORY_DEVICE_GENERIC)
+ folio->mapping = NULL;
- if (folio->page.pgmap->type != MEMORY_DEVICE_PRIVATE &&
- folio->page.pgmap->type != MEMORY_DEVICE_COHERENT)
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_COHERENT:
+ if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free))
+ break;
+ pgmap->ops->page_free(folio_page(folio, 0));
+ put_dev_pagemap(pgmap);
+ break;
+
+ case MEMORY_DEVICE_GENERIC:
/*
* Reset the refcount to 1 to prepare for handing out the page
* again.
*/
folio_set_count(folio, 1);
- else
- put_dev_pagemap(folio->page.pgmap);
+ break;
+
+ case MEMORY_DEVICE_FS_DAX:
+ wake_up_var(&folio->page);
+ break;
+
+ case MEMORY_DEVICE_PCI_P2PDMA:
+ if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free))
+ break;
+ pgmap->ops->page_free(folio_page(folio, 0));
+ break;
+ }
}
void zone_device_page_init(struct page *page)
@@ -505,26 +497,8 @@ void zone_device_page_init(struct page *page)
* Drivers shouldn't be allocating pages after calling
* memunmap_pages().
*/
- WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref));
+ WARN_ON_ONCE(!percpu_ref_tryget_live(&page_pgmap(page)->ref));
set_page_count(page, 1);
lock_page(page);
}
EXPORT_SYMBOL_GPL(zone_device_page_init);
-
-#ifdef CONFIG_FS_DAX
-bool __put_devmap_managed_folio_refs(struct folio *folio, int refs)
-{
- if (folio->page.pgmap->type != MEMORY_DEVICE_FS_DAX)
- return false;
-
- /*
- * fsdax page refcounts are 1-based, rather than 0-based: if
- * refcount is 1, then the page is free and the refcount is
- * stable because nobody holds a reference on the page.
- */
- if (folio_ref_sub_return(folio, refs) == 1)
- wake_up_var(&folio->_refcount);
- return true;
-}
-EXPORT_SYMBOL(__put_devmap_managed_folio_refs);
-#endif /* CONFIG_FS_DAX */
diff --git a/mm/migrate.c b/mm/migrate.c
index cc68583c86f9..db92d6d8e510 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,7 +35,6 @@
#include <linux/compat.h>
#include <linux/hugetlb.h>
#include <linux/gfp.h>
-#include <linux/pfn_t.h>
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
@@ -50,9 +49,73 @@
#include <trace/events/migrate.h>
#include "internal.h"
+#include "swap.h"
-bool isolate_movable_page(struct page *page, isolate_mode_t mode)
+static const struct movable_operations *offline_movable_ops;
+static const struct movable_operations *zsmalloc_movable_ops;
+
+int set_movable_ops(const struct movable_operations *ops, enum pagetype type)
+{
+ /*
+ * We only allow for selected types and don't handle concurrent
+ * registration attempts yet.
+ */
+ switch (type) {
+ case PGTY_offline:
+ if (offline_movable_ops && ops)
+ return -EBUSY;
+ offline_movable_ops = ops;
+ break;
+ case PGTY_zsmalloc:
+ if (zsmalloc_movable_ops && ops)
+ return -EBUSY;
+ zsmalloc_movable_ops = ops;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(set_movable_ops);
+
+static const struct movable_operations *page_movable_ops(struct page *page)
+{
+ VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
+
+ /*
+ * If we enable page migration for a page of a certain type by marking
+ * it as movable, the page type must be sticky until the page gets freed
+ * back to the buddy.
+ */
+ if (PageOffline(page))
+ /* Only balloon compaction sets PageOffline pages movable. */
+ return offline_movable_ops;
+ if (PageZsmalloc(page))
+ return zsmalloc_movable_ops;
+
+ return NULL;
+}
+
+/**
+ * isolate_movable_ops_page - isolate a movable_ops page for migration
+ * @page: The page.
+ * @mode: The isolation mode.
+ *
+ * Try to isolate a movable_ops page for migration. Will fail if the page is
+ * not a movable_ops page, if the page is already isolated for migration
+ * or if the page was just released by its owner.
+ *
+ * Once isolated, the page cannot get freed until it is either putback
+ * or migrated.
+ *
+ * Returns true if isolation succeeded, otherwise false.
+ */
+bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode)
{
+ /*
+ * TODO: these pages will not be folios in the future. All
+ * folio dependencies will have to be removed.
+ */
struct folio *folio = folio_get_nontail_page(page);
const struct movable_operations *mops;
@@ -68,20 +131,15 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode)
if (!folio)
goto out;
- if (unlikely(folio_test_slab(folio)))
- goto out_putfolio;
- /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */
- smp_rmb();
/*
- * Check movable flag before taking the page lock because
+ * Check for movable_ops pages before taking the page lock because
* we use non-atomic bitops on newly allocated page flags so
* unconditionally grabbing the lock ruins page's owner side.
+ *
+ * Note that once a page has movable_ops, it will stay that way
+ * until the page is freed.
*/
- if (unlikely(!__folio_test_movable(folio)))
- goto out_putfolio;
- /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */
- smp_rmb();
- if (unlikely(folio_test_slab(folio)))
+ if (unlikely(!page_has_movable_ops(page)))
goto out_putfolio;
/*
@@ -98,18 +156,20 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode)
if (unlikely(!folio_trylock(folio)))
goto out_putfolio;
- if (!folio_test_movable(folio) || folio_test_isolated(folio))
+ VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
+ if (PageMovableOpsIsolated(page))
goto out_no_isolated;
- mops = folio_movable_ops(folio);
- VM_BUG_ON_FOLIO(!mops, folio);
+ mops = page_movable_ops(page);
+ if (WARN_ON_ONCE(!mops))
+ goto out_no_isolated;
- if (!mops->isolate_page(&folio->page, mode))
+ if (!mops->isolate_page(page, mode))
goto out_no_isolated;
/* Driver shouldn't use the isolated flag */
- WARN_ON_ONCE(folio_test_isolated(folio));
- folio_set_isolated(folio);
+ VM_WARN_ON_ONCE_PAGE(PageMovableOpsIsolated(page), page);
+ SetPageMovableOpsIsolated(page);
folio_unlock(folio);
return true;
@@ -122,12 +182,68 @@ out:
return false;
}
-static void putback_movable_folio(struct folio *folio)
+/**
+ * putback_movable_ops_page - putback an isolated movable_ops page
+ * @page: The isolated page.
+ *
+ * Putback an isolated movable_ops page.
+ *
+ * After the page was putback, it might get freed instantly.
+ */
+static void putback_movable_ops_page(struct page *page)
{
- const struct movable_operations *mops = folio_movable_ops(folio);
+ /*
+ * TODO: these pages will not be folios in the future. All
+ * folio dependencies will have to be removed.
+ */
+ struct folio *folio = page_folio(page);
- mops->putback_page(&folio->page);
- folio_clear_isolated(folio);
+ VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
+ VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(page), page);
+ folio_lock(folio);
+ page_movable_ops(page)->putback_page(page);
+ ClearPageMovableOpsIsolated(page);
+ folio_unlock(folio);
+ folio_put(folio);
+}
+
+/**
+ * migrate_movable_ops_page - migrate an isolated movable_ops page
+ * @dst: The destination page.
+ * @src: The source page.
+ * @mode: The migration mode.
+ *
+ * Migrate an isolated movable_ops page.
+ *
+ * If the src page was already released by its owner, the src page is
+ * un-isolated (putback) and migration succeeds; the migration core will be the
+ * owner of both pages.
+ *
+ * If the src page was not released by its owner and the migration was
+ * successful, the owner of the src page and the dst page are swapped and
+ * the src page is un-isolated.
+ *
+ * If migration fails, the ownership stays unmodified and the src page
+ * remains isolated: migration may be retried later or the page can be putback.
+ *
+ * TODO: migration core will treat both pages as folios and lock them before
+ * this call to unlock them after this call. Further, the folio refcounts on
+ * src and dst are also released by migration core. These pages will not be
+ * folios in the future, so that must be reworked.
+ *
+ * Returns 0 on success, otherwise a negative error code.
+ */
+static int migrate_movable_ops_page(struct page *dst, struct page *src,
+ enum migrate_mode mode)
+{
+ int rc;
+
+ VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src);
+ VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src);
+ rc = page_movable_ops(src)->migrate_page(dst, src, mode);
+ if (!rc)
+ ClearPageMovableOpsIsolated(src);
+ return rc;
}
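
To show the other half of this interface, here is a hedged sketch of a driver registering movable_operations for its PageOffline pages with the new set_movable_ops() hook; every callback body is a placeholder, and only the struct fields, the hook, and PGTY_offline are taken from the code above:

#include <linux/migrate.h>
#include <linux/page-flags.h>

static bool demo_isolate(struct page *page, isolate_mode_t mode)
{
	/* pin/flag driver-private state so the page cannot go away */
	return true;
}

static int demo_migrate(struct page *dst, struct page *src,
			enum migrate_mode mode)
{
	/* transfer driver-private state from src to dst */
	return 0;
}

static void demo_putback(struct page *page)
{
	/* undo whatever demo_isolate() did */
}

static const struct movable_operations demo_mops = {
	.isolate_page	= demo_isolate,
	.migrate_page	= demo_migrate,
	.putback_page	= demo_putback,
};

static int demo_register(void)
{
	/* only one registration per page type; -EBUSY if already taken */
	return set_movable_ops(&demo_mops, PGTY_offline);
}
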
/*
@@ -136,7 +252,7 @@ static void putback_movable_folio(struct folio *folio)
*
* This function shall be used whenever the isolated pageset has been
* built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
- * and isolate_hugetlb().
+ * and folio_isolate_hugetlb().
*/
void putback_movable_pages(struct list_head *l)
{
@@ -145,24 +261,12 @@ void putback_movable_pages(struct list_head *l)
list_for_each_entry_safe(folio, folio2, l, lru) {
if (unlikely(folio_test_hugetlb(folio))) {
- folio_putback_active_hugetlb(folio);
+ folio_putback_hugetlb(folio);
continue;
}
list_del(&folio->lru);
- /*
- * We isolated non-lru movable folio so here we can use
- * __folio_test_movable because LRU folio's mapping cannot
- * have PAGE_MAPPING_MOVABLE.
- */
- if (unlikely(__folio_test_movable(folio))) {
- VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio);
- folio_lock(folio);
- if (folio_test_movable(folio))
- putback_movable_folio(folio);
- else
- folio_clear_isolated(folio);
- folio_unlock(folio);
- folio_put(folio);
+ if (unlikely(page_has_movable_ops(&folio->page))) {
+ putback_movable_ops_page(&folio->page);
} else {
node_stat_mod_folio(folio, NR_ISOLATED_ANON +
folio_is_file_lru(folio), -folio_nr_pages(folio));
@@ -174,43 +278,35 @@ void putback_movable_pages(struct list_head *l)
/* Must be called with an elevated refcount on the non-hugetlb folio */
bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
{
- bool isolated, lru;
-
if (folio_test_hugetlb(folio))
- return isolate_hugetlb(folio, list);
-
- lru = !__folio_test_movable(folio);
- if (lru)
- isolated = folio_isolate_lru(folio);
- else
- isolated = isolate_movable_page(&folio->page,
- ISOLATE_UNEVICTABLE);
-
- if (!isolated)
- return false;
+ return folio_isolate_hugetlb(folio, list);
- list_add(&folio->lru, list);
- if (lru)
+ if (page_has_movable_ops(&folio->page)) {
+ if (!isolate_movable_ops_page(&folio->page,
+ ISOLATE_UNEVICTABLE))
+ return false;
+ } else {
+ if (!folio_isolate_lru(folio))
+ return false;
node_stat_add_folio(folio, NR_ISOLATED_ANON +
folio_is_file_lru(folio));
-
+ }
+ list_add(&folio->lru, list);
return true;
}
static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
- struct folio *folio,
- unsigned long idx)
+ struct folio *folio, pte_t old_pte, unsigned long idx)
{
struct page *page = folio_page(folio, idx);
- bool contains_data;
pte_t newpte;
- void *addr;
- if (PageCompound(page))
+ if (PageCompound(page) || PageHWPoison(page))
return false;
+
VM_BUG_ON_PAGE(!PageAnon(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
+ VM_BUG_ON_PAGE(pte_present(old_pte), page);
if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
mm_forbids_zeropage(pvmw->vma->vm_mm))
@@ -221,15 +317,17 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
* this subpage has been non present. If the subpage is only zero-filled
* then map it to the shared zeropage.
*/
- addr = kmap_local_page(page);
- contains_data = memchr_inv(addr, 0, PAGE_SIZE);
- kunmap_local(addr);
-
- if (contains_data)
+ if (!pages_identical(page, ZERO_PAGE(0)))
return false;
newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
pvmw->vma->vm_page_prot));
+
+ if (pte_swp_soft_dirty(old_pte))
+ newpte = pte_mksoft_dirty(newpte);
+ if (pte_swp_uffd_wp(old_pte))
+ newpte = pte_mkuffd_wp(newpte);
+
set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
@@ -272,13 +370,13 @@ static bool remove_migration_pte(struct folio *folio,
continue;
}
#endif
+ old_pte = ptep_get(pvmw.pte);
if (rmap_walk_arg->map_unused_to_zeropage &&
- try_to_map_unused_to_zeropage(&pvmw, folio, idx))
+ try_to_map_unused_to_zeropage(&pvmw, folio, old_pte, idx))
continue;
folio_get(folio);
pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
- old_pte = ptep_get(pvmw.pte);
entry = pte_to_swp_entry(old_pte);
if (!is_migration_entry_young(entry))
@@ -336,7 +434,7 @@ static bool remove_migration_pte(struct folio *folio,
folio_add_file_rmap_pte(folio, new, vma);
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
}
- if (vma->vm_flags & VM_LOCKED)
+ if (READ_ONCE(vma->vm_flags) & VM_LOCKED)
mlock_drain_local();
trace_remove_migration_pte(pvmw.address, pte_val(pte),
@@ -453,20 +551,6 @@ unlock:
}
#endif
-static int folio_expected_refs(struct address_space *mapping,
- struct folio *folio)
-{
- int refs = 1;
- if (!mapping)
- return refs;
-
- refs += folio_nr_pages(folio);
- if (folio_test_private(folio))
- refs++;
-
- return refs;
-}
-
/*
* Replace the folio in the mapping.
*
@@ -502,7 +586,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
if (folio_test_swapbacked(folio))
__folio_set_swapbacked(newfolio);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
oldzone = folio_zone(folio);
@@ -526,15 +610,13 @@ static int __folio_migrate_mapping(struct address_space *mapping,
if (folio_test_anon(folio) && folio_test_large(folio))
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
folio_ref_add(newfolio, nr); /* add cache reference */
- if (folio_test_swapbacked(folio)) {
+ if (folio_test_swapbacked(folio))
__folio_set_swapbacked(newfolio);
- if (folio_test_swapcache(folio)) {
- folio_set_swapcache(newfolio);
- newfolio->private = folio_get_private(folio);
- }
+ if (folio_test_swapcache(folio)) {
+ folio_set_swapcache(newfolio);
+ newfolio->private = folio_get_private(folio);
entries = nr;
} else {
- VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
entries = 1;
}
@@ -605,13 +687,13 @@ static int __folio_migrate_mapping(struct address_space *mapping,
}
local_irq_enable();
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
int folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int extra_count)
{
- int expected_count = folio_expected_refs(mapping, folio) + extra_count;
+ int expected_count = folio_expected_ref_count(folio) + extra_count + 1;
if (folio_ref_count(folio) != expected_count)
return -EAGAIN;
@@ -628,7 +710,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
struct folio *dst, struct folio *src)
{
XA_STATE(xas, &mapping->i_pages, folio_index(src));
- int rc, expected_count = folio_expected_refs(mapping, src);
+ int rc, expected_count = folio_expected_ref_count(src) + 1;
if (folio_ref_count(src) != expected_count)
return -EAGAIN;
@@ -654,7 +736,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
xas_unlock_irq(&xas);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
/*
@@ -759,7 +841,7 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst,
struct folio *src, void *src_private,
enum migrate_mode mode)
{
- int rc, expected_count = folio_expected_refs(mapping, src);
+ int rc, expected_count = folio_expected_ref_count(src) + 1;
/* Check whether src does not have extra refs before we do more work */
if (folio_ref_count(src) != expected_count)
@@ -770,14 +852,14 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst,
return rc;
rc = __folio_migrate_mapping(mapping, dst, src, expected_count);
- if (rc != MIGRATEPAGE_SUCCESS)
+ if (rc)
return rc;
if (src_private)
folio_attach_private(dst, folio_detach_private(src));
folio_migrate_flags(dst, src);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
/**
@@ -847,7 +929,7 @@ static int __buffer_migrate_folio(struct address_space *mapping,
return migrate_folio(mapping, dst, src, mode);
/* Check whether page does not have extra refs before we do more work */
- expected_count = folio_expected_refs(mapping, src);
+ expected_count = folio_expected_ref_count(src) + 1;
if (folio_ref_count(src) != expected_count)
return -EAGAIN;
@@ -855,9 +937,11 @@ static int __buffer_migrate_folio(struct address_space *mapping,
return -EAGAIN;
if (check_refs) {
- bool busy;
+ bool busy, migrating;
bool invalidated = false;
+ migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state);
+ VM_WARN_ON_ONCE(migrating);
recheck_buffers:
busy = false;
spin_lock(&mapping->i_private_lock);
@@ -869,12 +953,12 @@ recheck_buffers:
}
bh = bh->b_this_page;
} while (bh != head);
+ spin_unlock(&mapping->i_private_lock);
if (busy) {
if (invalidated) {
rc = -EAGAIN;
goto unlock_buffers;
}
- spin_unlock(&mapping->i_private_lock);
invalidate_bh_lrus();
invalidated = true;
goto recheck_buffers;
@@ -882,7 +966,7 @@ recheck_buffers:
}
rc = filemap_migrate_folio(mapping, dst, src, mode);
- if (rc != MIGRATEPAGE_SUCCESS)
+ if (rc)
goto unlock_buffers;
bh = head;
@@ -893,7 +977,7 @@ recheck_buffers:
unlock_buffers:
if (check_refs)
- spin_unlock(&mapping->i_private_lock);
+ clear_bit_unlock(BH_Migrate, &head->b_state);
bh = head;
do {
unlock_buffer(bh);
@@ -955,66 +1039,20 @@ int filemap_migrate_folio(struct address_space *mapping,
EXPORT_SYMBOL_GPL(filemap_migrate_folio);
/*
- * Writeback a folio to clean the dirty state
- */
-static int writeout(struct address_space *mapping, struct folio *folio)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = 1,
- .range_start = 0,
- .range_end = LLONG_MAX,
- .for_reclaim = 1
- };
- int rc;
-
- if (!mapping->a_ops->writepage)
- /* No write method for the address space */
- return -EINVAL;
-
- if (!folio_clear_dirty_for_io(folio))
- /* Someone else already triggered a write */
- return -EAGAIN;
-
- /*
- * A dirty folio may imply that the underlying filesystem has
- * the folio on some queue. So the folio must be clean for
- * migration. Writeout may mean we lose the lock and the
- * folio state is no longer what we checked for earlier.
- * At this point we know that the migration attempt cannot
- * be successful.
- */
- remove_migration_ptes(folio, folio, 0);
-
- rc = mapping->a_ops->writepage(&folio->page, &wbc);
-
- if (rc != AOP_WRITEPAGE_ACTIVATE)
- /* unlocked. Relock */
- folio_lock(folio);
-
- return (rc < 0) ? -EIO : -EAGAIN;
-}
-
-/*
* Default handling if a filesystem does not provide a migration function.
*/
static int fallback_migrate_folio(struct address_space *mapping,
struct folio *dst, struct folio *src, enum migrate_mode mode)
{
- if (folio_test_dirty(src)) {
- /* Only writeback folios in full synchronous migration */
- switch (mode) {
- case MIGRATE_SYNC:
- break;
- default:
- return -EBUSY;
- }
- return writeout(mapping, src);
- }
+ WARN_ONCE(mapping->a_ops->writepages,
+ "%ps does not implement migrate_folio\n",
+ mapping->a_ops);
+ if (folio_test_dirty(src))
+ return -EBUSY;
/*
- * Buffers may be managed in a filesystem specific way.
- * We must have no buffers or drop them.
+ * Filesystem may have private data at folio->private that we
+ * can't migrate automatically.
*/
if (!filemap_release_folio(src, GFP_KERNEL))
return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
@@ -1023,91 +1061,54 @@ static int fallback_migrate_folio(struct address_space *mapping,
}
/*
- * Move a page to a newly allocated page
- * The page is locked and all ptes have been successfully removed.
+ * Move a src folio to a newly allocated dst folio.
+ *
+ * The src and dst folios are locked and the src folio was unmapped from
+ * the page tables.
*
- * The new page will have replaced the old page if this function
- * is successful.
+ * On success, the src folio was replaced by the dst folio.
*
* Return value:
* < 0 - error code
- * MIGRATEPAGE_SUCCESS - success
+ * 0 - success
*/
static int move_to_new_folio(struct folio *dst, struct folio *src,
enum migrate_mode mode)
{
+ struct address_space *mapping = folio_mapping(src);
int rc = -EAGAIN;
- bool is_lru = !__folio_test_movable(src);
VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
- if (likely(is_lru)) {
- struct address_space *mapping = folio_mapping(src);
-
- if (!mapping)
- rc = migrate_folio(mapping, dst, src, mode);
- else if (mapping_inaccessible(mapping))
- rc = -EOPNOTSUPP;
- else if (mapping->a_ops->migrate_folio)
- /*
- * Most folios have a mapping and most filesystems
- * provide a migrate_folio callback. Anonymous folios
- * are part of swap space which also has its own
- * migrate_folio callback. This is the most common path
- * for page migration.
- */
- rc = mapping->a_ops->migrate_folio(mapping, dst, src,
- mode);
- else
- rc = fallback_migrate_folio(mapping, dst, src, mode);
- } else {
- const struct movable_operations *mops;
-
+ if (!mapping)
+ rc = migrate_folio(mapping, dst, src, mode);
+ else if (mapping_inaccessible(mapping))
+ rc = -EOPNOTSUPP;
+ else if (mapping->a_ops->migrate_folio)
/*
- * In case of non-lru page, it could be released after
- * isolation step. In that case, we shouldn't try migration.
+ * Most folios have a mapping and most filesystems
+ * provide a migrate_folio callback. Anonymous folios
+ * are part of swap space which also has its own
+ * migrate_folio callback. This is the most common path
+ * for page migration.
*/
- VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
- if (!folio_test_movable(src)) {
- rc = MIGRATEPAGE_SUCCESS;
- folio_clear_isolated(src);
- goto out;
- }
-
- mops = folio_movable_ops(src);
- rc = mops->migrate_page(&dst->page, &src->page, mode);
- WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
- !folio_test_isolated(src));
- }
-
- /*
- * When successful, old pagecache src->mapping must be cleared before
- * src is freed; but stats require that PageAnon be left as PageAnon.
- */
- if (rc == MIGRATEPAGE_SUCCESS) {
- if (__folio_test_movable(src)) {
- VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
-
- /*
- * We clear PG_movable under page_lock so any compactor
- * cannot try to migrate this page.
- */
- folio_clear_isolated(src);
- }
+ rc = mapping->a_ops->migrate_folio(mapping, dst, src,
+ mode);
+ else
+ rc = fallback_migrate_folio(mapping, dst, src, mode);
+ if (!rc) {
/*
- * Anonymous and movable src->mapping will be cleared by
- * free_pages_prepare so don't reset it here for keeping
- * the type to work PageAnon, for example.
+ * For pagecache folios, src->mapping must be cleared before src
+ * is freed. Anonymous folios must stay anonymous until freed.
*/
- if (!folio_mapping_flags(src))
+ if (!folio_test_anon(src))
src->mapping = NULL;
if (likely(!folio_is_zone_device(dst)))
flush_dcache_folio(dst);
}
-out:
return rc;
}
@@ -1174,12 +1175,7 @@ static void migrate_folio_undo_dst(struct folio *dst, bool locked,
static void migrate_folio_done(struct folio *src,
enum migrate_reason reason)
{
- /*
- * Compaction can migrate also non-LRU pages which are
- * not accounted to NR_ISOLATED_*. They can be recognized
- * as __folio_test_movable
- */
- if (likely(!__folio_test_movable(src)) && reason != MR_DEMOTION)
+ if (likely(!page_has_movable_ops(&src->page)) && reason != MR_DEMOTION)
mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
folio_is_file_lru(src), -folio_nr_pages(src));
@@ -1192,26 +1188,15 @@ static void migrate_folio_done(struct folio *src,
static int migrate_folio_unmap(new_folio_t get_new_folio,
free_folio_t put_new_folio, unsigned long private,
struct folio *src, struct folio **dstp, enum migrate_mode mode,
- enum migrate_reason reason, struct list_head *ret)
+ struct list_head *ret)
{
struct folio *dst;
int rc = -EAGAIN;
int old_page_state = 0;
struct anon_vma *anon_vma = NULL;
- bool is_lru = data_race(!__folio_test_movable(src));
bool locked = false;
bool dst_locked = false;
- if (folio_ref_count(src) == 1) {
- /* Folio was freed from under us. So we are done. */
- folio_clear_active(src);
- folio_clear_unevictable(src);
- /* free_pages_prepare() will clear PG_isolated. */
- list_del(&src->lru);
- migrate_folio_done(src, reason);
- return MIGRATEPAGE_SUCCESS;
- }
-
dst = get_new_folio(src, private);
if (!dst)
return -ENOMEM;
@@ -1299,9 +1284,9 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
goto out;
dst_locked = true;
- if (unlikely(!is_lru)) {
+ if (unlikely(page_has_movable_ops(&src->page))) {
__migrate_folio_record(dst, old_page_state, anon_vma);
- return MIGRATEPAGE_UNMAP;
+ return 0;
}
/*
@@ -1331,7 +1316,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
if (!folio_mapped(src)) {
__migrate_folio_record(dst, old_page_state, anon_vma);
- return MIGRATEPAGE_UNMAP;
+ return 0;
}
out:
@@ -1358,20 +1343,23 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
int rc;
int old_page_state = 0;
struct anon_vma *anon_vma = NULL;
- bool is_lru = !__folio_test_movable(src);
struct list_head *prev;
__migrate_folio_extract(dst, &old_page_state, &anon_vma);
prev = dst->lru.prev;
list_del(&dst->lru);
+ if (unlikely(page_has_movable_ops(&src->page))) {
+ rc = migrate_movable_ops_page(&dst->page, &src->page, mode);
+ if (rc)
+ goto out;
+ goto out_unlock_both;
+ }
+
rc = move_to_new_folio(dst, src, mode);
if (rc)
goto out;
- if (unlikely(!is_lru))
- goto out_unlock_both;
-
/*
* When successful, push dst to LRU immediately: so that if it
* turns out to be an mlocked page, remove_migration_ptes() will
@@ -1390,7 +1378,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
out_unlock_both:
folio_unlock(dst);
- set_page_owner_migrate_reason(&dst->page, reason);
+ folio_set_owner_migrate_reason(dst, reason);
/*
* If migration is successful, decrease refcount of dst,
* which will not free the page because new page owner increased
@@ -1459,8 +1447,8 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
if (folio_ref_count(src) == 1) {
/* page was freed from under us. So we are done. */
- folio_putback_active_hugetlb(src);
- return MIGRATEPAGE_SUCCESS;
+ folio_putback_hugetlb(src);
+ return 0;
}
dst = get_new_folio(src, private);
@@ -1523,8 +1511,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
rc = move_to_new_folio(dst, src, mode);
if (page_was_mapped)
- remove_migration_ptes(src,
- rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
+ remove_migration_ptes(src, !rc ? dst : src, 0);
unlock_put_anon:
folio_unlock(dst);
@@ -1533,7 +1520,7 @@ put_anon:
if (anon_vma)
put_anon_vma(anon_vma);
- if (rc == MIGRATEPAGE_SUCCESS) {
+ if (!rc) {
move_hugetlb_state(src, dst, reason);
put_new_folio = NULL;
}
@@ -1541,20 +1528,20 @@ put_anon:
out_unlock:
folio_unlock(src);
out:
- if (rc == MIGRATEPAGE_SUCCESS)
- folio_putback_active_hugetlb(src);
+ if (!rc)
+ folio_putback_hugetlb(src);
else if (rc != -EAGAIN)
list_move_tail(&src->lru, ret);
/*
- * If migration was not successful and there's a freeing callback, use
- * it. Otherwise, put_page() will drop the reference grabbed during
- * isolation.
+ * If migration was not successful and there's a freeing callback,
+ * return the folio to that special allocator. Otherwise, simply drop
+ * our additional reference.
*/
if (put_new_folio)
put_new_folio(dst, private);
else
- folio_putback_active_hugetlb(dst);
+ folio_put(dst);
return rc;
}
@@ -1651,7 +1638,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
reason, ret_folios);
/*
* The rules are:
- * Success: hugetlb folio will be put back
+ * 0: hugetlb folio will be put back
* -EAGAIN: stay on the from list
* -ENOMEM: stay on the from list
* Other errno: put on ret_folios list
@@ -1668,7 +1655,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
retry++;
nr_retry_pages += nr_pages;
break;
- case MIGRATEPAGE_SUCCESS:
+ case 0:
stats->nr_succeeded += nr_pages;
break;
default:
@@ -1695,6 +1682,81 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
return nr_failed;
}
+static void migrate_folios_move(struct list_head *src_folios,
+ struct list_head *dst_folios,
+ free_folio_t put_new_folio, unsigned long private,
+ enum migrate_mode mode, int reason,
+ struct list_head *ret_folios,
+ struct migrate_pages_stats *stats,
+ int *retry, int *thp_retry, int *nr_failed,
+ int *nr_retry_pages)
+{
+ struct folio *folio, *folio2, *dst, *dst2;
+ bool is_thp;
+ int nr_pages;
+ int rc;
+
+ dst = list_first_entry(dst_folios, struct folio, lru);
+ dst2 = list_next_entry(dst, lru);
+ list_for_each_entry_safe(folio, folio2, src_folios, lru) {
+ is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
+ nr_pages = folio_nr_pages(folio);
+
+ cond_resched();
+
+ rc = migrate_folio_move(put_new_folio, private,
+ folio, dst, mode,
+ reason, ret_folios);
+ /*
+ * The rules are:
+ * 0: folio will be freed
+ * -EAGAIN: stay on the unmap_folios list
+ * Other errno: put on ret_folios list
+ */
+ switch (rc) {
+ case -EAGAIN:
+ *retry += 1;
+ *thp_retry += is_thp;
+ *nr_retry_pages += nr_pages;
+ break;
+ case 0:
+ stats->nr_succeeded += nr_pages;
+ stats->nr_thp_succeeded += is_thp;
+ break;
+ default:
+ *nr_failed += 1;
+ stats->nr_thp_failed += is_thp;
+ stats->nr_failed_pages += nr_pages;
+ break;
+ }
+ dst = dst2;
+ dst2 = list_next_entry(dst, lru);
+ }
+}
+
+static void migrate_folios_undo(struct list_head *src_folios,
+ struct list_head *dst_folios,
+ free_folio_t put_new_folio, unsigned long private,
+ struct list_head *ret_folios)
+{
+ struct folio *folio, *folio2, *dst, *dst2;
+
+ dst = list_first_entry(dst_folios, struct folio, lru);
+ dst2 = list_next_entry(dst, lru);
+ list_for_each_entry_safe(folio, folio2, src_folios, lru) {
+ int old_page_state = 0;
+ struct anon_vma *anon_vma = NULL;
+
+ __migrate_folio_extract(dst, &old_page_state, &anon_vma);
+ migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
+ anon_vma, true, ret_folios);
+ list_del(&dst->lru);
+ migrate_folio_undo_dst(dst, true, put_new_folio, private);
+ dst = dst2;
+ dst2 = list_next_entry(dst, lru);
+ }
+}
+
/*
* migrate_pages_batch() first unmaps folios in the from list as many as
* possible, then move the unmapped folios.
@@ -1717,7 +1779,7 @@ static int migrate_pages_batch(struct list_head *from,
int pass = 0;
bool is_thp = false;
bool is_large = false;
- struct folio *folio, *folio2, *dst = NULL, *dst2;
+ struct folio *folio, *folio2, *dst = NULL;
int rc, rc_saved = 0, nr_pages;
LIST_HEAD(unmap_folios);
LIST_HEAD(dst_folios);
@@ -1796,14 +1858,27 @@ static int migrate_pages_batch(struct list_head *from,
continue;
}
+ /*
+ * If we are holding the last folio reference, the folio
+ * was freed from under us, so just drop our reference.
+ */
+ if (likely(!page_has_movable_ops(&folio->page)) &&
+ folio_ref_count(folio) == 1) {
+ folio_clear_active(folio);
+ folio_clear_unevictable(folio);
+ list_del(&folio->lru);
+ migrate_folio_done(folio, reason);
+ stats->nr_succeeded += nr_pages;
+ stats->nr_thp_succeeded += is_thp;
+ continue;
+ }
+
rc = migrate_folio_unmap(get_new_folio, put_new_folio,
- private, folio, &dst, mode, reason,
- ret_folios);
+ private, folio, &dst, mode, ret_folios);
/*
* The rules are:
- * Success: folio will be freed
- * Unmap: folio will be put on unmap_folios list,
- * dst folio put on dst_folios list
+ * 0: folio will be put on unmap_folios list,
+ * dst folio put on dst_folios list
* -EAGAIN: stay on the from list
* -ENOMEM: stay on the from list
* Other errno: put on ret_folios list
@@ -1853,11 +1928,7 @@ static int migrate_pages_batch(struct list_head *from,
thp_retry += is_thp;
nr_retry_pages += nr_pages;
break;
- case MIGRATEPAGE_SUCCESS:
- stats->nr_succeeded += nr_pages;
- stats->nr_thp_succeeded += is_thp;
- break;
- case MIGRATEPAGE_UNMAP:
+ case 0:
list_move_tail(&folio->lru, &unmap_folios);
list_add_tail(&dst->lru, &dst_folios);
break;
@@ -1888,42 +1959,11 @@ move:
thp_retry = 0;
nr_retry_pages = 0;
- dst = list_first_entry(&dst_folios, struct folio, lru);
- dst2 = list_next_entry(dst, lru);
- list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
- is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
- nr_pages = folio_nr_pages(folio);
-
- cond_resched();
-
- rc = migrate_folio_move(put_new_folio, private,
- folio, dst, mode,
- reason, ret_folios);
- /*
- * The rules are:
- * Success: folio will be freed
- * -EAGAIN: stay on the unmap_folios list
- * Other errno: put on ret_folios list
- */
- switch(rc) {
- case -EAGAIN:
- retry++;
- thp_retry += is_thp;
- nr_retry_pages += nr_pages;
- break;
- case MIGRATEPAGE_SUCCESS:
- stats->nr_succeeded += nr_pages;
- stats->nr_thp_succeeded += is_thp;
- break;
- default:
- nr_failed++;
- stats->nr_thp_failed += is_thp;
- stats->nr_failed_pages += nr_pages;
- break;
- }
- dst = dst2;
- dst2 = list_next_entry(dst, lru);
- }
+ /* Move the unmapped folios */
+ migrate_folios_move(&unmap_folios, &dst_folios,
+ put_new_folio, private, mode, reason,
+ ret_folios, stats, &retry, &thp_retry,
+ &nr_failed, &nr_retry_pages);
}
nr_failed += retry;
stats->nr_thp_failed += thp_retry;
@@ -1932,20 +1972,8 @@ move:
rc = rc_saved ? : nr_failed;
out:
/* Cleanup remaining folios */
- dst = list_first_entry(&dst_folios, struct folio, lru);
- dst2 = list_next_entry(dst, lru);
- list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
- int old_page_state = 0;
- struct anon_vma *anon_vma = NULL;
-
- __migrate_folio_extract(dst, &old_page_state, &anon_vma);
- migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
- anon_vma, true, ret_folios);
- list_del(&dst->lru);
- migrate_folio_undo_dst(dst, true, put_new_folio, private);
- dst = dst2;
- dst2 = list_next_entry(dst, lru);
- }
+ migrate_folios_undo(&unmap_folios, &dst_folios,
+ put_new_folio, private, ret_folios);
return rc;
}
@@ -2204,11 +2232,11 @@ static int __add_folio_for_migration(struct folio *folio, int node,
if (folio_nid(folio) == node)
return 0;
- if (folio_likely_mapped_shared(folio) && !migrate_all)
+ if (folio_maybe_mapped_shared(folio) && !migrate_all)
return -EACCES;
if (folio_test_hugetlb(folio)) {
- if (isolate_hugetlb(folio, pagelist))
+ if (folio_isolate_hugetlb(folio, pagelist))
return 1;
} else if (folio_isolate_lru(folio)) {
list_add_tail(&folio->lru, pagelist);
@@ -2354,13 +2382,6 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
}
/*
- * The move_pages() man page does not have an -EEXIST choice, so
- * use -EFAULT instead.
- */
- if (err == -EEXIST)
- err = -EFAULT;
-
- /*
* If the page is already on the target node (!err), store the
* node, otherwise, store the err.
*/
@@ -2434,6 +2455,7 @@ set_status:
static int get_compat_pages_array(const void __user *chunk_pages[],
const void __user * __user *pages,
+ unsigned long chunk_offset,
unsigned long chunk_nr)
{
compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
@@ -2441,7 +2463,7 @@ static int get_compat_pages_array(const void __user *chunk_pages[],
int i;
for (i = 0; i < chunk_nr; i++) {
- if (get_user(p, pages32 + i))
+ if (get_user(p, pages32 + chunk_offset + i))
return -EFAULT;
chunk_pages[i] = compat_ptr(p);
}
@@ -2460,27 +2482,28 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
#define DO_PAGES_STAT_CHUNK_NR 16UL
const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
int chunk_status[DO_PAGES_STAT_CHUNK_NR];
+ unsigned long chunk_offset = 0;
while (nr_pages) {
unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);
if (in_compat_syscall()) {
if (get_compat_pages_array(chunk_pages, pages,
- chunk_nr))
+ chunk_offset, chunk_nr))
break;
} else {
- if (copy_from_user(chunk_pages, pages,
+ if (copy_from_user(chunk_pages, pages + chunk_offset,
chunk_nr * sizeof(*chunk_pages)))
break;
}
do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
- if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
+ if (copy_to_user(status + chunk_offset, chunk_status,
+ chunk_nr * sizeof(*status)))
break;
- pages += chunk_nr;
- status += chunk_nr;
+ chunk_offset += chunk_nr;
nr_pages -= chunk_nr;
}
return nr_pages ? -EFAULT : 0;
@@ -2629,11 +2652,10 @@ int migrate_misplaced_folio_prepare(struct folio *folio,
* processes with execute permissions as they are probably
* shared libraries.
*
- * See folio_likely_mapped_shared() on possible imprecision
+ * See folio_maybe_mapped_shared() on possible imprecision
* when we cannot easily detect if a folio is shared.
*/
- if ((vma->vm_flags & VM_EXEC) &&
- folio_likely_mapped_shared(folio))
+ if ((vma->vm_flags & VM_EXEC) && folio_maybe_mapped_shared(folio))
return -EACCES;
/*
@@ -2683,8 +2705,7 @@ int migrate_misplaced_folio_prepare(struct folio *folio,
* elevated reference count on the folio. This function will un-isolate the
* folio, dereferencing the folio before returning.
*/
-int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
- int node)
+int migrate_misplaced_folio(struct folio *folio, int node)
{
pg_data_t *pgdat = NODE_DATA(node);
int nr_remaining;
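/*
 * Illustrative sketch, not part of the patch above: the shape of a
 * movable_ops ->migrate_page callback under the 0-on-success convention
 * that migrate_movable_ops_page() now relies on.  The "my_dev" names are
 * hypothetical; the in-tree users of struct movable_operations (balloon,
 * zsmalloc) follow the same pattern.
 */
#include <linux/migrate.h>
#include <linux/mm.h>

static int my_dev_migrate_page(struct page *dst, struct page *src,
			       enum migrate_mode mode)
{
	/*
	 * Transfer whatever backend-private bookkeeping identifies src over
	 * to dst; a balloon-style backend updates its own lists here rather
	 * than copying page contents.
	 */
	set_page_private(dst, page_private(src));
	set_page_private(src, 0);

	/* 0 (not MIGRATEPAGE_SUCCESS) reports success to migration core. */
	return 0;
}

static const struct movable_operations my_dev_movable_ops = {
	.migrate_page	= my_dev_migrate_page,
};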
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 5bd888223cc8..abd9f6850db6 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
struct mm_walk *walk)
{
struct migrate_vma *migrate = walk->private;
+ struct folio *fault_folio = migrate->fault_page ?
+ page_folio(migrate->fault_page) : NULL;
struct vm_area_struct *vma = walk->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = start, unmapped = 0;
@@ -88,11 +90,16 @@ again:
folio_get(folio);
spin_unlock(ptl);
+ /* FIXME: we don't expect THP for fault_folio */
+ if (WARN_ON_ONCE(fault_folio == folio))
+ return migrate_vma_collect_skip(start, end,
+ walk);
if (unlikely(!folio_trylock(folio)))
return migrate_vma_collect_skip(start, end,
walk);
ret = split_folio(folio);
- folio_unlock(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
folio_put(folio);
if (ret)
return migrate_vma_collect_skip(start, end,
@@ -106,6 +113,7 @@ again:
arch_enter_lazy_mmu_mode();
for (; addr < end; addr += PAGE_SIZE, ptep++) {
+ struct dev_pagemap *pgmap;
unsigned long mpfn = 0, pfn;
struct folio *folio;
struct page *page;
@@ -133,9 +141,10 @@ again:
goto next;
page = pfn_swap_entry_to_page(entry);
+ pgmap = page_pgmap(page);
if (!(migrate->flags &
MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
- page->pgmap->owner != migrate->pgmap_owner)
+ pgmap->owner != migrate->pgmap_owner)
goto next;
mpfn = migrate_pfn(page_to_pfn(page)) |
@@ -152,12 +161,16 @@ again:
}
page = vm_normal_page(migrate->vma, addr, pte);
if (page && !is_zone_device_page(page) &&
- !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
- goto next;
- else if (page && is_device_coherent_page(page) &&
- (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
- page->pgmap->owner != migrate->pgmap_owner))
+ !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
goto next;
+ } else if (page && is_device_coherent_page(page)) {
+ pgmap = page_pgmap(page);
+
+ if (!(migrate->flags &
+ MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+ pgmap->owner != migrate->pgmap_owner)
+ goto next;
+ }
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
@@ -192,7 +205,7 @@ again:
* optimisation to avoid walking the rmap later with
* try_to_migrate().
*/
- if (folio_trylock(folio)) {
+ if (fault_folio == folio || folio_trylock(folio)) {
bool anon_exclusive;
pte_t swp_pte;
@@ -204,7 +217,8 @@ again:
if (folio_try_share_anon_rmap_pte(folio, page)) {
set_pte_at(mm, addr, ptep, pte);
- folio_unlock(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
folio_put(folio);
mpfn = 0;
goto next;
@@ -363,6 +377,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
unsigned long npages,
struct page *fault_page)
{
+ struct folio *fault_folio = fault_page ?
+ page_folio(fault_page) : NULL;
unsigned long i, restore = 0;
bool allow_drain = true;
unsigned long unmapped = 0;
@@ -427,7 +443,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
remove_migration_ptes(folio, folio, 0);
src_pfns[i] = 0;
- folio_unlock(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
folio_put(folio);
restore--;
}
@@ -536,6 +553,8 @@ int migrate_vma_setup(struct migrate_vma *args)
return -EINVAL;
if (args->fault_page && !is_device_private_page(args->fault_page))
return -EINVAL;
+ if (args->fault_page && !PageLocked(args->fault_page))
+ return -EINVAL;
memset(args->src, 0, sizeof(*args->src) * nr_pages);
args->cpages = 0;
@@ -596,7 +615,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
pmdp = pmd_alloc(mm, pudp, addr);
if (!pmdp)
goto abort;
- if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
+ if (pmd_trans_huge(*pmdp))
goto abort;
if (pte_alloc(mm, pmdp))
goto abort;
@@ -759,7 +778,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
if (migrate && migrate->fault_page == page)
extra_cnt = 1;
r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt);
- if (r != MIGRATEPAGE_SUCCESS)
+ if (r)
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
else
folio_migrate_flags(newfolio, folio);
@@ -799,19 +818,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
}
EXPORT_SYMBOL(migrate_vma_pages);
-/*
- * migrate_device_finalize() - complete page migration
- * @src_pfns: src_pfns returned from migrate_device_range()
- * @dst_pfns: array of pfns allocated by the driver to migrate memory to
- * @npages: number of pages in the range
- *
- * Completes migration of the page by removing special migration entries.
- * Drivers must ensure copying of page data is complete and visible to the CPU
- * before calling this.
- */
-void migrate_device_finalize(unsigned long *src_pfns,
- unsigned long *dst_pfns, unsigned long npages)
+static void __migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns,
+ unsigned long npages,
+ struct page *fault_page)
{
+ struct folio *fault_folio = fault_page ?
+ page_folio(fault_page) : NULL;
unsigned long i;
for (i = 0; i < npages; i++) {
@@ -824,6 +837,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
if (!page) {
if (dst) {
+ WARN_ON_ONCE(fault_folio == dst);
folio_unlock(dst);
folio_put(dst);
}
@@ -834,6 +848,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
if (dst) {
+ WARN_ON_ONCE(fault_folio == dst);
folio_unlock(dst);
folio_put(dst);
}
@@ -843,15 +858,33 @@ void migrate_device_finalize(unsigned long *src_pfns,
if (!folio_is_zone_device(dst))
folio_add_lru(dst);
remove_migration_ptes(src, dst, 0);
- folio_unlock(src);
+ if (fault_folio != src)
+ folio_unlock(src);
folio_put(src);
if (dst != src) {
+ WARN_ON_ONCE(fault_folio == dst);
folio_unlock(dst);
folio_put(dst);
}
}
}
+
+/*
+ * migrate_device_finalize() - complete page migration
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
+ *
+ * Completes migration of the page by removing special migration entries.
+ * Drivers must ensure copying of page data is complete and visible to the CPU
+ * before calling this.
+ */
+void migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages)
+{
+ return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
+}
EXPORT_SYMBOL(migrate_device_finalize);
/**
@@ -867,10 +900,27 @@ EXPORT_SYMBOL(migrate_device_finalize);
*/
void migrate_vma_finalize(struct migrate_vma *migrate)
{
- migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
+ __migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
+ migrate->fault_page);
}
EXPORT_SYMBOL(migrate_vma_finalize);
+static unsigned long migrate_device_pfn_lock(unsigned long pfn)
+{
+ struct folio *folio;
+
+ folio = folio_get_nontail_page(pfn_to_page(pfn));
+ if (!folio)
+ return 0;
+
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ return 0;
+ }
+
+ return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+}
+
/**
* migrate_device_range() - migrate device private pfns to normal memory.
* @src_pfns: array large enough to hold migrating source device private pfns.
@@ -895,29 +945,35 @@ int migrate_device_range(unsigned long *src_pfns, unsigned long start,
{
unsigned long i, pfn;
- for (pfn = start, i = 0; i < npages; pfn++, i++) {
- struct folio *folio;
+ for (pfn = start, i = 0; i < npages; pfn++, i++)
+ src_pfns[i] = migrate_device_pfn_lock(pfn);
- folio = folio_get_nontail_page(pfn_to_page(pfn));
- if (!folio) {
- src_pfns[i] = 0;
- continue;
- }
+ migrate_device_unmap(src_pfns, npages, NULL);
- if (!folio_trylock(folio)) {
- src_pfns[i] = 0;
- folio_put(folio);
- continue;
- }
+ return 0;
+}
+EXPORT_SYMBOL(migrate_device_range);
- src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
- }
+/**
+ * migrate_device_pfns() - migrate device private pfns to normal memory.
+ * @src_pfns: pre-populated array of source device private pfns to migrate.
+ * @npages: number of pages to migrate.
+ *
+ * Similar to migrate_device_range() but supports non-contiguous pre-populated
+ * array of device pages to migrate.
+ */
+int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages)
+{
+ unsigned long i;
+
+ for (i = 0; i < npages; i++)
+ src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]);
migrate_device_unmap(src_pfns, npages, NULL);
return 0;
}
-EXPORT_SYMBOL(migrate_device_range);
+EXPORT_SYMBOL(migrate_device_pfns);
/*
* Migrate a device coherent folio back to normal memory. The caller should have
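/*
 * Illustrative sketch, not part of the patch above, of how a device driver
 * might drive the new migrate_device_pfns() entry point for a scattered set
 * of device-private pages (migrate_device_range() remains the helper for a
 * contiguous pfn range).  "my_drv_evict_pfns" is hypothetical, and the
 * driver-specific allocation/copy of destination pages is reduced to a
 * comment.
 */
#include <linux/migrate.h>

static void my_drv_evict_pfns(unsigned long *src_pfns, unsigned long *dst_pfns,
			      unsigned long npages)
{
	unsigned long i;

	/* Lock and unmap every device-private page named in src_pfns. */
	migrate_device_pfns(src_pfns, npages);

	/*
	 * The driver would now allocate system pages and copy data for each
	 * entry that still has MIGRATE_PFN_MIGRATE set, filling dst_pfns
	 * with migrate_pfn() values; entries that lost the race get no
	 * destination.
	 */
	for (i = 0; i < npages; i++)
		if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
			dst_pfns[i] = 0;

	migrate_device_pages(src_pfns, dst_pfns, npages);
	migrate_device_finalize(src_pfns, dst_pfns, npages);
}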
diff --git a/mm/mincore.c b/mm/mincore.c
index d6bd19e520fc..10dabefc3acc 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -21,6 +21,7 @@
#include <linux/uaccess.h>
#include "swap.h"
+#include "internal.h"
static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -28,7 +29,9 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
#ifdef CONFIG_HUGETLB_PAGE
unsigned char present;
unsigned char *vec = walk->private;
+ spinlock_t *ptl;
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
/*
* Hugepages under user process are always in RAM and never
* swapped out, but theoretically it needs to be checked.
@@ -37,6 +40,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
for (; addr != end; vec++, addr += PAGE_SIZE)
*vec = present;
walk->private = vec;
+ spin_unlock(ptl);
#else
BUG();
#endif
@@ -105,6 +109,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *ptep;
unsigned char *vec = walk->private;
int nr = (end - addr) >> PAGE_SHIFT;
+ int step, i;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -118,16 +123,26 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
walk->action = ACTION_AGAIN;
return 0;
}
- for (; addr != end; ptep++, addr += PAGE_SIZE) {
+ for (; addr != end; ptep += step, addr += step * PAGE_SIZE) {
pte_t pte = ptep_get(ptep);
+ step = 1;
/* We need to do cache lookup too for pte markers */
if (pte_none_mostly(pte))
__mincore_unmapped_range(addr, addr + PAGE_SIZE,
vma, vec);
- else if (pte_present(pte))
- *vec = 1;
- else { /* pte is a swap entry */
+ else if (pte_present(pte)) {
+ unsigned int batch = pte_batch_hint(ptep, pte);
+
+ if (batch > 1) {
+ unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
+
+ step = min_t(unsigned int, batch, max_nr);
+ }
+
+ for (i = 0; i < step; i++)
+ vec[i] = 1;
+ } else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte);
if (non_swap_entry(entry)) {
@@ -146,7 +161,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
#endif
}
}
- vec++;
+ vec += step;
}
pte_unmap_unlock(ptep - 1, ptl);
out:
@@ -239,7 +254,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
start = untagged_addr(start);
/* Check the start address: needs to be page-aligned.. */
- if (start & ~PAGE_MASK)
+ if (unlikely(start & ~PAGE_MASK))
return -EINVAL;
/* ..and we need to be passed a valid user-space range */
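/*
 * Userspace view of the interface touched above, as a self-contained sketch
 * (not part of the patch): mincore(2) rejects an unaligned start address --
 * the unlikely(start & ~PAGE_MASK) check -- and reports one byte per page,
 * which the batched PTE walk above now fills in larger steps for
 * contiguously mapped present pages.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	size_t len = 16 * psize;
	unsigned char *vec = malloc(len / psize);
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED || !vec)
		return 1;

	memset(buf, 1, len / 2);		/* fault in the first half */
	if (mincore(buf, len, vec) == 0) {	/* buf is page-aligned */
		for (size_t i = 0; i < len / psize; i++)
			printf("page %zu: %s\n", i,
			       (vec[i] & 1) ? "resident" : "not resident");
	}
	munmap(buf, len);
	free(vec);
	return 0;
}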
diff --git a/mm/mlock.c b/mm/mlock.c
index cde076fa7d5e..bb0776f5ef7c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -255,7 +255,7 @@ void mlock_folio(struct folio *folio)
folio_get(folio);
if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
- folio_test_large(folio) || lru_cache_disabled())
+ !folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
local_unlock(&mlock_fbatch.lock);
}
@@ -278,7 +278,7 @@ void mlock_new_folio(struct folio *folio)
folio_get(folio);
if (!folio_batch_add(fbatch, mlock_new(folio)) ||
- folio_test_large(folio) || lru_cache_disabled())
+ !folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
local_unlock(&mlock_fbatch.lock);
}
@@ -299,7 +299,7 @@ void munlock_folio(struct folio *folio)
*/
folio_get(folio);
if (!folio_batch_add(fbatch, folio) ||
- folio_test_large(folio) || lru_cache_disabled())
+ !folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
local_unlock(&mlock_fbatch.lock);
}
@@ -307,15 +307,13 @@ void munlock_folio(struct folio *folio)
static inline unsigned int folio_mlock_step(struct folio *folio,
pte_t *pte, unsigned long addr, unsigned long end)
{
- const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
unsigned int count = (end - addr) >> PAGE_SHIFT;
pte_t ptent = ptep_get(pte);
if (!folio_test_large(folio))
return 1;
- return folio_pte_batch(folio, addr, pte, ptent, count, fpb_flags, NULL,
- NULL, NULL);
+ return folio_pte_batch(folio, pte, ptent, count);
}
static inline bool allow_mlock_munlock(struct folio *folio,
@@ -368,6 +366,8 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
if (is_huge_zero_pmd(*pmd))
goto out;
folio = pmd_folio(*pmd);
+ if (folio_is_zone_device(folio))
+ goto out;
if (vma->vm_flags & VM_LOCKED)
mlock_folio(folio);
else
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 24b68b425afb..5f1c1096a13f 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -30,12 +30,29 @@
#include <linux/crash_dump.h>
#include <linux/execmem.h>
#include <linux/vmstat.h>
+#include <linux/kexec_handover.h>
+#include <linux/hugetlb.h>
#include "internal.h"
#include "slab.h"
#include "shuffle.h"
#include <asm/setup.h>
+#ifndef CONFIG_NUMA
+unsigned long max_mapnr;
+EXPORT_SYMBOL(max_mapnr);
+
+struct page *mem_map;
+EXPORT_SYMBOL(mem_map);
+#endif
+
+/*
+ * high_memory defines the upper bound on direct map memory, the end
+ * of ZONE_NORMAL.
+ */
+void *high_memory;
+EXPORT_SYMBOL(high_memory);
+
#ifdef CONFIG_DEBUG_MEMORY_INIT
int __meminitdata mminit_loglevel;
@@ -438,7 +455,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
* was requested by the user
*/
required_movablecore =
- roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+ round_up(required_movablecore, MAX_ORDER_NR_PAGES);
required_movablecore = min(totalpages, required_movablecore);
corepages = totalpages - required_movablecore;
@@ -545,11 +562,11 @@ restart:
out2:
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
- for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ for_each_node_state(nid, N_MEMORY) {
unsigned long start_pfn, end_pfn;
zone_movable_pfn[nid] =
- roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+ round_up(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
if (zone_movable_pfn[nid] >= end_pfn)
@@ -649,6 +666,29 @@ static inline void fixup_hashdist(void)
static inline void fixup_hashdist(void) {}
#endif /* CONFIG_NUMA */
+/*
+ * Initialize a reserved page unconditionally, finding its zone first.
+ */
+void __meminit __init_page_from_nid(unsigned long pfn, int nid)
+{
+ pg_data_t *pgdat;
+ int zid;
+
+ pgdat = NODE_DATA(nid);
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct zone *zone = &pgdat->node_zones[zid];
+
+ if (zone_spans_pfn(zone, pfn))
+ break;
+ }
+ __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
+
+ if (pageblock_aligned(pfn))
+ init_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE,
+ false);
+}
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
{
@@ -705,26 +745,12 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static void __meminit init_reserved_page(unsigned long pfn, int nid)
+static void __meminit __init_deferred_page(unsigned long pfn, int nid)
{
- pg_data_t *pgdat;
- int zid;
-
if (early_page_initialised(pfn, nid))
return;
- pgdat = NODE_DATA(nid);
-
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- struct zone *zone = &pgdat->node_zones[zid];
-
- if (zone_spans_pfn(zone, pfn))
- break;
- }
- __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
-
- if (pageblock_aligned(pfn))
- set_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE);
+ __init_page_from_nid(pfn, nid);
}
#else
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
@@ -739,11 +765,16 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static inline void init_reserved_page(unsigned long pfn, int nid)
+static inline void __init_deferred_page(unsigned long pfn, int nid)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+void __meminit init_deferred_page(unsigned long pfn, int nid)
+{
+ __init_deferred_page(pfn, nid);
+}
+
/*
* Initialised pages do not have PageReserved set. This function is
* called for each range allocated by the bootmem allocator and
@@ -753,22 +784,19 @@ static inline void init_reserved_page(unsigned long pfn, int nid)
void __meminit reserve_bootmem_region(phys_addr_t start,
phys_addr_t end, int nid)
{
- unsigned long start_pfn = PFN_DOWN(start);
- unsigned long end_pfn = PFN_UP(end);
+ unsigned long pfn;
- for (; start_pfn < end_pfn; start_pfn++) {
- if (pfn_valid(start_pfn)) {
- struct page *page = pfn_to_page(start_pfn);
+ for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) {
+ struct page *page = pfn_to_page(pfn);
- init_reserved_page(start_pfn, nid);
+ __init_deferred_page(pfn, nid);
- /*
- * no need for atomic set_bit because the struct
- * page is not visible yet so nobody should
- * access it yet.
- */
- __SetPageReserved(page);
- }
+ /*
+ * no need for atomic set_bit because the struct
+ * page is not visible yet so nobody should
+ * access it yet.
+ */
+ __SetPageReserved(page);
}
}
@@ -804,7 +832,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
* - physical memory bank size is not necessarily the exact multiple of the
* arbitrary section size
* - early reserved memory may not be listed in memblock.memory
- * - non-memory regions covered by the contigious flatmem mapping
+ * - non-memory regions covered by the contiguous flatmem mapping
* - memory layouts defined with memmap= kernel parameter may not align
* nicely with memmap sections
*
@@ -824,11 +852,7 @@ static void __init init_unavailable_range(unsigned long spfn,
unsigned long pfn;
u64 pgcnt = 0;
- for (pfn = spfn; pfn < epfn; pfn++) {
- if (!pfn_valid(pageblock_start_pfn(pfn))) {
- pfn = pageblock_end_pfn(pfn) - 1;
- continue;
- }
+ for_each_valid_pfn(pfn, spfn, epfn) {
__init_single_page(pfn_to_page(pfn), pfn, zone, node);
__SetPageReserved(pfn_to_page(pfn));
pgcnt++;
@@ -851,7 +875,8 @@ static void __init init_unavailable_range(unsigned long spfn,
void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, unsigned long zone_end_pfn,
enum meminit_context context,
- struct vmem_altmap *altmap, int migratetype)
+ struct vmem_altmap *altmap, int migratetype,
+ bool isolate_pageblock)
{
unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;
@@ -908,7 +933,8 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
* over the place during system boot.
*/
if (pageblock_aligned(pfn)) {
- set_pageblock_migratetype(page, migratetype);
+ init_pageblock_migratetype(page, migratetype,
+ isolate_pageblock);
cond_resched();
}
pfn++;
@@ -931,7 +957,8 @@ static void __init memmap_init_zone_range(struct zone *zone,
return;
memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
- zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+ zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE,
+ false);
if (*hole_pfn < start_pfn)
init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
@@ -960,19 +987,19 @@ static void __init memmap_init(void)
}
}
-#ifdef CONFIG_SPARSEMEM
/*
* Initialize the memory map for hole in the range [memory_end,
- * section_end].
+ * section_end] for SPARSEMEM and in the range [memory_end, memmap_end]
+ * for FLATMEM.
* Append the pages in this hole to the highest zone in the last
* node.
- * The call to init_unavailable_range() is outside the ifdef to
- * silence the compiler warining about zone_id set but not used;
- * for FLATMEM it is a nop anyway
*/
+#ifdef CONFIG_SPARSEMEM
end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
- if (hole_pfn < end_pfn)
+#else
+ end_pfn = round_up(end_pfn, MAX_ORDER_NR_PAGES);
#endif
+ if (hole_pfn < end_pfn)
init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}
@@ -998,7 +1025,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
* and zone_device_data. It is a bug if a ZONE_DEVICE page is
* ever freed or placed on a driver-private list.
*/
- page->pgmap = pgmap;
+ page_folio(page)->pgmap = pgmap;
page->zone_device_data = NULL;
/*
@@ -1012,17 +1039,30 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
* because this is done early in section_activate()
*/
if (pageblock_aligned(pfn)) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ init_pageblock_migratetype(page, MIGRATE_MOVABLE, false);
cond_resched();
}
/*
- * ZONE_DEVICE pages are released directly to the driver page allocator
- * which will set the page count to 1 when allocating the page.
+ * ZONE_DEVICE pages other than MEMORY_DEVICE_GENERIC are released
+ * directly to the driver page allocator which will set the page count
+ * to 1 when allocating the page.
+ *
+ * MEMORY_DEVICE_GENERIC and MEMORY_DEVICE_FS_DAX pages automatically have
+ * their refcount reset to one whenever they are freed (i.e. after
+ * their refcount drops to 0).
*/
- if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
- pgmap->type == MEMORY_DEVICE_COHERENT)
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_FS_DAX:
+ case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_COHERENT:
+ case MEMORY_DEVICE_PCI_P2PDMA:
set_page_count(page, 0);
+ break;
+
+ case MEMORY_DEVICE_GENERIC:
+ break;
+ }
}
/*
@@ -1431,7 +1471,7 @@ void __meminit init_currently_empty_zone(struct zone *zone,
#ifndef CONFIG_SPARSEMEM
/*
- * Calculate the size of the zone->blockflags rounded to an unsigned long
+ * Calculate the size of the zone->pageblock_flags rounded to an unsigned long
* Start by making sure zonesize is a multiple of pageblock_order by rounding
* up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
* round what is now in bits to nearest long in bits, then return it in
@@ -1442,10 +1482,10 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l
unsigned long usemapsize;
zonesize += zone_start_pfn & (pageblock_nr_pages-1);
- usemapsize = roundup(zonesize, pageblock_nr_pages);
+ usemapsize = round_up(zonesize, pageblock_nr_pages);
usemapsize = usemapsize >> pageblock_order;
usemapsize *= NR_PAGEBLOCK_BITS;
- usemapsize = roundup(usemapsize, BITS_PER_LONG);
+ usemapsize = round_up(usemapsize, BITS_PER_LONG);
return usemapsize / BITS_PER_BYTE;
}
@@ -1473,7 +1513,7 @@ static inline void setup_usemap(struct zone *zone) {}
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
void __init set_pageblock_order(void)
{
- unsigned int order = MAX_PAGE_ORDER;
+ unsigned int order = PAGE_BLOCK_MAX_ORDER;
/* Check that pageblock_nr_pages has not already been setup */
if (pageblock_order)
@@ -1585,13 +1625,17 @@ void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
{
void *ptr;
+ /*
+ * Kmemleak will explicitly scan mem_map by traversing every valid
+ * `struct page`, so memblock does not need to be added to the scan list.
+ */
if (exact_nid)
ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
- MEMBLOCK_ALLOC_ACCESSIBLE,
+ MEMBLOCK_ALLOC_NOLEAKTRACE,
nid);
else
ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
- MEMBLOCK_ALLOC_ACCESSIBLE,
+ MEMBLOCK_ALLOC_NOLEAKTRACE,
nid);
if (ptr && size > 0)
@@ -1613,7 +1657,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
offset = pgdat->node_start_pfn - start;
/*
- * The zone's endpoints aren't required to be MAX_PAGE_ORDER
+ * The zone's endpoints aren't required to be MAX_PAGE_ORDER
* aligned but the node_mem_map endpoints must be in order
* for the buddy allocator to function correctly.
*/
@@ -1629,14 +1673,15 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
__func__, pgdat->node_id, (unsigned long)pgdat,
(unsigned long)pgdat->node_mem_map);
-#ifndef CONFIG_NUMA
+
/* the global mem_map is just set as node 0's */
- if (pgdat == NODE_DATA(0)) {
- mem_map = NODE_DATA(0)->node_mem_map;
- if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
- mem_map -= offset;
- }
-#endif
+ WARN_ON(pgdat != NODE_DATA(0));
+
+ mem_map = pgdat->node_mem_map;
+ if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
+ mem_map -= offset;
+
+ max_mapnr = end - start;
}
#else
static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
@@ -1743,6 +1788,27 @@ static bool arch_has_descending_max_zone_pfns(void)
return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
}
+static void __init set_high_memory(void)
+{
+ phys_addr_t highmem = memblock_end_of_DRAM();
+
+ /*
+ * Some architectures (e.g. ARM) set high_memory very early and
+ * use it in arch setup code.
+ * If an architecture already set high_memory don't overwrite it
+ */
+ if (high_memory)
+ return;
+
+#ifdef CONFIG_HIGHMEM
+ if (arch_has_descending_max_zone_pfns() ||
+ highmem > PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]))
+ highmem = PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]);
+#endif
+
+ high_memory = phys_to_virt(highmem - 1) + 1;
+}
+
/**
* free_area_init - Initialise all pg_data_t and zone data
* @max_zone_pfn: an array of max PFNs for each zone
@@ -1844,7 +1910,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
free_area_init_node(nid);
/*
- * No sysfs hierarcy will be created via register_one_node()
+ * No sysfs hierarchy will be created via register_one_node()
*for memory-less node because here it's not marked as N_MEMORY
*and won't be set online later. The benefit is userspace
*program won't be confused by sysfs files/directories of
@@ -1857,11 +1923,16 @@ void __init free_area_init(unsigned long *max_zone_pfn)
}
}
+ for_each_node_state(nid, N_MEMORY)
+ sparse_vmemmap_init_nid_late(nid);
+
calc_nr_kernel_pages();
memmap_init();
/* disable hash distribution for systems with a single node */
fixup_hashdist();
+
+ set_high_memory();
}
/**
@@ -1929,7 +2000,8 @@ static void __init deferred_free_pages(unsigned long pfn,
/* Free a large naturally-aligned chunk if possible */
if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
for (i = 0; i < nr_pages; i += pageblock_nr_pages)
- set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
+ init_pageblock_migratetype(page + i, MIGRATE_MOVABLE,
+ false);
__free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY);
return;
}
@@ -1939,7 +2011,8 @@ static void __init deferred_free_pages(unsigned long pfn,
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if (pageblock_aligned(pfn))
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ init_pageblock_migratetype(page, MIGRATE_MOVABLE,
+ false);
__free_pages_core(page, 0, MEMINIT_EARLY);
}
}
@@ -2238,7 +2311,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
set_page_count(p, 0);
} while (++p, --i);
- set_pageblock_migratetype(page, MIGRATE_CMA);
+ init_pageblock_migratetype(page, MIGRATE_CMA, false);
set_page_refcounted(page);
/* pages were reserved and not allocated */
clear_page_tag_ref(page);
@@ -2247,6 +2320,15 @@ void __init init_cma_reserved_pageblock(struct page *page)
adjust_managed_page_count(page, pageblock_nr_pages);
page_zone(page)->cma_pages += pageblock_nr_pages;
}
+/*
+ * Similar to above, but only set the migrate type and stats.
+ */
+void __init init_cma_pageblock(struct page *page)
+{
+ init_pageblock_migratetype(page, MIGRATE_CMA, false);
+ adjust_managed_page_count(page, pageblock_nr_pages);
+ page_zone(page)->cma_pages += pageblock_nr_pages;
+}
#endif
void set_zone_contiguous(struct zone *zone)
@@ -2271,6 +2353,31 @@ void set_zone_contiguous(struct zone *zone)
zone->contiguous = true;
}
+/*
+ * Check if a PFN range intersects multiple zones on one or more
+ * NUMA nodes. Specify the @nid argument if it is known that this
+ * PFN range is on one node, NUMA_NO_NODE otherwise.
+ */
+bool pfn_range_intersects_zones(int nid, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ struct zone *zone, *izone = NULL;
+
+ for_each_zone(zone) {
+ if (nid != NUMA_NO_NODE && zone_to_nid(zone) != nid)
+ continue;
+
+ if (zone_intersects(zone, start_pfn, nr_pages)) {
+ if (izone != NULL)
+ return true;
+ izone = zone;
+ }
+
+ }
+
+ return false;
+}
+
static void __init mem_init_print_info(void);
void __init page_alloc_init_late(void)
{
@@ -2430,7 +2537,7 @@ void *__init alloc_large_system_hash(const char *tablename,
panic("Failed to allocate %s hash table\n", tablename);
pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
- tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
+ tablename, 1UL << log2qty, get_order(size), size,
virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
if (_hash_shift)
@@ -2565,12 +2672,6 @@ static void __init report_meminit(void)
stack = "all(pattern)";
else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
stack = "all(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
- stack = "byref_all(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
- stack = "byref(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
- stack = "__user(zero)";
else
stack = "off";
@@ -2632,11 +2733,22 @@ static void __init mem_init_print_info(void)
);
}
+void __init __weak arch_mm_preinit(void)
+{
+}
+
+void __init __weak mem_init(void)
+{
+}
+
/*
* Set up kernel memory allocators
*/
void __init mm_core_init(void)
{
+ arch_mm_preinit();
+ hugetlb_bootmem_alloc();
+
/* Initializations relying on SMP setup */
BUILD_BUG_ON(MAX_ZONELISTS > 2);
build_all_zonelists(NULL);
@@ -2652,6 +2764,14 @@ void __init mm_core_init(void)
report_meminit();
kmsan_init_shadow();
stack_depot_early_init();
+
+ /*
+ * KHO memory setup must happen while memblock is still active, but
+ * as close as possible to buddy initialization
+ */
+ kho_memory_init();
+
+ memblock_free_all();
mem_init();
kmem_cache_init();
/*
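/*
 * Illustrative sketch, not part of the patch above: how a caller that
 * reserves a physically contiguous range (e.g. a CMA-style user) might
 * consult pfn_range_intersects_zones().  "my_range_usable" is hypothetical,
 * and the helper is assumed to be declared in mm/internal.h; pass
 * NUMA_NO_NODE when the node is not known up front.
 */
#include <linux/mmzone.h>
#include "internal.h"

static bool my_range_usable(int nid, unsigned long start_pfn,
			    unsigned long nr_pages)
{
	/*
	 * Pageblocks are managed per zone, so a range that will later be
	 * handed back to the buddy allocator must not straddle zones.
	 */
	if (pfn_range_intersects_zones(nid, start_pfn, nr_pages))
		return false;
	return true;
}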
diff --git a/mm/mmap.c b/mm/mmap.c
index aec208f90337..7306253cc3b5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -80,7 +80,7 @@ core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
/* Update vma->vm_page_prot to reflect vma->vm_flags. */
void vma_set_page_prot(struct vm_area_struct *vma)
{
- unsigned long vm_flags = vma->vm_flags;
+ vm_flags_t vm_flags = vma->vm_flags;
pgprot_t vm_page_prot;
vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
@@ -111,8 +111,7 @@ static int check_brk_limits(unsigned long addr, unsigned long len)
return mlock_future_ok(current->mm, current->mm->def_flags, len)
? 0 : -EAGAIN;
}
-static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma,
- unsigned long addr, unsigned long request, unsigned long flags);
+
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long newbrk, oldbrk, origbrk;
@@ -128,18 +127,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
origbrk = mm->brk;
+ min_brk = mm->start_brk;
#ifdef CONFIG_COMPAT_BRK
/*
* CONFIG_COMPAT_BRK can still be overridden by setting
* randomize_va_space to 2, which will still cause mm->start_brk
* to be arbitrarily shifted
*/
- if (current->brk_randomized)
- min_brk = mm->start_brk;
- else
+ if (!current->brk_randomized)
min_brk = mm->end_data;
-#else
- min_brk = mm->start_brk;
#endif
if (brk < min_brk)
goto out;
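For reference, the break being bounded by min_brk here is the one userspace moves through brk(2)/sbrk(3); a minimal runnable demo (glibc on Linux):

/* brk_demo.c — gcc brk_demo.c -o brk_demo */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	void *start = sbrk(0);		/* current program break */

	if (sbrk(4096) == (void *)-1) {	/* ask the kernel to move it up */
		perror("sbrk");
		return 1;
	}
	printf("break moved from %p to %p\n", start, sbrk(0));
	return 0;
}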
@@ -229,12 +225,12 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
-bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
+bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags,
unsigned long bytes)
{
unsigned long locked_pages, limit_pages;
- if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
+ if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
return true;
locked_pages = bytes >> PAGE_SHIFT;
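The limit consulted by mlock_future_ok() is the same RLIMIT_MEMLOCK a process hits through mlock(2); a small runnable probe:

/* mlock_demo.c — gcc mlock_demo.c -o mlock_demo */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;
	size_t len = 4096;
	void *p = malloc(len);

	getrlimit(RLIMIT_MEMLOCK, &rl);
	printf("RLIMIT_MEMLOCK soft limit: %llu bytes\n",
	       (unsigned long long)rl.rlim_cur);

	/* Fails (ENOMEM/EPERM) once locked memory would exceed the limit. */
	if (mlock(p, len) != 0)
		perror("mlock");
	else
		puts("page locked");

	munlock(p, len);
	free(p);
	return 0;
}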
@@ -278,8 +274,62 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode,
return true;
}
-/*
+/**
+ * do_mmap() - Perform a userland memory mapping into the current process
+ * address space of length @len with protection bits @prot, mmap flags @flags
+ * (from which VMA flags will be inferred), and any additional VMA flags to
+ * apply @vm_flags. If this is a file-backed mapping then the file is specified
+ * in @file and page offset into the file via @pgoff.
+ *
+ * This function does not perform security checks on the file and assumes, if
+ * @uf is non-NULL, the caller has provided a list head to track unmap events
+ * for userfaultfd @uf.
+ *
+ * It also simply indicates whether memory population is required by setting
+ * @populate, which must be non-NULL, expecting the caller to actually perform
+ * this task itself if appropriate.
+ *
+ * This function will invoke architecture-specific (and if provided and
+ * relevant, file system-specific) logic to determine the most appropriate
+ * unmapped area in which to place the mapping if not MAP_FIXED.
+ *
+ * Callers which require userland mmap() behaviour should invoke vm_mmap(),
+ * which is also exported for module use.
+ *
+ * Those which require this behaviour, minus the security checks, userfaultfd
+ * and populate behaviour, and which handle the mmap write lock themselves,
+ * should call this function.
+ *
+ * Note that the returned address may reside within a merged VMA if an
+ * appropriate merge were to take place, so it doesn't necessarily specify the
+ * start of a VMA, rather only the start of a valid mapped range of length
+ * @len bytes, rounded down to the nearest page size.
+ *
* The caller must write-lock current->mm->mmap_lock.
+ *
+ * @file: An optional struct file pointer describing the file which is to be
+ * mapped, if a file-backed mapping.
+ * @addr: If non-zero, hints at (or if @flags has MAP_FIXED set, specifies) the
+ * address at which to perform this mapping. See mmap (2) for details. Must be
+ * page-aligned.
+ * @len: The length of the mapping. Will be page-aligned and must be at least 1
+ * page in size.
+ * @prot: Protection bits describing access required to the mapping. See mmap
+ * (2) for details.
+ * @flags: Flags specifying how the mapping should be performed, see mmap (2)
+ * for details.
+ * @vm_flags: VMA flags which should be set by default, or 0 otherwise.
+ * @pgoff: Page offset into the @file if file-backed, should be 0 otherwise.
+ * @populate: A pointer to a value which will be set to 0 if no population of
+ * the range is required, or the number of bytes to populate if it is. Must be
+ * non-NULL. See mmap (2) for details as to under what circumstances population
+ * of the range occurs.
+ * @uf: An optional pointer to a list head to track userfaultfd unmap events
+ * should unmapping events arise. If provided, it is up to the caller to manage
+ * this.
+ *
+ * Returns: Either an error, or the address at which the requested mapping has
+ * been performed.
*/
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
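The kerneldoc above ultimately describes what a plain mmap(2) call goes through; a runnable userspace counterpart of the non-MAP_FIXED path, where the kernel picks the unmapped area:

/* mmap_demo.c — gcc mmap_demo.c -o mmap_demo */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	strcpy(p, "hello");
	printf("%p: %s\n", (void *)p, p);
	munmap(p, len);
	return 0;
}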
@@ -292,6 +342,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
*populate = 0;
+ mmap_assert_write_locked(mm);
+
if (!len)
return -EINVAL;
@@ -369,8 +421,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (file) {
struct inode *inode = file_inode(file);
- unsigned int seals = memfd_file_seals(file);
unsigned long flags_mask;
+ int err;
if (!file_mmap_ok(file, inode, pgoff, len))
return -EOVERFLOW;
@@ -410,8 +462,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
- else if (is_readonly_sealed(seals, vm_flags))
- vm_flags &= ~VM_MAYWRITE;
fallthrough;
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
@@ -422,7 +472,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags &= ~VM_MAYEXEC;
}
- if (!file->f_op->mmap)
+ if (!can_mmap_file(file))
return -ENODEV;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
@@ -431,6 +481,14 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
default:
return -EINVAL;
}
+
+ /*
+ * Check to see if we are violating any seals and update VMA
+ * flags if necessary to avoid future seal violations.
+ */
+ err = memfd_check_seals_mmap(file, &vm_flags);
+ if (err)
+ return (unsigned long)err;
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
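The seal handling consolidated into memfd_check_seals_mmap() corresponds to userspace-visible memfd sealing semantics: a write-sealed memfd refuses writable shared mappings while read-only ones still succeed. A runnable demo (Linux, glibc >= 2.27 for memfd_create()):

/* seal_demo.c — gcc seal_demo.c -o seal_demo */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = memfd_create("sealed", MFD_ALLOW_SEALING);

	ftruncate(fd, 4096);
	fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE);

	/* A writable shared mapping of a write-sealed memfd is refused... */
	void *w = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	printf("PROT_WRITE|MAP_SHARED: %s\n", w == MAP_FAILED ? "rejected" : "mapped");

	/* ...while a read-only shared mapping is still allowed. */
	void *r = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	printf("PROT_READ|MAP_SHARED:  %s\n", r == MAP_FAILED ? "rejected" : "mapped");

	close(fd);
	return 0;
}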
@@ -581,115 +639,6 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
-/**
- * unmapped_area() - Find an area between the low_limit and the high_limit with
- * the correct alignment and offset, all from @info. Note: current->mm is used
- * for the search.
- *
- * @info: The unmapped area information including the range [low_limit -
- * high_limit), the alignment offset and mask.
- *
- * Return: A memory address or -ENOMEM.
- */
-static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
-{
- unsigned long length, gap;
- unsigned long low_limit, high_limit;
- struct vm_area_struct *tmp;
- VMA_ITERATOR(vmi, current->mm, 0);
-
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask + info->start_gap;
- if (length < info->length)
- return -ENOMEM;
-
- low_limit = info->low_limit;
- if (low_limit < mmap_min_addr)
- low_limit = mmap_min_addr;
- high_limit = info->high_limit;
-retry:
- if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
- return -ENOMEM;
-
- /*
- * Adjust for the gap first so it doesn't interfere with the
- * later alignment. The first step is the minimum needed to
- * fulill the start gap, the next steps is the minimum to align
- * that. It is the minimum needed to fulill both.
- */
- gap = vma_iter_addr(&vmi) + info->start_gap;
- gap += (info->align_offset - gap) & info->align_mask;
- tmp = vma_next(&vmi);
- if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
- if (vm_start_gap(tmp) < gap + length - 1) {
- low_limit = tmp->vm_end;
- vma_iter_reset(&vmi);
- goto retry;
- }
- } else {
- tmp = vma_prev(&vmi);
- if (tmp && vm_end_gap(tmp) > gap) {
- low_limit = vm_end_gap(tmp);
- vma_iter_reset(&vmi);
- goto retry;
- }
- }
-
- return gap;
-}
-
-/**
- * unmapped_area_topdown() - Find an area between the low_limit and the
- * high_limit with the correct alignment and offset at the highest available
- * address, all from @info. Note: current->mm is used for the search.
- *
- * @info: The unmapped area information including the range [low_limit -
- * high_limit), the alignment offset and mask.
- *
- * Return: A memory address or -ENOMEM.
- */
-static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
-{
- unsigned long length, gap, gap_end;
- unsigned long low_limit, high_limit;
- struct vm_area_struct *tmp;
- VMA_ITERATOR(vmi, current->mm, 0);
-
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask + info->start_gap;
- if (length < info->length)
- return -ENOMEM;
-
- low_limit = info->low_limit;
- if (low_limit < mmap_min_addr)
- low_limit = mmap_min_addr;
- high_limit = info->high_limit;
-retry:
- if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
- return -ENOMEM;
-
- gap = vma_iter_end(&vmi) - info->length;
- gap -= (gap - info->align_offset) & info->align_mask;
- gap_end = vma_iter_end(&vmi);
- tmp = vma_next(&vmi);
- if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
- if (vm_start_gap(tmp) < gap_end) {
- high_limit = vm_start_gap(tmp);
- vma_iter_reset(&vmi);
- goto retry;
- }
- } else {
- tmp = vma_prev(&vmi);
- if (tmp && vm_end_gap(tmp) > gap) {
- high_limit = tmp->vm_start;
- vma_iter_reset(&vmi);
- goto retry;
- }
- }
-
- return gap;
-}
-
/*
* Determine if the allocation needs to ensure that there is no
* existing mapping within it's guard gaps, for use as start_gap.
@@ -919,9 +868,8 @@ mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
- if (test_bit(MMF_TOPDOWN, &mm->flags))
- return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags, 0);
- return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0);
+ return mm_get_unmapped_area_vmflags(mm, file, addr, len,
+ pgoff, flags, 0);
}
EXPORT_SYMBOL(mm_get_unmapped_area);
@@ -989,211 +937,6 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
return vma;
}
-/*
- * Verify that the stack growth is acceptable and
- * update accounting. This is shared with both the
- * grow-up and grow-down cases.
- */
-static int acct_stack_growth(struct vm_area_struct *vma,
- unsigned long size, unsigned long grow)
-{
- struct mm_struct *mm = vma->vm_mm;
- unsigned long new_start;
-
- /* address space limit tests */
- if (!may_expand_vm(mm, vma->vm_flags, grow))
- return -ENOMEM;
-
- /* Stack limit test */
- if (size > rlimit(RLIMIT_STACK))
- return -ENOMEM;
-
- /* mlock limit tests */
- if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
- return -ENOMEM;
-
- /* Check to ensure the stack will not grow into a hugetlb-only region */
- new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
- vma->vm_end - size;
- if (is_hugepage_only_range(vma->vm_mm, new_start, size))
- return -EFAULT;
-
- /*
- * Overcommit.. This must be the final test, as it will
- * update security statistics.
- */
- if (security_vm_enough_memory_mm(mm, grow))
- return -ENOMEM;
-
- return 0;
-}
-
-#if defined(CONFIG_STACK_GROWSUP)
-/*
- * PA-RISC uses this for its stack.
- * vma is the last one with address > vma->vm_end. Have to extend vma.
- */
-static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
-{
- struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *next;
- unsigned long gap_addr;
- int error = 0;
- VMA_ITERATOR(vmi, mm, vma->vm_start);
-
- if (!(vma->vm_flags & VM_GROWSUP))
- return -EFAULT;
-
- mmap_assert_write_locked(mm);
-
- /* Guard against exceeding limits of the address space. */
- address &= PAGE_MASK;
- if (address >= (TASK_SIZE & PAGE_MASK))
- return -ENOMEM;
- address += PAGE_SIZE;
-
- /* Enforce stack_guard_gap */
- gap_addr = address + stack_guard_gap;
-
- /* Guard against overflow */
- if (gap_addr < address || gap_addr > TASK_SIZE)
- gap_addr = TASK_SIZE;
-
- next = find_vma_intersection(mm, vma->vm_end, gap_addr);
- if (next && vma_is_accessible(next)) {
- if (!(next->vm_flags & VM_GROWSUP))
- return -ENOMEM;
- /* Check that both stack segments have the same anon_vma? */
- }
-
- if (next)
- vma_iter_prev_range_limit(&vmi, address);
-
- vma_iter_config(&vmi, vma->vm_start, address);
- if (vma_iter_prealloc(&vmi, vma))
- return -ENOMEM;
-
- /* We must make sure the anon_vma is allocated. */
- if (unlikely(anon_vma_prepare(vma))) {
- vma_iter_free(&vmi);
- return -ENOMEM;
- }
-
- /* Lock the VMA before expanding to prevent concurrent page faults */
- vma_start_write(vma);
- /* We update the anon VMA tree. */
- anon_vma_lock_write(vma->anon_vma);
-
- /* Somebody else might have raced and expanded it already */
- if (address > vma->vm_end) {
- unsigned long size, grow;
-
- size = address - vma->vm_start;
- grow = (address - vma->vm_end) >> PAGE_SHIFT;
-
- error = -ENOMEM;
- if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
- error = acct_stack_growth(vma, size, grow);
- if (!error) {
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
- vm_stat_account(mm, vma->vm_flags, grow);
- anon_vma_interval_tree_pre_update_vma(vma);
- vma->vm_end = address;
- /* Overwrite old entry in mtree. */
- vma_iter_store(&vmi, vma);
- anon_vma_interval_tree_post_update_vma(vma);
-
- perf_event_mmap(vma);
- }
- }
- }
- anon_vma_unlock_write(vma->anon_vma);
- vma_iter_free(&vmi);
- validate_mm(mm);
- return error;
-}
-#endif /* CONFIG_STACK_GROWSUP */
-
-/*
- * vma is the first one with address < vma->vm_start. Have to extend vma.
- * mmap_lock held for writing.
- */
-int expand_downwards(struct vm_area_struct *vma, unsigned long address)
-{
- struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *prev;
- int error = 0;
- VMA_ITERATOR(vmi, mm, vma->vm_start);
-
- if (!(vma->vm_flags & VM_GROWSDOWN))
- return -EFAULT;
-
- mmap_assert_write_locked(mm);
-
- address &= PAGE_MASK;
- if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
- return -EPERM;
-
- /* Enforce stack_guard_gap */
- prev = vma_prev(&vmi);
- /* Check that both stack segments have the same anon_vma? */
- if (prev) {
- if (!(prev->vm_flags & VM_GROWSDOWN) &&
- vma_is_accessible(prev) &&
- (address - prev->vm_end < stack_guard_gap))
- return -ENOMEM;
- }
-
- if (prev)
- vma_iter_next_range_limit(&vmi, vma->vm_start);
-
- vma_iter_config(&vmi, address, vma->vm_end);
- if (vma_iter_prealloc(&vmi, vma))
- return -ENOMEM;
-
- /* We must make sure the anon_vma is allocated. */
- if (unlikely(anon_vma_prepare(vma))) {
- vma_iter_free(&vmi);
- return -ENOMEM;
- }
-
- /* Lock the VMA before expanding to prevent concurrent page faults */
- vma_start_write(vma);
- /* We update the anon VMA tree. */
- anon_vma_lock_write(vma->anon_vma);
-
- /* Somebody else might have raced and expanded it already */
- if (address < vma->vm_start) {
- unsigned long size, grow;
-
- size = vma->vm_end - address;
- grow = (vma->vm_start - address) >> PAGE_SHIFT;
-
- error = -ENOMEM;
- if (grow <= vma->vm_pgoff) {
- error = acct_stack_growth(vma, size, grow);
- if (!error) {
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
- vm_stat_account(mm, vma->vm_flags, grow);
- anon_vma_interval_tree_pre_update_vma(vma);
- vma->vm_start = address;
- vma->vm_pgoff -= grow;
- /* Overwrite old entry in mtree. */
- vma_iter_store(&vmi, vma);
- anon_vma_interval_tree_post_update_vma(vma);
-
- perf_event_mmap(vma);
- }
- }
- }
- anon_vma_unlock_write(vma->anon_vma);
- vma_iter_free(&vmi);
- validate_mm(mm);
- return error;
-}
-
/* enforced gap between the expanding stack and other mappings. */
unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
@@ -1325,58 +1068,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
return do_vmi_munmap(&vmi, mm, start, len, uf, false);
}
-unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
- struct list_head *uf)
-{
- unsigned long ret;
- bool writable_file_mapping = false;
-
- /* Check to see if MDWE is applicable. */
- if (map_deny_write_exec(vm_flags, vm_flags))
- return -EACCES;
-
- /* Allow architectures to sanity-check the vm_flags. */
- if (!arch_validate_flags(vm_flags))
- return -EINVAL;
-
- /* Map writable and ensure this isn't a sealed memfd. */
- if (file && is_shared_maywrite(vm_flags)) {
- int error = mapping_map_writable(file->f_mapping);
-
- if (error)
- return error;
- writable_file_mapping = true;
- }
-
- ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);
-
- /* Clear our write mapping regardless of error. */
- if (writable_file_mapping)
- mapping_unmap_writable(file->f_mapping);
-
- validate_mm(current->mm);
- return ret;
-}
-
-static int __vm_munmap(unsigned long start, size_t len, bool unlock)
-{
- int ret;
- struct mm_struct *mm = current->mm;
- LIST_HEAD(uf);
- VMA_ITERATOR(vmi, mm, start);
-
- if (mmap_write_lock_killable(mm))
- return -EINTR;
-
- ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock);
- if (ret || !unlock)
- mmap_write_unlock(mm);
-
- userfaultfd_unmap_complete(mm, &uf);
- return ret;
-}
-
int vm_munmap(unsigned long start, size_t len)
{
return __vm_munmap(start, len, false);
@@ -1512,89 +1203,7 @@ out:
return ret;
}
-/*
- * do_brk_flags() - Increase the brk vma if the flags match.
- * @vmi: The vma iterator
- * @addr: The start address
- * @len: The length of the increase
- * @vma: The vma,
- * @flags: The VMA Flags
- *
- * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
- * do not match then create a new anonymous VMA. Eventually we may be able to
- * do some brk-specific accounting here.
- */
-static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long addr, unsigned long len, unsigned long flags)
-{
- struct mm_struct *mm = current->mm;
-
- /*
- * Check against address space limits by the changed size
- * Note: This happens *after* clearing old mappings in some code paths.
- */
- flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
- if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
- return -ENOMEM;
-
- if (mm->map_count > sysctl_max_map_count)
- return -ENOMEM;
-
- if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
- return -ENOMEM;
-
- /*
- * Expand the existing vma if possible; Note that singular lists do not
- * occur after forking, so the expand will only happen on new VMAs.
- */
- if (vma && vma->vm_end == addr) {
- VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));
-
- vmg.prev = vma;
- /* vmi is positioned at prev, which this mode expects. */
- vmg.merge_flags = VMG_FLAG_JUST_EXPAND;
-
- if (vma_merge_new_range(&vmg))
- goto out;
- else if (vmg_nomem(&vmg))
- goto unacct_fail;
- }
-
- if (vma)
- vma_iter_next_range(vmi);
- /* create a vma struct for an anonymous mapping */
- vma = vm_area_alloc(mm);
- if (!vma)
- goto unacct_fail;
-
- vma_set_anonymous(vma);
- vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
- vm_flags_init(vma, flags);
- vma->vm_page_prot = vm_get_page_prot(flags);
- vma_start_write(vma);
- if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
- goto mas_store_fail;
-
- mm->map_count++;
- validate_mm(mm);
- ksm_add_vma(vma);
-out:
- perf_event_mmap(vma);
- mm->total_vm += len >> PAGE_SHIFT;
- mm->data_vm += len >> PAGE_SHIFT;
- if (flags & VM_LOCKED)
- mm->locked_vm += (len >> PAGE_SHIFT);
- vm_flags_set(vma, VM_SOFTDIRTY);
- return 0;
-
-mas_store_fail:
- vm_area_free(vma);
-unacct_fail:
- vm_unacct_memory(len >> PAGE_SHIFT);
- return -ENOMEM;
-}
-
-int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
+int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
@@ -1611,7 +1220,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
return 0;
/* Until we need other flags, refuse anything except VM_EXEC. */
- if ((flags & (~VM_EXEC)) != 0)
+ if ((vm_flags & (~VM_EXEC)) != 0)
return -EINVAL;
if (mmap_write_lock_killable(mm))
@@ -1626,7 +1235,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
goto munmap_failed;
vma = vma_prev(&vmi);
- ret = do_brk_flags(&vmi, vma, addr, len, flags);
+ ret = do_brk_flags(&vmi, vma, addr, len, vm_flags);
populate = ((mm->def_flags & VM_LOCKED) != 0);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
@@ -1664,7 +1273,6 @@ void exit_mmap(struct mm_struct *mm)
goto destroy;
}
- lru_add_drain();
flush_cache_mm(mm);
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
@@ -1693,7 +1301,8 @@ void exit_mmap(struct mm_struct *mm)
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
- remove_vma(vma, /* unreachable = */ true);
+ vma_mark_detached(vma);
+ remove_vma(vma);
count++;
cond_resched();
vma = vma_next(&vmi);
@@ -1708,48 +1317,6 @@ destroy:
vm_unacct_memory(nr_accounted);
}
-/* Insert vm structure into process list sorted by address
- * and into the inode's i_mmap tree. If vm_file is non-NULL
- * then i_mmap_rwsem is taken here.
- */
-int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- unsigned long charged = vma_pages(vma);
-
-
- if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
- return -ENOMEM;
-
- if ((vma->vm_flags & VM_ACCOUNT) &&
- security_vm_enough_memory_mm(mm, charged))
- return -ENOMEM;
-
- /*
- * The vm_pgoff of a purely anonymous vma should be irrelevant
- * until its first write fault, when page's anon_vma and index
- * are set. But now set the vm_pgoff it will almost certainly
- * end up with (unless mremap moves it elsewhere before that
- * first wfault), so /proc/pid/maps tells a consistent story.
- *
- * By setting it to reflect the virtual start address of the
- * vma, merges and splits can happen in a seamless way, just
- * using the existing file pgoff checks and manipulations.
- * Similarly in do_mmap and in do_brk_flags.
- */
- if (vma_is_anonymous(vma)) {
- BUG_ON(vma->anon_vma);
- vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
- }
-
- if (vma_link(mm, vma)) {
- if (vma->vm_flags & VM_ACCOUNT)
- vm_unacct_memory(charged);
- return -ENOMEM;
- }
-
- return 0;
-}
-
/*
* Return true if the calling process may expand its vm space by the passed
* number of pages
@@ -1873,7 +1440,7 @@ static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
static struct vm_area_struct *__install_special_mapping(
struct mm_struct *mm,
unsigned long addr, unsigned long len,
- unsigned long vm_flags, void *priv,
+ vm_flags_t vm_flags, void *priv,
const struct vm_operations_struct *ops)
{
int ret;
@@ -1925,14 +1492,65 @@ bool vma_is_special_mapping(const struct vm_area_struct *vma,
struct vm_area_struct *_install_special_mapping(
struct mm_struct *mm,
unsigned long addr, unsigned long len,
- unsigned long vm_flags, const struct vm_special_mapping *spec)
+ vm_flags_t vm_flags, const struct vm_special_mapping *spec)
{
return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
&special_mapping_vmops);
}
+#ifdef CONFIG_SYSCTL
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
+int sysctl_legacy_va_layout;
+#endif
+
+static const struct ctl_table mmap_table[] = {
+ {
+ .procname = "max_map_count",
+ .data = &sysctl_max_map_count,
+ .maxlen = sizeof(sysctl_max_map_count),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
+ {
+ .procname = "legacy_va_layout",
+ .data = &sysctl_legacy_va_layout,
+ .maxlen = sizeof(sysctl_legacy_va_layout),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+ {
+ .procname = "mmap_rnd_bits",
+ .data = &mmap_rnd_bits,
+ .maxlen = sizeof(mmap_rnd_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_bits_min,
+ .extra2 = (void *)&mmap_rnd_bits_max,
+ },
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ {
+ .procname = "mmap_rnd_compat_bits",
+ .data = &mmap_rnd_compat_bits,
+ .maxlen = sizeof(mmap_rnd_compat_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_compat_bits_min,
+ .extra2 = (void *)&mmap_rnd_compat_bits_max,
+ },
+#endif
+};
+#endif /* CONFIG_SYSCTL */
+
/*
- * initialise the percpu counter for VM
+ * initialise the percpu counter for VM, initialise VMA state.
*/
void __init mmap_init(void)
{
@@ -1940,6 +1558,10 @@ void __init mmap_init(void)
ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
+#ifdef CONFIG_SYSCTL
+ register_sysctl_init("vm", mmap_table);
+#endif
+ vma_state_init();
}
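Once registered under the "vm" directory, these knobs appear as /proc/sys/vm/<name>; a trivial runnable reader for the first entry:

/* read_sysctl.c — gcc read_sysctl.c -o read_sysctl */
#include <stdio.h>

int main(void)
{
	unsigned long val;
	FILE *f = fopen("/proc/sys/vm/max_map_count", "r");

	if (!f || fscanf(f, "%lu", &val) != 1) {
		perror("max_map_count");
		return 1;
	}
	printf("vm.max_map_count = %lu\n", val);
	fclose(f);
	return 0;
}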
/*
@@ -2051,84 +1673,217 @@ static int __meminit init_reserve_notifier(void)
subsys_initcall(init_reserve_notifier);
/*
- * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
- * this VMA and its relocated range, which will now reside at [vma->vm_start -
- * shift, vma->vm_end - shift).
+ * Obtain a read lock on mm->mmap_lock, if the specified address is below the
+ * start of the VMA, the intent is to perform a write, and it is a
+ * downward-growing stack, then attempt to expand the stack to contain it.
+ *
+ * This function is intended only for obtaining an argument page from an ELF
+ * image, and is almost certainly NOT what you want to use for any other
+ * purpose.
+ *
+ * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the
+ * VMA referenced must not be linked in any user-visible tree, i.e. it must be a
+ * new VMA being mapped.
+ *
+ * The function assumes that addr is either contained within the VMA or below
+ * it, and makes no attempt to validate this value beyond that.
*
- * This function is almost certainly NOT what you want for anything other than
- * early executable temporary stack relocation.
+ * Returns true if the read lock was obtained and a stack was perhaps expanded,
+ * false if the stack expansion failed.
+ *
+ * On stack expansion the function temporarily acquires an mmap write lock
+ * before downgrading it.
*/
-int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
+bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
+ struct vm_area_struct *new_vma,
+ unsigned long addr, bool write)
{
- /*
- * The process proceeds as follows:
- *
- * 1) Use shift to calculate the new vma endpoints.
- * 2) Extend vma to cover both the old and new ranges. This ensures the
- * arguments passed to subsequent functions are consistent.
- * 3) Move vma's page tables to the new range.
- * 4) Free up any cleared pgd range.
- * 5) Shrink the vma to cover only the new range.
- */
+ if (!write || addr >= new_vma->vm_start) {
+ mmap_read_lock(mm);
+ return true;
+ }
- struct mm_struct *mm = vma->vm_mm;
- unsigned long old_start = vma->vm_start;
- unsigned long old_end = vma->vm_end;
- unsigned long length = old_end - old_start;
- unsigned long new_start = old_start - shift;
- unsigned long new_end = old_end - shift;
- VMA_ITERATOR(vmi, mm, new_start);
- VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
- struct vm_area_struct *next;
- struct mmu_gather tlb;
+ if (!(new_vma->vm_flags & VM_GROWSDOWN))
+ return false;
- BUG_ON(new_start > new_end);
+ mmap_write_lock(mm);
+ if (expand_downwards(new_vma, addr)) {
+ mmap_write_unlock(mm);
+ return false;
+ }
- /*
- * ensure there are no vmas between where we want to go
- * and where we are
- */
- if (vma != vma_next(&vmi))
- return -EFAULT;
+ mmap_write_downgrade(mm);
+ return true;
+}
- vma_iter_prev_range(&vmi);
- /*
- * cover the whole range: [new_start, old_end)
- */
- vmg.vma = vma;
- if (vma_expand(&vmg))
- return -ENOMEM;
+__latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ struct vm_area_struct *mpnt, *tmp;
+ int retval;
+ unsigned long charge = 0;
+ LIST_HEAD(uf);
+ VMA_ITERATOR(vmi, mm, 0);
+ if (mmap_write_lock_killable(oldmm))
+ return -EINTR;
+ flush_cache_dup_mm(oldmm);
+ uprobe_dup_mmap(oldmm, mm);
/*
- * move the page tables downwards, on failure we rely on
- * process cleanup to remove whatever mess we made.
+ * Not linked in yet - no deadlock potential:
*/
- if (length != move_page_tables(vma, old_start,
- vma, new_start, length, false, true))
- return -ENOMEM;
+ mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
+
+ /* No ordering required: file already has been exposed. */
+ dup_mm_exe_file(mm, oldmm);
+
+ mm->total_vm = oldmm->total_vm;
+ mm->data_vm = oldmm->data_vm;
+ mm->exec_vm = oldmm->exec_vm;
+ mm->stack_vm = oldmm->stack_vm;
+
+ /* Use __mt_dup() to efficiently build an identical maple tree. */
+ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
+ if (unlikely(retval))
+ goto out;
+
+ mt_clear_in_rcu(vmi.mas.tree);
+ for_each_vma(vmi, mpnt) {
+ struct file *file;
+
+ vma_start_write(mpnt);
+ if (mpnt->vm_flags & VM_DONTCOPY) {
+ retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
+ mpnt->vm_end, GFP_KERNEL);
+ if (retval)
+ goto loop_out;
+
+ vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
+ continue;
+ }
+ charge = 0;
+ /*
+ * Don't duplicate many vmas if we've been oom-killed (for
+ * example)
+ */
+ if (fatal_signal_pending(current)) {
+ retval = -EINTR;
+ goto loop_out;
+ }
+ if (mpnt->vm_flags & VM_ACCOUNT) {
+ unsigned long len = vma_pages(mpnt);
+
+ if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
+ goto fail_nomem;
+ charge = len;
+ }
+
+ tmp = vm_area_dup(mpnt);
+ if (!tmp)
+ goto fail_nomem;
+ retval = vma_dup_policy(mpnt, tmp);
+ if (retval)
+ goto fail_nomem_policy;
+ tmp->vm_mm = mm;
+ retval = dup_userfaultfd(tmp, &uf);
+ if (retval)
+ goto fail_nomem_anon_vma_fork;
+ if (tmp->vm_flags & VM_WIPEONFORK) {
+ /*
+ * VM_WIPEONFORK gets a clean slate in the child.
+ * Don't prepare anon_vma until fault since we don't
+ * copy page for current vma.
+ */
+ tmp->anon_vma = NULL;
+ } else if (anon_vma_fork(tmp, mpnt))
+ goto fail_nomem_anon_vma_fork;
+ vm_flags_clear(tmp, VM_LOCKED_MASK);
+ /*
+ * Copy/update hugetlb private vma information.
+ */
+ if (is_vm_hugetlb_page(tmp))
+ hugetlb_dup_vma_private(tmp);
- lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
- next = vma_next(&vmi);
- if (new_end > old_start) {
/*
- * when the old and new regions overlap clear from new_end.
+ * Link the vma into the MT. After using __mt_dup(), memory
+ * allocation is not necessary here, so it cannot fail.
*/
- free_pgd_range(&tlb, new_end, old_end, new_end,
- next ? next->vm_start : USER_PGTABLES_CEILING);
+ vma_iter_bulk_store(&vmi, tmp);
+
+ mm->map_count++;
+
+ if (tmp->vm_ops && tmp->vm_ops->open)
+ tmp->vm_ops->open(tmp);
+
+ file = tmp->vm_file;
+ if (file) {
+ struct address_space *mapping = file->f_mapping;
+
+ get_file(file);
+ i_mmap_lock_write(mapping);
+ if (vma_is_shared_maywrite(tmp))
+ mapping_allow_writable(mapping);
+ flush_dcache_mmap_lock(mapping);
+ /* insert tmp into the share list, just after mpnt */
+ vma_interval_tree_insert_after(tmp, mpnt,
+ &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
+ i_mmap_unlock_write(mapping);
+ }
+
+ if (!(tmp->vm_flags & VM_WIPEONFORK))
+ retval = copy_page_range(tmp, mpnt);
+
+ if (retval) {
+ mpnt = vma_next(&vmi);
+ goto loop_out;
+ }
+ }
+ /* a new mm has just been created */
+ retval = arch_dup_mmap(oldmm, mm);
+loop_out:
+ vma_iter_free(&vmi);
+ if (!retval) {
+ mt_set_in_rcu(vmi.mas.tree);
+ ksm_fork(mm, oldmm);
+ khugepaged_fork(mm, oldmm);
} else {
+
/*
- * otherwise, clean from old_start; this is done to not touch
- * the address space in [new_end, old_start) some architectures
- * have constraints on va-space that make this illegal (IA64) -
- * for the others its just a little faster.
+ * The entire maple tree has already been duplicated. If the
+ * mmap duplication fails, mark the failure point with
+ * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
+ * stop releasing VMAs that have not been duplicated after this
+ * point.
*/
- free_pgd_range(&tlb, old_start, old_end, new_end,
- next ? next->vm_start : USER_PGTABLES_CEILING);
+ if (mpnt) {
+ mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
+ mas_store(&vmi.mas, XA_ZERO_ENTRY);
+ /* Avoid OOM iterating a broken tree */
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ }
+ /*
+ * The mm_struct is going to exit, but the locks will be dropped
+ * first. Marking the mm_struct as unstable is advisable as it is
+ * not fully initialised.
+ */
+ set_bit(MMF_UNSTABLE, &mm->flags);
}
- tlb_finish_mmu(&tlb);
+out:
+ mmap_write_unlock(mm);
+ flush_tlb_mm(oldmm);
+ mmap_write_unlock(oldmm);
+ if (!retval)
+ dup_userfaultfd_complete(&uf);
+ else
+ dup_userfaultfd_fail(&uf);
+ return retval;
- vma_prev(&vmi);
- /* Shrink the vma to just the new range */
- return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
+fail_nomem_anon_vma_fork:
+ mpol_put(vma_policy(tmp));
+fail_nomem_policy:
+ vm_area_free(tmp);
+fail_nomem:
+ retval = -ENOMEM;
+ vm_unacct_memory(charge);
+ goto loop_out;
}
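The VM_WIPEONFORK branch in dup_mmap() above is what userspace requests with madvise(MADV_WIPEONFORK); a runnable demonstration (Linux >= 4.14):

/* wipeonfork_demo.c — gcc wipeonfork_demo.c -o wipeonfork_demo */
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	*p = 42;
	madvise(p, 4096, MADV_WIPEONFORK);	/* sets VM_WIPEONFORK on the VMA */

	if (fork() == 0) {
		/* Child gets a clean, zero-filled page instead of a copy. */
		printf("child reads  %d\n", *p);
		_exit(0);
	}
	wait(NULL);
	printf("parent reads %d\n", *p);	/* still 42 */
	return 0;
}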
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index f186d57df2c6..b006cec8e6fe 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -17,51 +17,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
-#ifdef CONFIG_MEMCG
-
-/*
- * Size of the buffer for memcg path names. Ignoring stack trace support,
- * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it.
- */
-#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
-
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
- do { \
- if (trace_mmap_lock_##type##_enabled()) { \
- char buf[MEMCG_PATH_BUF_SIZE]; \
- get_mm_memcg_path(mm, buf, sizeof(buf)); \
- trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \
- } \
- } while (0)
-
-#else /* !CONFIG_MEMCG */
-
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
- trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
-
-#endif /* CONFIG_MEMCG */
-
#ifdef CONFIG_TRACING
-#ifdef CONFIG_MEMCG
-/*
- * Write the given mm_struct's memcg path to a buffer. If the path cannot be
- * determined, empty string is written.
- */
-static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
-{
- struct mem_cgroup *memcg;
-
- buf[0] = '\0';
- memcg = get_mem_cgroup_from_mm(mm);
- if (memcg == NULL)
- return;
- if (memcg->css.cgroup)
- cgroup_path(memcg->css.cgroup, buf, buflen);
- css_put(&memcg->css);
-}
-
-#endif /* CONFIG_MEMCG */
-
/*
* Trace calls must be in a separate file, as otherwise there's a circular
* dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
@@ -69,20 +25,382 @@ static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
- TRACE_MMAP_LOCK_EVENT(start_locking, mm, write);
+ trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
bool success)
{
- TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success);
+ trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
- TRACE_MMAP_LOCK_EVENT(released, mm, write);
+ trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */
+
+#ifdef CONFIG_MMU
+#ifdef CONFIG_PER_VMA_LOCK
+static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
+{
+ unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
+
+ /* Additional refcnt if the vma is attached. */
+ if (!detaching)
+ tgt_refcnt++;
+
+ /*
+ * If vma is detached then only vma_mark_attached() can raise the
+ * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
+ */
+ if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
+ return false;
+
+ rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
+ rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
+ refcount_read(&vma->vm_refcnt) == tgt_refcnt,
+ TASK_UNINTERRUPTIBLE);
+ lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
+
+ return true;
+}
+
+static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
+{
+ *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
+ rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+}
+
+void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
+{
+ bool locked;
+
+ /*
+ * __vma_enter_locked() returns false immediately if the vma is not
+ * attached, otherwise it waits until refcnt is indicating that vma
+ * is attached with no readers.
+ */
+ locked = __vma_enter_locked(vma, false);
+
+ /*
+ * We should use WRITE_ONCE() here because we can have concurrent reads
+ * from the early lockless pessimistic check in vma_start_read().
+ * We don't really care about the correctness of that early check, but
+ * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
+ */
+ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
+
+ if (locked) {
+ bool detached;
+
+ __vma_exit_locked(vma, &detached);
+ WARN_ON_ONCE(detached); /* vma should remain attached */
+ }
+}
+EXPORT_SYMBOL_GPL(__vma_start_write);
+
+void vma_mark_detached(struct vm_area_struct *vma)
+{
+ vma_assert_write_locked(vma);
+ vma_assert_attached(vma);
+
+ /*
+ * We are the only writer, so no need to use vma_refcount_put().
+ * The condition below is unlikely because the vma has been already
+ * write-locked and readers can increment vm_refcnt only temporarily
+ * before they check vm_lock_seq, realize the vma is locked and drop
+ * back the vm_refcnt. That is a narrow window for observing a raised
+ * vm_refcnt.
+ */
+ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
+ /* Wait until vma is detached with no readers. */
+ if (__vma_enter_locked(vma, true)) {
+ bool detached;
+
+ __vma_exit_locked(vma, &detached);
+ WARN_ON_ONCE(!detached);
+ }
+ }
+}
+
+/*
+ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
+ * stable and not isolated. If the VMA is not found or is being modified the
+ * function returns NULL.
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address)
+{
+ MA_STATE(mas, &mm->mm_mt, address, address);
+ struct vm_area_struct *vma;
+
+ rcu_read_lock();
+retry:
+ vma = mas_walk(&mas);
+ if (!vma)
+ goto inval;
+
+ vma = vma_start_read(mm, vma);
+ if (IS_ERR_OR_NULL(vma)) {
+ /* Check if the VMA got isolated after we found it */
+ if (PTR_ERR(vma) == -EAGAIN) {
+ count_vm_vma_lock_event(VMA_LOCK_MISS);
+ /* The area was replaced with another one */
+ goto retry;
+ }
+
+ /* Failed to lock the VMA */
+ goto inval;
+ }
+ /*
+ * At this point, we have a stable reference to a VMA: The VMA is
+ * locked and we know it hasn't already been isolated.
+ * From here on, we can access the VMA without worrying about which
+ * fields are accessible for RCU readers.
+ */
+
+ /* Check if the vma we locked is the right one. */
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ goto inval_end_read;
+
+ rcu_read_unlock();
+ return vma;
+
+inval_end_read:
+ vma_end_read(vma);
+inval:
+ rcu_read_unlock();
+ count_vm_vma_lock_event(VMA_LOCK_ABORT);
+ return NULL;
+}
+
+static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
+ struct vma_iterator *vmi,
+ unsigned long from_addr)
+{
+ struct vm_area_struct *vma;
+ int ret;
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ return ERR_PTR(ret);
+
+ /* Lookup the vma at the last position again under mmap_read_lock */
+ vma_iter_set(vmi, from_addr);
+ vma = vma_next(vmi);
+ if (vma) {
+ /* Very unlikely vma->vm_refcnt overflow case */
+ if (unlikely(!vma_start_read_locked(vma)))
+ vma = ERR_PTR(-EAGAIN);
+ }
+
+ mmap_read_unlock(mm);
+
+ return vma;
+}
+
+struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
+ struct vma_iterator *vmi,
+ unsigned long from_addr)
+{
+ struct vm_area_struct *vma;
+ unsigned int mm_wr_seq;
+ bool mmap_unlocked;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
+retry:
+ /* Start mmap_lock speculation in case we need to verify the vma later */
+ mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
+ vma = vma_next(vmi);
+ if (!vma)
+ return NULL;
+
+ vma = vma_start_read(mm, vma);
+ if (IS_ERR_OR_NULL(vma)) {
+ /*
+ * Retry immediately if the vma gets detached from under us.
+ * An infinite loop should not happen because the vma we find will
+ * have to be constantly knocked out from under us.
+ */
+ if (PTR_ERR(vma) == -EAGAIN) {
+ /* reset to search from the last address */
+ vma_iter_set(vmi, from_addr);
+ goto retry;
+ }
+
+ goto fallback;
+ }
+
+ /* Verify the vma is not behind the last search position. */
+ if (unlikely(from_addr >= vma->vm_end))
+ goto fallback_unlock;
+
+ /*
+ * vma can be ahead of the last search position but we need to verify
+ * it was not shrunk after we found it and another vma has not been
+ * installed ahead of it. Otherwise we might observe a gap that should
+ * not be there.
+ */
+ if (from_addr < vma->vm_start) {
+ /* Verify only if the address space might have changed since vma lookup. */
+ if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
+ vma_iter_set(vmi, from_addr);
+ if (vma != vma_next(vmi))
+ goto fallback_unlock;
+ }
+ }
+
+ return vma;
+
+fallback_unlock:
+ vma_end_read(vma);
+fallback:
+ rcu_read_unlock();
+ vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
+ rcu_read_lock();
+ /* Reinitialize the iterator after re-entering rcu read section */
+ vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);
+
+ return vma;
+}
+#endif /* CONFIG_PER_VMA_LOCK */
+
+#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
+#include <linux/extable.h>
+
+static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+ if (likely(mmap_read_trylock(mm)))
+ return true;
+
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = exception_ip(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+
+ return !mmap_read_lock_killable(mm);
+}
+
+static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+{
+ /*
+ * We don't have this operation yet.
+ *
+ * It should be easy enough to do: it's basically an
+ * atomic_long_try_cmpxchg_acquire()
+ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
+ * it also needs the proper lockdep magic etc.
+ */
+ return false;
+}
+
+static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+ mmap_read_unlock(mm);
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = exception_ip(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+ return !mmap_write_lock_killable(mm);
+}
+
+/*
+ * Helper for page fault handling.
+ *
+ * This is kind of equivalent to "mmap_read_lock()" followed
+ * by "find_extend_vma()", except it's a lot more careful about
+ * the locking (and will drop the lock on failure).
+ *
+ * For example, if we have a kernel bug that causes a page
+ * fault, we don't want to just use mmap_read_lock() to get
+ * the mm lock, because that would deadlock if the bug were
+ * to happen while we're holding the mm lock for writing.
+ *
+ * So this checks the exception tables on kernel faults in
+ * order to only do this all for instructions that are actually
+ * expected to fault.
+ *
+ * We can also actually take the mm lock for writing if we
+ * need to extend the vma, which helps the VM layer a lot.
+ */
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, struct pt_regs *regs)
+{
+ struct vm_area_struct *vma;
+
+ if (!get_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (likely(vma && (vma->vm_start <= addr)))
+ return vma;
+
+ /*
+ * Well, dang. We might still be successful, but only
+ * if we can extend a vma to do so.
+ */
+ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+
+ /*
+ * We can try to upgrade the mmap lock atomically,
+ * in which case we can continue to use the vma
+ * we already looked up.
+ *
+ * Otherwise we'll have to drop the mmap lock and
+ * re-take it, and also look up the vma again,
+ * re-checking it.
+ */
+ if (!mmap_upgrade_trylock(mm)) {
+ if (!upgrade_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (!vma)
+ goto fail;
+ if (vma->vm_start <= addr)
+ goto success;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto fail;
+ }
+
+ if (expand_stack_locked(vma, addr))
+ goto fail;
+
+success:
+ mmap_write_downgrade(mm);
+ return vma;
+
+fail:
+ mmap_write_unlock(mm);
+ return NULL;
+}
+#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */
+
+#else /* CONFIG_MMU */
+
+/*
+ * At least xtensa ends up having protection faults even with no
+ * MMU.. No stack expansion, at least.
+ */
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, struct pt_regs *regs)
+{
+ struct vm_area_struct *vma;
+
+ mmap_read_lock(mm);
+ vma = vma_lookup(mm, addr);
+ if (!vma)
+ mmap_read_unlock(mm);
+ return vma;
+}
+
+#endif /* CONFIG_MMU */
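The VMA_LOCK_OFFSET scheme used by __vma_enter_locked()/vma_start_read() — a writer adds a large bias to vm_refcnt so readers can cheaply detect it and back off — can be sketched with plain C11 atomics. This is a loose userspace analogy with invented names, not the kernel implementation:

/* refbias_demo.c — gcc refbias_demo.c -o refbias_demo */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define WRITER_BIAS (1u << 24)			/* plays the role of VMA_LOCK_OFFSET */

static _Atomic unsigned int refcnt = 1;		/* 1 == attached, no readers */

static bool reader_tryget(void)
{
	unsigned int old = atomic_fetch_add(&refcnt, 1);

	if (old >= WRITER_BIAS) {		/* a writer holds the lock: back off */
		atomic_fetch_sub(&refcnt, 1);
		return false;
	}
	return true;
}

static void writer_lock(void)
{
	atomic_fetch_add(&refcnt, WRITER_BIAS);
	/* The kernel additionally waits until refcnt drops to the bias plus the
	 * "attached" reference, i.e. until all in-flight readers are gone. */
}

int main(void)
{
	printf("reader before writer_lock(): %d\n", reader_tryget());
	writer_lock();
	printf("reader after  writer_lock(): %d\n", reader_tryget());
	return 0;
}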
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 99b3e9408aa0..b49cc6385f1f 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -246,8 +246,16 @@ static void __tlb_remove_table_free(struct mmu_table_batch *batch)
* IRQs delays the completion of the TLB flush we can never observe an already
* freed page.
*
- * Architectures that do not have this (PPC) need to delay the freeing by some
- * other means, this is that means.
+ * Not all systems IPI every CPU for this purpose:
+ *
+ * - Some architectures have HW support for cross-CPU synchronisation of TLB
+ * flushes, so there's no IPI at all.
+ *
+ * - Paravirt guests can do this TLB flushing in the hypervisor, or coordinate
+ * with the hypervisor to defer flushing on preempted vCPUs.
+ *
+ * Such systems need to delay the freeing by some other means; this is that
+ * means.
*
* What we do is batch the freed directory pages (tables) and RCU free them.
* We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
@@ -311,11 +319,34 @@ static inline void tlb_table_invalidate(struct mmu_gather *tlb)
}
}
-static void tlb_remove_table_one(void *table)
+#ifdef CONFIG_PT_RECLAIM
+static inline void __tlb_remove_table_one_rcu(struct rcu_head *head)
+{
+ struct ptdesc *ptdesc;
+
+ ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
+ __tlb_remove_table(ptdesc);
+}
+
+static inline void __tlb_remove_table_one(void *table)
+{
+ struct ptdesc *ptdesc;
+
+ ptdesc = table;
+ call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu);
+}
+#else
+static inline void __tlb_remove_table_one(void *table)
{
tlb_remove_table_sync_one();
__tlb_remove_table(table);
}
+#endif /* CONFIG_PT_RECLAIM */
+
+static void tlb_remove_table_one(void *table)
+{
+ __tlb_remove_table_one(table);
+}
static void tlb_table_flush(struct mmu_gather *tlb)
{
@@ -393,6 +424,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
tlb->page_size = 0;
#endif
+ tlb->vma_pfn = 0;
__tlb_reset_range(tlb);
inc_tlb_flush_pending(tlb->mm);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index fc18fe274505..8e0125dc0522 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -4,7 +4,7 @@
*
* Copyright (C) 2008 Qumranet, Inc.
* Copyright (C) 2008 SGI
- * Christoph Lameter <cl@linux.com>
+ * Christoph Lameter <cl@gentwo.org>
*/
#include <linux/rculist.h>
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 516b1d847e2c..113b48985834 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -40,11 +40,8 @@
#include "internal.h"
-bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte)
+static bool maybe_change_pte_writable(struct vm_area_struct *vma, pte_t pte)
{
- struct page *page;
-
if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
return false;
@@ -60,16 +57,32 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
if (userfaultfd_pte_wp(vma, pte))
return false;
- if (!(vma->vm_flags & VM_SHARED)) {
- /*
- * Writable MAP_PRIVATE mapping: We can only special-case on
- * exclusive anonymous pages, because we know that our
- * write-fault handler similarly would map them writable without
- * any additional checks while holding the PT lock.
- */
- page = vm_normal_page(vma, addr, pte);
- return page && PageAnon(page) && PageAnonExclusive(page);
- }
+ return true;
+}
+
+static bool can_change_private_pte_writable(struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte)
+{
+ struct page *page;
+
+ if (!maybe_change_pte_writable(vma, pte))
+ return false;
+
+ /*
+ * Writable MAP_PRIVATE mapping: We can only special-case on
+ * exclusive anonymous pages, because we know that our
+ * write-fault handler similarly would map them writable without
+ * any additional checks while holding the PT lock.
+ */
+ page = vm_normal_page(vma, addr, pte);
+ return page && PageAnon(page) && PageAnonExclusive(page);
+}
+
+static bool can_change_shared_pte_writable(struct vm_area_struct *vma,
+ pte_t pte)
+{
+ if (!maybe_change_pte_writable(vma, pte))
+ return false;
VM_WARN_ON_ONCE(is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte));
@@ -83,6 +96,179 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
return pte_dirty(pte);
}
+bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
+{
+ if (!(vma->vm_flags & VM_SHARED))
+ return can_change_private_pte_writable(vma, addr, pte);
+
+ return can_change_shared_pte_writable(vma, pte);
+}
+
+static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
+ pte_t pte, int max_nr_ptes, fpb_t flags)
+{
+ /* No underlying folio, so cannot batch */
+ if (!folio)
+ return 1;
+
+ if (!folio_test_large(folio))
+ return 1;
+
+ return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr_ptes, flags);
+}
+
+static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr,
+ pte_t oldpte, pte_t *pte, int target_node,
+ struct folio *folio)
+{
+ bool ret = true;
+ bool toptier;
+ int nid;
+
+ /* Avoid TLB flush if possible */
+ if (pte_protnone(oldpte))
+ goto skip;
+
+ if (!folio)
+ goto skip;
+
+ if (folio_is_zone_device(folio) || folio_test_ksm(folio))
+ goto skip;
+
+ /* Also skip shared copy-on-write pages */
+ if (is_cow_mapping(vma->vm_flags) &&
+ (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio)))
+ goto skip;
+
+ /*
+ * While migration can move some dirty pages,
+ * it cannot move them all from MIGRATE_ASYNC
+ * context.
+ */
+ if (folio_is_file_lru(folio) && folio_test_dirty(folio))
+ goto skip;
+
+ /*
+ * Don't mess with PTEs if page is already on the node
+ * a single-threaded process is running on.
+ */
+ nid = folio_nid(folio);
+ if (target_node == nid)
+ goto skip;
+
+ toptier = node_is_toptier(nid);
+
+ /*
+ * Skip scanning top tier node if normal numa
+ * balancing is disabled
+ */
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier)
+ goto skip;
+
+ ret = false;
+ if (folio_use_access_time(folio))
+ folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
+
+skip:
+ return ret;
+}
+
+/* Set nr_ptes number of ptes, starting from idx */
+static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes,
+ int idx, bool set_write, struct mmu_gather *tlb)
+{
+ /*
+ * Advance the position in the batch by idx; note that if idx > 0,
+ * then the nr_ptes passed here is <= batch size - idx.
+ */
+ addr += idx * PAGE_SIZE;
+ ptep += idx;
+ oldpte = pte_advance_pfn(oldpte, idx);
+ ptent = pte_advance_pfn(ptent, idx);
+
+ if (set_write)
+ ptent = pte_mkwrite(ptent, vma);
+
+ modify_prot_commit_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes);
+ if (pte_needs_flush(oldpte, ptent))
+ tlb_flush_pte_range(tlb, addr, nr_ptes * PAGE_SIZE);
+}
+
+/*
+ * Get max length of consecutive ptes pointing to PageAnonExclusive() pages or
+ * !PageAnonExclusive() pages, starting from start_idx. Caller must enforce
+ * that the ptes point to consecutive pages of the same anon large folio.
+ */
+static int page_anon_exclusive_sub_batch(int start_idx, int max_len,
+ struct page *first_page, bool expected_anon_exclusive)
+{
+ int idx;
+
+ for (idx = start_idx + 1; idx < start_idx + max_len; ++idx) {
+ if (expected_anon_exclusive != PageAnonExclusive(first_page + idx))
+ break;
+ }
+ return idx - start_idx;
+}
+
+/*
+ * This function is a result of trying our very best to retain the
+ * "avoid the write-fault handler" optimization. In can_change_pte_writable(),
+ * if the vma is a private vma, and we cannot determine whether to change
+ * the pte to writable just from the vma and the pte, we then need to look
+ * at the actual page pointed to by the pte. Unfortunately, if we have a
+ * batch of ptes pointing to consecutive pages of the same anon large folio,
+ * the anon-exclusivity (or the negation) of the first page does not guarantee
+ * the anon-exclusivity (or the negation) of the other pages corresponding to
+ * the pte batch; hence in this case it is incorrect to decide to change or
+ * not change the ptes to writable just by using information from the first
+ * pte of the batch. Therefore, we must individually check all pages and
+ * retrieve sub-batches.
+ */
+static void commit_anon_folio_batch(struct vm_area_struct *vma,
+ struct folio *folio, struct page *first_page, unsigned long addr, pte_t *ptep,
+ pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
+{
+ bool expected_anon_exclusive;
+ int sub_batch_idx = 0;
+ int len;
+
+ while (nr_ptes) {
+ expected_anon_exclusive = PageAnonExclusive(first_page + sub_batch_idx);
+ len = page_anon_exclusive_sub_batch(sub_batch_idx, nr_ptes,
+ first_page, expected_anon_exclusive);
+ prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, len,
+ sub_batch_idx, expected_anon_exclusive, tlb);
+ sub_batch_idx += len;
+ nr_ptes -= len;
+ }
+}
+
+static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma,
+ struct folio *folio, struct page *page, unsigned long addr, pte_t *ptep,
+ pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
+{
+ bool set_write;
+
+ if (vma->vm_flags & VM_SHARED) {
+ set_write = can_change_shared_pte_writable(vma, ptent);
+ prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes,
+ /* idx = */ 0, set_write, tlb);
+ return;
+ }
+
+ set_write = maybe_change_pte_writable(vma, ptent) &&
+ (folio && folio_test_anon(folio));
+ if (!set_write) {
+ prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes,
+ /* idx = */ 0, set_write, tlb);
+ return;
+ }
+ commit_anon_folio_batch(vma, folio, page, addr, ptep, oldpte, ptent, nr_ptes, tlb);
+}
+
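page_anon_exclusive_sub_batch() above simply measures the longest run of pages sharing the same PageAnonExclusive() state; the same loop, extracted into a standalone runnable sketch over a plain boolean array (names invented):

/* runs_demo.c — gcc runs_demo.c -o runs_demo */
#include <stdbool.h>
#include <stdio.h>

/* Length of the run starting at start_idx whose entries all equal 'expected'. */
static int sub_batch_len(const bool *flags, int start_idx, int max_len,
			 bool expected)
{
	int idx;

	for (idx = start_idx + 1; idx < start_idx + max_len; ++idx)
		if (flags[idx] != expected)
			break;
	return idx - start_idx;
}

int main(void)
{
	bool anon_exclusive[] = { true, true, false, false, false, true };
	int nr = 6, idx = 0;

	while (nr) {
		int len = sub_batch_len(anon_exclusive, idx, nr, anon_exclusive[idx]);

		printf("run of %d entries, exclusive=%d\n", len, anon_exclusive[idx]);
		idx += len;
		nr -= len;
	}
	return 0;
}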
static long change_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
@@ -94,6 +280,7 @@ static long change_pte_range(struct mmu_gather *tlb,
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ int nr_ptes;
tlb_change_page_size(tlb, PAGE_SIZE);
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -108,65 +295,37 @@ static long change_pte_range(struct mmu_gather *tlb,
flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
do {
+ nr_ptes = 1;
oldpte = ptep_get(pte);
if (pte_present(oldpte)) {
+ const fpb_t flags = FPB_RESPECT_SOFT_DIRTY | FPB_RESPECT_WRITE;
+ int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
+ struct folio *folio = NULL;
+ struct page *page;
pte_t ptent;
+ page = vm_normal_page(vma, addr, oldpte);
+ if (page)
+ folio = page_folio(page);
/*
* Avoid trapping faults against the zero or KSM
* pages. See similar comment in change_huge_pmd.
*/
if (prot_numa) {
- struct folio *folio;
- int nid;
- bool toptier;
-
- /* Avoid TLB flush if possible */
- if (pte_protnone(oldpte))
- continue;
-
- folio = vm_normal_folio(vma, addr, oldpte);
- if (!folio || folio_is_zone_device(folio) ||
- folio_test_ksm(folio))
- continue;
-
- /* Also skip shared copy-on-write pages */
- if (is_cow_mapping(vma->vm_flags) &&
- (folio_maybe_dma_pinned(folio) ||
- folio_likely_mapped_shared(folio)))
- continue;
-
- /*
- * While migration can move some dirty pages,
- * it cannot move them all from MIGRATE_ASYNC
- * context.
- */
- if (folio_is_file_lru(folio) &&
- folio_test_dirty(folio))
- continue;
-
- /*
- * Don't mess with PTEs if page is already on the node
- * a single-threaded process is running on.
- */
- nid = folio_nid(folio);
- if (target_node == nid)
- continue;
- toptier = node_is_toptier(nid);
+ int ret = prot_numa_skip(vma, addr, oldpte, pte,
+ target_node, folio);
+ if (ret) {
- /*
- * Skip scanning top tier node if normal numa
- * balancing is disabled
- */
- if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
- toptier)
+ /* determine batch to skip */
+ nr_ptes = mprotect_folio_pte_batch(folio,
+ pte, oldpte, max_nr_ptes, /* flags = */ 0);
continue;
- if (folio_use_access_time(folio))
- folio_xchg_access_time(folio,
- jiffies_to_msecs(jiffies));
+ }
}
- oldpte = ptep_modify_prot_start(vma, addr, pte);
+ nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags);
+
+ oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes);
ptent = pte_modify(oldpte, newprot);
if (uffd_wp)
@@ -188,14 +347,13 @@ static long change_pte_range(struct mmu_gather *tlb,
* COW or special handling is required.
*/
if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
- !pte_write(ptent) &&
- can_change_pte_writable(vma, addr, ptent))
- ptent = pte_mkwrite(ptent, vma);
-
- ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
- if (pte_needs_flush(oldpte, ptent))
- tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
- pages++;
+ !pte_write(ptent))
+ set_write_prot_commit_flush_ptes(vma, folio, page,
+ addr, pte, oldpte, ptent, nr_ptes, tlb);
+ else
+ prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent,
+ nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb);
+ pages += nr_ptes;
} else if (is_swap_pte(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
pte_t newpte;
@@ -225,14 +383,6 @@ static long change_pte_range(struct mmu_gather *tlb,
newpte = swp_entry_to_pte(entry);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
- } else if (is_writable_device_exclusive_entry(entry)) {
- entry = make_readable_device_exclusive_entry(
- swp_offset(entry));
- newpte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(oldpte))
- newpte = pte_swp_mksoft_dirty(newpte);
- if (pte_swp_uffd_wp(oldpte))
- newpte = pte_swp_mkuffd_wp(newpte);
} else if (is_pte_marker_entry(entry)) {
/*
* Ignore error swap entries unconditionally,
@@ -288,7 +438,7 @@ static long change_pte_range(struct mmu_gather *tlb,
pages++;
}
}
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
@@ -384,10 +534,10 @@ again:
goto next;
_pmd = pmdp_get_lockless(pmd);
- if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) {
+ if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd)) {
if ((next - addr != HPAGE_PMD_SIZE) ||
pgtable_split_needed(vma, cp_flags)) {
- __split_huge_pmd(vma, pmd, addr, false, NULL);
+ __split_huge_pmd(vma, pmd, addr, false);
/*
* For file-backed, the pmd could have been
* cleared; make sure pmd populated if
@@ -604,16 +754,16 @@ static const struct mm_walk_ops prot_none_walk_ops = {
int
mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
struct vm_area_struct *vma, struct vm_area_struct **pprev,
- unsigned long start, unsigned long end, unsigned long newflags)
+ unsigned long start, unsigned long end, vm_flags_t newflags)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long oldflags = vma->vm_flags;
+ vm_flags_t oldflags = READ_ONCE(vma->vm_flags);
long nrpages = (end - start) >> PAGE_SHIFT;
unsigned int mm_cp_flags = 0;
unsigned long charged = 0;
int error;
- if (!can_modify_vma(vma))
+ if (vma_is_sealed(vma))
return -EPERM;
if (newflags == oldflags) {
@@ -627,7 +777,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
* uncommon case, so doesn't need to be very optimized.
*/
if (arch_has_pfn_modify_check() &&
- (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
+ (oldflags & (VM_PFNMAP|VM_MIXEDMAP)) &&
(newflags & VM_ACCESS_FLAGS) == 0) {
pgprot_t new_pgprot = vm_get_page_prot(newflags);
@@ -676,7 +826,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
* held in write mode.
*/
vma_start_write(vma);
- vm_flags_reset(vma, newflags);
+ vm_flags_reset_once(vma, newflags);
if (vma_wants_manual_pte_write_upgrade(vma))
mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
vma_set_page_prot(vma);
@@ -782,8 +932,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
nstart = start;
tmp = vma->vm_start;
for_each_vma_range(vmi, vma, end) {
- unsigned long mask_off_old_flags;
- unsigned long newflags;
+ vm_flags_t mask_off_old_flags;
+ vm_flags_t newflags;
int new_vma_pkey;
if (vma->vm_start != tmp) {
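For illustration, a minimal userspace sketch that drives the batched change_pte_range() path above: an mprotect() write-upgrade over a multi-page anonymous mapping. The program, sizes and output are assumptions for illustration only, not part of the patch; whether the kernel actually batches depends on folio size.

/* Illustrative only: exercises the mprotect() path patched above. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 16 * (size_t)sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 0xaa, len);			/* fault in every page */
	mprotect(p, len, PROT_READ);		/* write-protect the range */
	mprotect(p, len, PROT_READ | PROT_WRITE); /* upgrade; may be handled as per-folio batches in-kernel */
	printf("first byte: 0x%x\n", (unsigned char)p[0]);
	munmap(p, len);
	return 0;
}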
diff --git a/mm/mremap.c b/mm/mremap.c
index cff7f552f909..419a0ea0a870 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -32,6 +32,46 @@
#include "internal.h"
+/* Classify the kind of remap operation being performed. */
+enum mremap_type {
+ MREMAP_INVALID, /* Initial state. */
+ MREMAP_NO_RESIZE, /* old_len == new_len, if not moved, do nothing. */
+ MREMAP_SHRINK, /* old_len > new_len. */
+ MREMAP_EXPAND, /* old_len < new_len. */
+};
+
+/*
+ * Describes a VMA mremap() operation and is threaded throughout it.
+ *
+ * Any of the fields may be mutated by the operation, however these values will
+ * always accurately reflect the remap (for instance, we may adjust lengths and
+ * delta to account for hugetlb alignment).
+ */
+struct vma_remap_struct {
+ /* User-provided state. */
+ unsigned long addr; /* User-specified address from which we remap. */
+ unsigned long old_len; /* Length of range being remapped. */
+ unsigned long new_len; /* Desired new length of mapping. */
+ const unsigned long flags; /* user-specified MREMAP_* flags. */
+ unsigned long new_addr; /* Optionally, desired new address. */
+
+ /* uffd state. */
+ struct vm_userfaultfd_ctx *uf;
+ struct list_head *uf_unmap_early;
+ struct list_head *uf_unmap;
+
+ /* VMA state, determined in do_mremap(). */
+ struct vm_area_struct *vma;
+
+ /* Internal state, determined in do_mremap(). */
+ unsigned long delta; /* Absolute delta of old_len,new_len. */
+ bool populate_expand; /* mlock()'d expanded, must populate. */
+ enum mremap_type remap_type; /* expand, shrink, etc. */
+ bool mmap_locked; /* Is mm currently write-locked? */
+ unsigned long charged; /* If VM_ACCOUNT, # pages to account. */
+ bool vmi_needs_invalidate; /* Is the VMA iterator invalidated? */
+};
+
static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
@@ -69,8 +109,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
return pmd;
}
-static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr)
+static pud_t *alloc_new_pud(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -83,13 +122,12 @@ static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
return pud_alloc(mm, p4d, addr);
}
-static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
{
pud_t *pud;
pmd_t *pmd;
- pud = alloc_new_pud(mm, vma, addr);
+ pud = alloc_new_pud(mm, addr);
if (!pud)
return NULL;
@@ -133,18 +171,42 @@ static pte_t move_soft_dirty_pte(pte_t pte)
return pte;
}
-static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
- unsigned long old_addr, unsigned long old_end,
- struct vm_area_struct *new_vma, pmd_t *new_pmd,
- unsigned long new_addr, bool need_rmap_locks)
+static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t pte, int max_nr)
{
+ struct folio *folio;
+
+ if (max_nr == 1)
+ return 1;
+
+ /* Avoid expensive folio lookup if we stand no chance of benefit. */
+ if (pte_batch_hint(ptep, pte) == 1)
+ return 1;
+
+ folio = vm_normal_folio(vma, addr, pte);
+ if (!folio || !folio_test_large(folio))
+ return 1;
+
+ return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, FPB_RESPECT_WRITE);
+}
+
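As a rough, standalone model of the batching idea above (a toy userspace sketch, not the kernel code; the helper name and pfn array are invented for illustration): count how many leading entries are physically contiguous so a caller can advance by that many at once, much as nr_ptes is advanced in the loop below.

/* Toy model of PTE batching: count leading contiguous entries (illustrative only). */
#include <stdio.h>

static int batch_contiguous(const unsigned long *pfns, int max_nr)
{
	int nr = 1;

	while (nr < max_nr && pfns[nr] == pfns[nr - 1] + 1)
		nr++;
	return nr;
}

int main(void)
{
	unsigned long pfns[] = { 100, 101, 102, 200, 201 };

	/* Prints 3: entries 100..102 can be handled as one batch. */
	printf("%d\n", batch_contiguous(pfns, 5));
	return 0;
}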
+static int move_ptes(struct pagetable_move_control *pmc,
+ unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd)
+{
+ struct vm_area_struct *vma = pmc->old;
bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct mm_struct *mm = vma->vm_mm;
- pte_t *old_pte, *new_pte, pte;
+ pte_t *old_ptep, *new_ptep;
+ pte_t old_pte, pte;
pmd_t dummy_pmdval;
spinlock_t *old_ptl, *new_ptl;
bool force_flush = false;
+ unsigned long old_addr = pmc->old_addr;
+ unsigned long new_addr = pmc->new_addr;
+ unsigned long old_end = old_addr + extent;
unsigned long len = old_end - old_addr;
+ int max_nr_ptes;
+ int nr_ptes;
int err = 0;
/*
@@ -165,15 +227,15 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* serialize access to individual ptes, but only rmap traversal
* order guarantees that we won't miss both the old and new ptes).
*/
- if (need_rmap_locks)
+ if (pmc->need_rmap_locks)
take_rmap_locks(vma);
/*
* We don't have to worry about the ordering of src and dst
* pte locks because exclusive mmap_lock prevents deadlock.
*/
- old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
- if (!old_pte) {
+ old_ptep = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
+ if (!old_ptep) {
err = -EAGAIN;
goto out;
}
@@ -184,10 +246,10 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* mmap_lock, so this new_pte page is stable, so there is no need to get
* pmdval and do pmd_same() check.
*/
- new_pte = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval,
+ new_ptep = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval,
&new_ptl);
- if (!new_pte) {
- pte_unmap_unlock(old_pte, old_ptl);
+ if (!new_ptep) {
+ pte_unmap_unlock(old_ptep, old_ptl);
err = -EAGAIN;
goto out;
}
@@ -196,12 +258,16 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
- for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
- new_pte++, new_addr += PAGE_SIZE) {
- if (pte_none(ptep_get(old_pte)))
+ for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE,
+ new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) {
+ VM_WARN_ON_ONCE(!pte_none(*new_ptep));
+
+ nr_ptes = 1;
+ max_nr_ptes = (old_end - old_addr) >> PAGE_SHIFT;
+ old_pte = ptep_get(old_ptep);
+ if (pte_none(old_pte))
continue;
- pte = ptep_get_and_clear(mm, old_addr, old_pte);
/*
* If we are remapping a valid PTE, make sure
* to flush TLB before we drop the PTL for the
@@ -213,13 +279,17 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* the TLB entry for the old mapping has been
* flushed.
*/
- if (pte_present(pte))
+ if (pte_present(old_pte)) {
+ nr_ptes = mremap_folio_pte_batch(vma, old_addr, old_ptep,
+ old_pte, max_nr_ptes);
force_flush = true;
+ }
+ pte = get_and_clear_ptes(mm, old_addr, old_ptep, nr_ptes);
pte = move_pte(pte, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
- pte_clear(mm, new_addr, new_pte);
+ pte_clear(mm, new_addr, new_ptep);
else {
if (need_clear_uffd_wp) {
if (pte_present(pte))
@@ -227,7 +297,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
else if (is_swap_pte(pte))
pte = pte_swp_clear_uffd_wp(pte);
}
- set_pte_at(mm, new_addr, new_pte, pte);
+ set_ptes(mm, new_addr, new_ptep, pte, nr_ptes);
}
}
@@ -236,10 +306,10 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
flush_tlb_range(vma, old_end - len, old_end);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
- pte_unmap(new_pte - 1);
- pte_unmap_unlock(old_pte - 1, old_ptl);
+ pte_unmap(new_ptep - 1);
+ pte_unmap_unlock(old_ptep - 1, old_ptl);
out:
- if (need_rmap_locks)
+ if (pmc->need_rmap_locks)
drop_rmap_locks(vma);
return err;
}
@@ -253,17 +323,39 @@ static inline bool arch_supports_page_table_move(void)
}
#endif
+static inline bool uffd_supports_page_table_move(struct pagetable_move_control *pmc)
+{
+ /*
+ * If we are moving a VMA that has uffd-wp registered but with
+ * remap events disabled (new VMA will not be registered with uffd), we
+ * need to ensure that the uffd-wp state is cleared from all pgtables.
+ * This means recursing into lower page tables in move_page_tables().
+ *
+ * We might get called with VMAs reversed when recovering from a
+ * failed page table move. In that case, the
+ * "old"-but-actually-"originally new" VMA during recovery will not have
+ * a uffd context. Recursing into lower page tables during the original
+ * move but not during the recovery move will cause trouble, because we
+ * run into already-existing page tables. So check both VMAs.
+ */
+ return !vma_has_uffd_without_event_remap(pmc->old) &&
+ !vma_has_uffd_without_event_remap(pmc->new);
+}
+
#ifdef CONFIG_HAVE_MOVE_PMD
-static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
+static bool move_normal_pmd(struct pagetable_move_control *pmc,
+ pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
+ struct vm_area_struct *vma = pmc->old;
struct mm_struct *mm = vma->vm_mm;
bool res = false;
pmd_t pmd;
if (!arch_supports_page_table_move())
return false;
+ if (!uffd_supports_page_table_move(pmc))
+ return false;
/*
* The destination pmd shouldn't be established, free_pgtables()
* should have released it.
@@ -290,20 +382,11 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
return false;
- /* If this pmd belongs to a uffd vma with remap events disabled, we need
- * to ensure that the uffd-wp state is cleared from all pgtables. This
- * means recursing into lower page tables in move_page_tables(), and we
- * can reuse the existing code if we simply treat the entry as "not
- * moved".
- */
- if (vma_has_uffd_without_event_remap(vma))
- return false;
-
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
*/
- old_ptl = pmd_lock(vma->vm_mm, old_pmd);
+ old_ptl = pmd_lock(mm, old_pmd);
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -320,7 +403,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
VM_BUG_ON(!pmd_none(*new_pmd));
pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
- flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+ flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PMD_SIZE);
out_unlock:
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
@@ -329,24 +412,26 @@ out_unlock:
return res;
}
#else
-static inline bool move_normal_pmd(struct vm_area_struct *vma,
- unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
- pmd_t *new_pmd)
+static inline bool move_normal_pmd(struct pagetable_move_control *pmc,
+ pmd_t *old_pmd, pmd_t *new_pmd)
{
return false;
}
#endif
#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
-static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+static bool move_normal_pud(struct pagetable_move_control *pmc,
+ pud_t *old_pud, pud_t *new_pud)
{
spinlock_t *old_ptl, *new_ptl;
+ struct vm_area_struct *vma = pmc->old;
struct mm_struct *mm = vma->vm_mm;
pud_t pud;
if (!arch_supports_page_table_move())
return false;
+ if (!uffd_supports_page_table_move(pmc))
+ return false;
/*
* The destination pud shouldn't be established, free_pgtables()
* should have released it.
@@ -354,20 +439,11 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pud_none(*new_pud)))
return false;
- /* If this pud belongs to a uffd vma with remap events disabled, we need
- * to ensure that the uffd-wp state is cleared from all pgtables. This
- * means recursing into lower page tables in move_page_tables(), and we
- * can reuse the existing code if we simply treat the entry as "not
- * moved".
- */
- if (vma_has_uffd_without_event_remap(vma))
- return false;
-
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
*/
- old_ptl = pud_lock(vma->vm_mm, old_pud);
+ old_ptl = pud_lock(mm, old_pud);
new_ptl = pud_lockptr(mm, new_pud);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -379,7 +455,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
VM_BUG_ON(!pud_none(*new_pud));
pud_populate(mm, new_pud, pud_pgtable(pud));
- flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
+ flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PUD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
@@ -387,19 +463,19 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
return true;
}
#else
-static inline bool move_normal_pud(struct vm_area_struct *vma,
- unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
- pud_t *new_pud)
+static inline bool move_normal_pud(struct pagetable_move_control *pmc,
+ pud_t *old_pud, pud_t *new_pud)
{
return false;
}
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
-static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+static bool move_huge_pud(struct pagetable_move_control *pmc,
+ pud_t *old_pud, pud_t *new_pud)
{
spinlock_t *old_ptl, *new_ptl;
+ struct vm_area_struct *vma = pmc->old;
struct mm_struct *mm = vma->vm_mm;
pud_t pud;
@@ -414,7 +490,7 @@ static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
*/
- old_ptl = pud_lock(vma->vm_mm, old_pud);
+ old_ptl = pud_lock(mm, old_pud);
new_ptl = pud_lockptr(mm, new_pud);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -427,8 +503,8 @@ static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
/* Set the new pud */
/* mark soft_ditry when we add pud level soft dirty support */
- set_pud_at(mm, new_addr, new_pud, pud);
- flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
+ set_pud_at(mm, pmc->new_addr, new_pud, pud);
+ flush_pud_tlb_range(vma, pmc->old_addr, pmc->old_addr + HPAGE_PUD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
@@ -436,8 +512,9 @@ static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
return true;
}
#else
-static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+static bool move_huge_pud(struct pagetable_move_control *pmc,
+ pud_t *old_pud, pud_t *new_pud)
{
WARN_ON_ONCE(1);
return false;
@@ -458,10 +535,12 @@ enum pgt_entry {
* destination pgt_entry.
*/
static __always_inline unsigned long get_extent(enum pgt_entry entry,
- unsigned long old_addr, unsigned long old_end,
- unsigned long new_addr)
+ struct pagetable_move_control *pmc)
{
unsigned long next, extent, mask, size;
+ unsigned long old_addr = pmc->old_addr;
+ unsigned long old_end = pmc->old_end;
+ unsigned long new_addr = pmc->new_addr;
switch (entry) {
case HPAGE_PMD:
@@ -491,37 +570,50 @@ static __always_inline unsigned long get_extent(enum pgt_entry entry,
}
/*
+ * Should move_pgt_entry() acquire the rmap locks? This is either expressed in
+ * the PMC, or overridden in the case of normal, larger page tables.
+ */
+static bool should_take_rmap_locks(struct pagetable_move_control *pmc,
+ enum pgt_entry entry)
+{
+ switch (entry) {
+ case NORMAL_PMD:
+ case NORMAL_PUD:
+ return true;
+ default:
+ return pmc->need_rmap_locks;
+ }
+}
+
+/*
* Attempts to speedup the move by moving entry at the level corresponding to
* pgt_entry. Returns true if the move was successful, else false.
*/
-static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
- unsigned long old_addr, unsigned long new_addr,
- void *old_entry, void *new_entry, bool need_rmap_locks)
+static bool move_pgt_entry(struct pagetable_move_control *pmc,
+ enum pgt_entry entry, void *old_entry, void *new_entry)
{
bool moved = false;
+ bool need_rmap_locks = should_take_rmap_locks(pmc, entry);
/* See comment in move_ptes() */
if (need_rmap_locks)
- take_rmap_locks(vma);
+ take_rmap_locks(pmc->old);
switch (entry) {
case NORMAL_PMD:
- moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
- new_entry);
+ moved = move_normal_pmd(pmc, old_entry, new_entry);
break;
case NORMAL_PUD:
- moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
- new_entry);
+ moved = move_normal_pud(pmc, old_entry, new_entry);
break;
case HPAGE_PMD:
moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- move_huge_pmd(vma, old_addr, new_addr, old_entry,
+ move_huge_pmd(pmc->old, pmc->old_addr, pmc->new_addr, old_entry,
new_entry);
break;
case HPAGE_PUD:
moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- move_huge_pud(vma, old_addr, new_addr, old_entry,
- new_entry);
+ move_huge_pud(pmc, old_entry, new_entry);
break;
default:
@@ -530,7 +622,7 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
}
if (need_rmap_locks)
- drop_rmap_locks(vma);
+ drop_rmap_locks(pmc->old);
return moved;
}
@@ -541,8 +633,9 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
* the VMA that is created to span the source and destination of the move,
* so we make an exception for it.
*/
-static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
- unsigned long mask, bool for_stack)
+static bool can_align_down(struct pagetable_move_control *pmc,
+ struct vm_area_struct *vma, unsigned long addr_to_align,
+ unsigned long mask)
{
unsigned long addr_masked = addr_to_align & mask;
@@ -551,11 +644,11 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali
* of the corresponding VMA, we can't align down or we will destroy part
* of the current mapping.
*/
- if (!for_stack && vma->vm_start != addr_to_align)
+ if (!pmc->for_stack && vma->vm_start != addr_to_align)
return false;
/* In the stack case we explicitly permit in-VMA alignment. */
- if (for_stack && addr_masked >= vma->vm_start)
+ if (pmc->for_stack && addr_masked >= vma->vm_start)
return true;
/*
@@ -565,163 +658,391 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali
return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL;
}
-/* Opportunistically realign to specified boundary for faster copy. */
-static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma,
- unsigned long *new_addr, struct vm_area_struct *new_vma,
- unsigned long mask, bool for_stack)
+/*
+ * Determine if we are in fact able to realign for efficiency to a higher page
+ * table boundary.
+ */
+static bool can_realign_addr(struct pagetable_move_control *pmc,
+ unsigned long pagetable_mask)
{
+ unsigned long align_mask = ~pagetable_mask;
+ unsigned long old_align = pmc->old_addr & align_mask;
+ unsigned long new_align = pmc->new_addr & align_mask;
+ unsigned long pagetable_size = align_mask + 1;
+ unsigned long old_align_next = pagetable_size - old_align;
+
+ /*
+ * We don't want to have to go hunting for VMAs from the end of the old
+ * VMA to the next page table boundary, and we also want to make sure the
+ * operation is worthwhile.
+ *
+ * So ensure that we only perform this realignment if the end of the
+ * range being copied reaches or crosses the page table boundary.
+ *
+ * boundary boundary
+ * .<- old_align -> .
+ * . |----------------.-----------|
+ * . | vma . |
+ * . |----------------.-----------|
+ * . <----------------.----------->
+ * . len_in
+ * <------------------------------->
+ * . pagetable_size .
+ * . <---------------->
+ * . old_align_next .
+ */
+ if (pmc->len_in < old_align_next)
+ return false;
+
/* Skip if the addresses are already aligned. */
- if ((*old_addr & ~mask) == 0)
- return;
+ if (old_align == 0)
+ return false;
/* Only realign if the new and old addresses are mutually aligned. */
- if ((*old_addr & ~mask) != (*new_addr & ~mask))
- return;
+ if (old_align != new_align)
+ return false;
/* Ensure realignment doesn't cause overlap with existing mappings. */
- if (!can_align_down(old_vma, *old_addr, mask, for_stack) ||
- !can_align_down(new_vma, *new_addr, mask, for_stack))
+ if (!can_align_down(pmc, pmc->old, pmc->old_addr, pagetable_mask) ||
+ !can_align_down(pmc, pmc->new, pmc->new_addr, pagetable_mask))
+ return false;
+
+ return true;
+}
+
+/*
+ * Opportunistically realign to specified boundary for faster copy.
+ *
+ * Consider an mremap() of a VMA with page table boundaries as below, and no
+ * preceding VMAs from the lower page table boundary to the start of the VMA,
+ * with the end of the range reaching or crossing the page table boundary.
+ *
+ * boundary boundary
+ * . |----------------.-----------|
+ * . | vma . |
+ * . |----------------.-----------|
+ * . pmc->old_addr . pmc->old_end
+ * . <---------------------------->
+ * . move these page tables
+ *
+ * If we proceed with moving page tables in this scenario, we will have a lot of
+ * work to do traversing old page tables and establishing new ones in the
+ * destination across multiple lower level page tables.
+ *
+ * The idea here is simply to align pmc->old_addr, pmc->new_addr down to the
+ * page table boundary, so we can simply copy a single page table entry for the
+ * aligned portion of the VMA instead:
+ *
+ * boundary boundary
+ * . |----------------.-----------|
+ * . | vma . |
+ * . |----------------.-----------|
+ * pmc->old_addr . pmc->old_end
+ * <------------------------------------------->
+ * . move these page tables
+ */
+static void try_realign_addr(struct pagetable_move_control *pmc,
+ unsigned long pagetable_mask)
+{
+
+ if (!can_realign_addr(pmc, pagetable_mask))
return;
- *old_addr = *old_addr & mask;
- *new_addr = *new_addr & mask;
+ /*
+ * Simply align to page table boundaries. Note that we do NOT update the
+ * pmc->old_end value, and since the move_page_tables() operation spans
+ * from [old_addr, old_end) (offsetting new_addr as it is performed),
+ * this simply changes the start of the copy, not the end.
+ */
+ pmc->old_addr &= pagetable_mask;
+ pmc->new_addr &= pagetable_mask;
+}
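A standalone sketch of the alignment arithmetic described above, assuming a 2 MiB PMD span purely for illustration. It reproduces the old_align/new_align/old_align_next computation from can_realign_addr(); the VMA-intersection check done by can_align_down() is omitted, and the example addresses are assumptions.

/* Illustrative recomputation of the realignment checks (not kernel code). */
#include <stdio.h>

#define PMD_SIZE_DEMO	(2UL << 20)		/* assume 2 MiB for the demo */
#define PMD_MASK_DEMO	(~(PMD_SIZE_DEMO - 1))

int main(void)
{
	unsigned long old_addr = 0x7f0000180000UL;
	unsigned long new_addr = 0x7f1000180000UL;
	unsigned long len_in   = 4UL << 20;	/* 4 MiB being moved */
	unsigned long align_mask = ~PMD_MASK_DEMO;
	unsigned long old_align = old_addr & align_mask;
	unsigned long new_align = new_addr & align_mask;
	unsigned long old_align_next = (align_mask + 1) - old_align;

	if (len_in >= old_align_next && old_align && old_align == new_align)
		printf("realign: copy from %#lx / %#lx instead\n",
		       old_addr & PMD_MASK_DEMO, new_addr & PMD_MASK_DEMO);
	else
		printf("no realignment\n");
	return 0;
}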
+
+/* Is the page table move operation done? */
+static bool pmc_done(struct pagetable_move_control *pmc)
+{
+ return pmc->old_addr >= pmc->old_end;
}
-unsigned long move_page_tables(struct vm_area_struct *vma,
- unsigned long old_addr, struct vm_area_struct *new_vma,
- unsigned long new_addr, unsigned long len,
- bool need_rmap_locks, bool for_stack)
+/* Advance to the next page table, offset by extent bytes. */
+static void pmc_next(struct pagetable_move_control *pmc, unsigned long extent)
{
- unsigned long extent, old_end;
+ pmc->old_addr += extent;
+ pmc->new_addr += extent;
+}
+
+/*
+ * Determine how many bytes in the specified input range have had their page
+ * tables moved so far.
+ */
+static unsigned long pmc_progress(struct pagetable_move_control *pmc)
+{
+ unsigned long orig_old_addr = pmc->old_end - pmc->len_in;
+ unsigned long old_addr = pmc->old_addr;
+
+ /*
+ * Prevent negative return values when {old,new}_addr was realigned but
+ * we broke out of the loop in move_page_tables() for the first PMD
+ * itself.
+ */
+ return old_addr < orig_old_addr ? 0 : old_addr - orig_old_addr;
+}
+
+unsigned long move_page_tables(struct pagetable_move_control *pmc)
+{
+ unsigned long extent;
struct mmu_notifier_range range;
pmd_t *old_pmd, *new_pmd;
pud_t *old_pud, *new_pud;
+ struct mm_struct *mm = pmc->old->vm_mm;
- if (!len)
+ if (!pmc->len_in)
return 0;
- old_end = old_addr + len;
-
- if (is_vm_hugetlb_page(vma))
- return move_hugetlb_page_tables(vma, new_vma, old_addr,
- new_addr, len);
+ if (is_vm_hugetlb_page(pmc->old))
+ return move_hugetlb_page_tables(pmc->old, pmc->new, pmc->old_addr,
+ pmc->new_addr, pmc->len_in);
/*
* If possible, realign addresses to PMD boundary for faster copy.
* Only realign if the mremap copying hits a PMD boundary.
*/
- if (len >= PMD_SIZE - (old_addr & ~PMD_MASK))
- try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK,
- for_stack);
+ try_realign_addr(pmc, PMD_MASK);
- flush_cache_range(vma, old_addr, old_end);
- mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
- old_addr, old_end);
+ flush_cache_range(pmc->old, pmc->old_addr, pmc->old_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, mm,
+ pmc->old_addr, pmc->old_end);
mmu_notifier_invalidate_range_start(&range);
- for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
+ for (; !pmc_done(pmc); pmc_next(pmc, extent)) {
cond_resched();
/*
* If extent is PUD-sized try to speed up the move by moving at the
* PUD level if possible.
*/
- extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
+ extent = get_extent(NORMAL_PUD, pmc);
- old_pud = get_old_pud(vma->vm_mm, old_addr);
+ old_pud = get_old_pud(mm, pmc->old_addr);
if (!old_pud)
continue;
- new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
+ new_pud = alloc_new_pud(mm, pmc->new_addr);
if (!new_pud)
break;
- if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
+ if (pud_trans_huge(*old_pud)) {
if (extent == HPAGE_PUD_SIZE) {
- move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
- old_pud, new_pud, need_rmap_locks);
+ move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud);
/* We ignore and continue on error? */
continue;
}
} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
-
- if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
- old_pud, new_pud, true))
+ if (move_pgt_entry(pmc, NORMAL_PUD, old_pud, new_pud))
continue;
}
- extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
- old_pmd = get_old_pmd(vma->vm_mm, old_addr);
+ extent = get_extent(NORMAL_PMD, pmc);
+ old_pmd = get_old_pmd(mm, pmc->old_addr);
if (!old_pmd)
continue;
- new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
+ new_pmd = alloc_new_pmd(mm, pmc->new_addr);
if (!new_pmd)
break;
again:
- if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
- pmd_devmap(*old_pmd)) {
+ if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE &&
- move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
- old_pmd, new_pmd, need_rmap_locks))
+ move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd))
continue;
- split_huge_pmd(vma, old_pmd, old_addr);
+ split_huge_pmd(pmc->old, old_pmd, pmc->old_addr);
} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
extent == PMD_SIZE) {
/*
* If the extent is PMD-sized, try to speed the move by
* moving at the PMD level if possible.
*/
- if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
- old_pmd, new_pmd, true))
+ if (move_pgt_entry(pmc, NORMAL_PMD, old_pmd, new_pmd))
continue;
}
if (pmd_none(*old_pmd))
continue;
- if (pte_alloc(new_vma->vm_mm, new_pmd))
+ if (pte_alloc(pmc->new->vm_mm, new_pmd))
break;
- if (move_ptes(vma, old_pmd, old_addr, old_addr + extent,
- new_vma, new_pmd, new_addr, need_rmap_locks) < 0)
+ if (move_ptes(pmc, extent, old_pmd, new_pmd) < 0)
goto again;
}
mmu_notifier_invalidate_range_end(&range);
+ return pmc_progress(pmc);
+}
+
+/* Set vrm->delta to the difference in VMA size specified by user. */
+static void vrm_set_delta(struct vma_remap_struct *vrm)
+{
+ vrm->delta = abs_diff(vrm->old_len, vrm->new_len);
+}
+
+/* Determine what kind of remap this is - shrink, expand or no resize at all. */
+static enum mremap_type vrm_remap_type(struct vma_remap_struct *vrm)
+{
+ if (vrm->delta == 0)
+ return MREMAP_NO_RESIZE;
+
+ if (vrm->old_len > vrm->new_len)
+ return MREMAP_SHRINK;
+
+ return MREMAP_EXPAND;
+}
+
+/*
+ * When moving a VMA to vrm->new_addr, does this result in the new and old VMAs
+ * overlapping?
+ */
+static bool vrm_overlaps(struct vma_remap_struct *vrm)
+{
+ unsigned long start_old = vrm->addr;
+ unsigned long start_new = vrm->new_addr;
+ unsigned long end_old = vrm->addr + vrm->old_len;
+ unsigned long end_new = vrm->new_addr + vrm->new_len;
+
/*
- * Prevent negative return values when {old,new}_addr was realigned
- * but we broke out of the above loop for the first PMD itself.
+ * start_old end_old
+ * |-----------|
+ * | |
+ * |-----------|
+ * |-------------|
+ * | |
+ * |-------------|
+ * start_new end_new
*/
- if (old_addr < old_end - len)
- return 0;
+ if (end_old > start_new && end_new > start_old)
+ return true;
- return len + old_addr - old_end; /* how much done */
+ return false;
}
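The test above is the usual half-open interval overlap check; a minimal standalone demonstration follows (the addresses are illustrative assumptions).

/* Half-open interval overlap check, as used by vrm_overlaps() (illustrative only). */
#include <stdbool.h>
#include <stdio.h>

static bool ranges_overlap(unsigned long start_old, unsigned long end_old,
			   unsigned long start_new, unsigned long end_new)
{
	return end_old > start_new && end_new > start_old;
}

int main(void)
{
	/* [0x1000, 0x3000) vs [0x2000, 0x4000): overlap, prints 1. */
	printf("%d\n", ranges_overlap(0x1000, 0x3000, 0x2000, 0x4000));
	/* [0x1000, 0x2000) vs [0x2000, 0x3000): adjacent, no overlap, prints 0. */
	printf("%d\n", ranges_overlap(0x1000, 0x2000, 0x2000, 0x3000));
	return 0;
}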
-static unsigned long move_vma(struct vm_area_struct *vma,
- unsigned long old_addr, unsigned long old_len,
- unsigned long new_len, unsigned long new_addr,
- bool *locked, unsigned long flags,
- struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
+/*
+ * Will a new address definitely be assigned? This is the case either if the
+ * user specifies it via MREMAP_FIXED, or if MREMAP_DONTUNMAP is used,
+ * indicating we will always determine a target address.
+ */
+static bool vrm_implies_new_addr(struct vma_remap_struct *vrm)
{
- long to_account = new_len - old_len;
- struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *new_vma;
- unsigned long vm_flags = vma->vm_flags;
- unsigned long new_pgoff;
- unsigned long moved_len;
- unsigned long account_start = 0;
- unsigned long account_end = 0;
- unsigned long hiwater_vm;
- int err = 0;
- bool need_rmap_locks;
- struct vma_iterator vmi;
+ return vrm->flags & (MREMAP_FIXED | MREMAP_DONTUNMAP);
+}
+
+/*
+ * Find an unmapped area for the requested vrm->new_addr.
+ *
+ * If MREMAP_FIXED then this is equivalent to a MAP_FIXED mmap() call. If only
+ * MREMAP_DONTUNMAP is set, then this is equivalent to providing a hint to
+ * mmap(), otherwise this is equivalent to mmap() specifying a NULL address.
+ *
+ * Returns 0 on success (with vrm->new_addr updated), or an error code upon
+ * failure.
+ */
+static unsigned long vrm_set_new_addr(struct vma_remap_struct *vrm)
+{
+ struct vm_area_struct *vma = vrm->vma;
+ unsigned long map_flags = 0;
+ /* Page Offset _into_ the VMA. */
+ pgoff_t internal_pgoff = (vrm->addr - vma->vm_start) >> PAGE_SHIFT;
+ pgoff_t pgoff = vma->vm_pgoff + internal_pgoff;
+ unsigned long new_addr = vrm_implies_new_addr(vrm) ? vrm->new_addr : 0;
+ unsigned long res;
+
+ if (vrm->flags & MREMAP_FIXED)
+ map_flags |= MAP_FIXED;
+ if (vma->vm_flags & VM_MAYSHARE)
+ map_flags |= MAP_SHARED;
+
+ res = get_unmapped_area(vma->vm_file, new_addr, vrm->new_len, pgoff,
+ map_flags);
+ if (IS_ERR_VALUE(res))
+ return res;
+
+ vrm->new_addr = res;
+ return 0;
+}
+
+/*
+ * Keep track of pages which have been added to the memory mapping. If the VMA
+ * is accounted, also check to see if there is sufficient memory.
+ *
+ * Returns true on success, false if insufficient memory to charge.
+ */
+static bool vrm_calc_charge(struct vma_remap_struct *vrm)
+{
+ unsigned long charged;
+
+ if (!(vrm->vma->vm_flags & VM_ACCOUNT))
+ return true;
+
+ /*
+ * If we don't unmap the old mapping, then we account the entirety of
+ * the length of the new one. Otherwise it's just the delta in size.
+ */
+ if (vrm->flags & MREMAP_DONTUNMAP)
+ charged = vrm->new_len >> PAGE_SHIFT;
+ else
+ charged = vrm->delta >> PAGE_SHIFT;
+
+ /* This accounts 'charged' pages of memory. */
+ if (security_vm_enough_memory_mm(current->mm, charged))
+ return false;
+
+ vrm->charged = charged;
+ return true;
+}
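To make the charging rule concrete (the sizes and page shift below are illustrative assumptions): with MREMAP_DONTUNMAP the entire new length is charged because the old mapping is retained, otherwise only the delta is charged.

/* Illustrative charge computation mirroring vrm_calc_charge() (not kernel code). */
#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12;		/* assume 4 KiB pages */
	unsigned long old_len = 4UL << 20;	/* 4 MiB */
	unsigned long new_len = 6UL << 20;	/* 6 MiB */
	unsigned long delta = new_len - old_len;

	printf("dontunmap charge: %lu pages\n", new_len >> page_shift);	/* 1536 */
	printf("regular charge:   %lu pages\n", delta >> page_shift);	/* 512  */
	return 0;
}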
+
+/*
+ * An error has occurred, so we will not be using vrm->charged memory. Unaccount
+ * this memory if the VMA is accounted.
+ */
+static void vrm_uncharge(struct vma_remap_struct *vrm)
+{
+ if (!(vrm->vma->vm_flags & VM_ACCOUNT))
+ return;
+
+ vm_unacct_memory(vrm->charged);
+ vrm->charged = 0;
+}
+
+/*
+ * Update mm exec_vm, stack_vm, data_vm, and locked_vm fields as needed to
+ * account for 'bytes' memory used, and if locked, indicate this in the VRM so
+ * we can handle this correctly later.
+ */
+static void vrm_stat_account(struct vma_remap_struct *vrm,
+ unsigned long bytes)
+{
+ unsigned long pages = bytes >> PAGE_SHIFT;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = vrm->vma;
+
+ vm_stat_account(mm, vma->vm_flags, pages);
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm += pages;
+}
+
+/*
+ * Perform checks before attempting to write a VMA prior to it being
+ * moved.
+ */
+static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
+{
+ unsigned long err = 0;
+ struct vm_area_struct *vma = vrm->vma;
+ unsigned long old_addr = vrm->addr;
+ unsigned long old_len = vrm->old_len;
+ vm_flags_t dummy = vma->vm_flags;
/*
* We'd prefer to avoid failure later on in do_munmap:
* which may split one vma into three before unmapping.
*/
- if (mm->map_count >= sysctl_max_map_count - 3)
+ if (current->mm->map_count >= sysctl_max_map_count - 3)
return -ENOMEM;
- if (unlikely(flags & MREMAP_DONTUNMAP))
- to_account = new_len;
-
if (vma->vm_ops && vma->vm_ops->may_split) {
if (vma->vm_start != old_addr)
err = vma->vm_ops->may_split(vma, old_addr);
@@ -739,61 +1060,239 @@ static unsigned long move_vma(struct vm_area_struct *vma,
* so KSM can come around to merge on vma and new_vma afterwards.
*/
err = ksm_madvise(vma, old_addr, old_addr + old_len,
- MADV_UNMERGEABLE, &vm_flags);
+ MADV_UNMERGEABLE, &dummy);
if (err)
return err;
- if (vm_flags & VM_ACCOUNT) {
- if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
- return -ENOMEM;
+ return 0;
+}
+
+/*
+ * Unmap source VMA for VMA move, turning it from a copy to a move, being
+ * careful to ensure we do not underflow memory account while doing so if an
+ * accountable move.
+ *
+ * This is best effort, if we fail to unmap then we simply try to correct
+ * accounting and exit.
+ */
+static void unmap_source_vma(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr = vrm->addr;
+ unsigned long len = vrm->old_len;
+ struct vm_area_struct *vma = vrm->vma;
+ VMA_ITERATOR(vmi, mm, addr);
+ int err;
+ unsigned long vm_start;
+ unsigned long vm_end;
+ /*
+ * It might seem odd that we check for MREMAP_DONTUNMAP here, given this
+ * function implies that we unmap the original VMA, which seems
+ * contradictory.
+ *
+ * However, this occurs when this operation was attempted and an error
+ * arose, in which case we _do_ wish to unmap the _new_ VMA, which means
+ * we actually _do_ want it to be unaccounted.
+ */
+ bool accountable_move = (vma->vm_flags & VM_ACCOUNT) &&
+ !(vrm->flags & MREMAP_DONTUNMAP);
+
+ /*
+ * So we perform a trick here to prevent incorrect accounting. Any merge
+ * or new VMA allocation performed in copy_vma() does not adjust
+ * accounting; it is expected that callers handle this.
+ *
+ * And indeed we already have, accounting appropriately in both cases in
+ * vrm_calc_charge().
+ *
+ * However, when we unmap the existing VMA (to effect the move), this
+ * code will, if the VMA has VM_ACCOUNT set, attempt to unaccount
+ * removed pages.
+ *
+ * To avoid this we temporarily clear this flag, reinstating on any
+ * portions of the original VMA that remain.
+ */
+ if (accountable_move) {
+ vm_flags_clear(vma, VM_ACCOUNT);
+ /* We are about to split vma, so store the start/end. */
+ vm_start = vma->vm_start;
+ vm_end = vma->vm_end;
+ }
+
+ err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false);
+ vrm->vma = NULL; /* Invalidated. */
+ vrm->vmi_needs_invalidate = true;
+ if (err) {
+ /* OOM: unable to split vma, just get accounts right */
+ vm_acct_memory(len >> PAGE_SHIFT);
+ return;
}
- vma_start_write(vma);
- new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
- new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
- &need_rmap_locks);
+ /*
+ * If we mremap() from a VMA like this:
+ *
+ * addr end
+ * | |
+ * v v
+ * |-------------|
+ * | |
+ * |-------------|
+ *
+ * Having cleared VM_ACCOUNT from the whole VMA, after we unmap above
+ * we'll end up with:
+ *
+ * addr end
+ * | |
+ * v v
+ * |---| |---|
+ * | A | | B |
+ * |---| |---|
+ *
+ * The VMI is still pointing at addr, so vma_prev() will give us A, and
+ * a subsequent or lone vma_next() will give as B.
+ *
+ * do_vmi_munmap() will have restored the VMI back to addr.
+ */
+ if (accountable_move) {
+ unsigned long end = addr + len;
+
+ if (vm_start < addr) {
+ struct vm_area_struct *prev = vma_prev(&vmi);
+
+ vm_flags_set(prev, VM_ACCOUNT); /* Acquires VMA lock. */
+ }
+
+ if (vm_end > end) {
+ struct vm_area_struct *next = vma_next(&vmi);
+
+ vm_flags_set(next, VM_ACCOUNT); /* Acquires VMA lock. */
+ }
+ }
+}
+
+/*
+ * Copy vrm->vma over to vrm->new_addr, possibly adjusting size as part of the
+ * process. Additionally handle an error occurring on moving of page tables,
+ * where we reset vrm state to cause unmapping of the new VMA.
+ *
+ * Outputs the newly installed VMA to new_vma_ptr. Returns 0 on success or an
+ * error code.
+ */
+static int copy_vma_and_data(struct vma_remap_struct *vrm,
+ struct vm_area_struct **new_vma_ptr)
+{
+ unsigned long internal_offset = vrm->addr - vrm->vma->vm_start;
+ unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT;
+ unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff;
+ unsigned long moved_len;
+ struct vm_area_struct *vma = vrm->vma;
+ struct vm_area_struct *new_vma;
+ int err = 0;
+ PAGETABLE_MOVE(pmc, NULL, NULL, vrm->addr, vrm->new_addr, vrm->old_len);
+
+ new_vma = copy_vma(&vma, vrm->new_addr, vrm->new_len, new_pgoff,
+ &pmc.need_rmap_locks);
if (!new_vma) {
- if (vm_flags & VM_ACCOUNT)
- vm_unacct_memory(to_account >> PAGE_SHIFT);
+ vrm_uncharge(vrm);
+ *new_vma_ptr = NULL;
return -ENOMEM;
}
+ /* By merging, we may have invalidated any iterator in use. */
+ if (vma != vrm->vma)
+ vrm->vmi_needs_invalidate = true;
+
+ vrm->vma = vma;
+ pmc.old = vma;
+ pmc.new = new_vma;
- moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
- need_rmap_locks, false);
- if (moved_len < old_len) {
+ moved_len = move_page_tables(&pmc);
+ if (moved_len < vrm->old_len)
err = -ENOMEM;
- } else if (vma->vm_ops && vma->vm_ops->mremap) {
+ else if (vma->vm_ops && vma->vm_ops->mremap)
err = vma->vm_ops->mremap(new_vma);
- }
if (unlikely(err)) {
+ PAGETABLE_MOVE(pmc_revert, new_vma, vma, vrm->new_addr,
+ vrm->addr, moved_len);
+
/*
* On error, move entries back from new area to old,
* which will succeed since page tables still there,
* and then proceed to unmap new area instead of old.
*/
- move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
- true, false);
- vma = new_vma;
- old_len = new_len;
- old_addr = new_addr;
- new_addr = err;
+ pmc_revert.need_rmap_locks = true;
+ move_page_tables(&pmc_revert);
+
+ vrm->vma = new_vma;
+ vrm->old_len = vrm->new_len;
+ vrm->addr = vrm->new_addr;
} else {
- mremap_userfaultfd_prep(new_vma, uf);
+ mremap_userfaultfd_prep(new_vma, vrm->uf);
}
- if (is_vm_hugetlb_page(vma)) {
- clear_vma_resv_huge_pages(vma);
- }
+ fixup_hugetlb_reservations(vma);
- /* Conceal VM_ACCOUNT so old reservation is not undone */
- if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
- vm_flags_clear(vma, VM_ACCOUNT);
- if (vma->vm_start < old_addr)
- account_start = vma->vm_start;
- if (vma->vm_end > old_addr + old_len)
- account_end = vma->vm_end;
- }
+ *new_vma_ptr = new_vma;
+ return err;
+}
+
+/*
+ * Perform final tasks for MREMAP_DONTUNMAP operation, clearing mlock() flag on
+ * remaining VMA by convention (it cannot be mlock()'d any longer, as pages in
+ * range are no longer mapped), and removing anon_vma_chain links from it if the
+ * entire VMA was copied over.
+ */
+static void dontunmap_complete(struct vma_remap_struct *vrm,
+ struct vm_area_struct *new_vma)
+{
+ unsigned long start = vrm->addr;
+ unsigned long end = vrm->addr + vrm->old_len;
+ unsigned long old_start = vrm->vma->vm_start;
+ unsigned long old_end = vrm->vma->vm_end;
+
+ /* We always clear VM_LOCKED[ONFAULT] on the old VMA. */
+ vm_flags_clear(vrm->vma, VM_LOCKED_MASK);
+
+ /*
+ * The anon_vma links of the old vma are no longer needed after its page
+ * tables have been moved.
+ */
+ if (new_vma != vrm->vma && start == old_start && end == old_end)
+ unlink_anon_vmas(vrm->vma);
+
+ /* Because we won't unmap we don't need to touch locked_vm. */
+}
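For reference, a minimal userspace sketch of the MREMAP_DONTUNMAP case handled above: the pages move to the new address while the source mapping stays in place, but empty. This assumes a kernel and libc exposing MREMAP_DONTUNMAP; a fallback to the uapi value is included.

/* Illustrative userspace use of MREMAP_DONTUNMAP (requires a recent kernel). */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MREMAP_DONTUNMAP
#define MREMAP_DONTUNMAP 4	/* uapi value */
#endif

int main(void)
{
	size_t len = 4 * (size_t)sysconf(_SC_PAGESIZE);
	char *old = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *newp;

	if (old == MAP_FAILED)
		return 1;
	memset(old, 0x5a, len);
	/* Same length, may move, keep the (now empty) source mapping in place. */
	newp = mremap(old, len, len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
	if (newp == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	printf("moved data: 0x%x; old mapping still present but empty\n",
	       (unsigned char)newp[0]);
	return 0;
}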
+
+static unsigned long move_vma(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *new_vma;
+ unsigned long hiwater_vm;
+ int err;
+
+ err = prep_move_vma(vrm);
+ if (err)
+ return err;
+
+ /*
+ * If accounted, determine the number of bytes the operation will
+ * charge.
+ */
+ if (!vrm_calc_charge(vrm))
+ return -ENOMEM;
+
+ /* We don't want racing faults. */
+ vma_start_write(vrm->vma);
+
+ /* Perform copy step. */
+ err = copy_vma_and_data(vrm, &new_vma);
+ /*
+ * If we established the copied-to VMA, we attempt to recover from the
+ * error by setting the destination VMA to the source VMA and unmapping
+ * it below.
+ */
+ if (err && !new_vma)
+ return err;
/*
* If we failed to move page tables we still do total_vm increment
@@ -805,74 +1304,380 @@ static unsigned long move_vma(struct vm_area_struct *vma,
* If this were a serious issue, we'd add a flag to do_munmap().
*/
hiwater_vm = mm->hiwater_vm;
- vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
- /* Tell pfnmap has moved from this vma */
- if (unlikely(vma->vm_flags & VM_PFNMAP))
- untrack_pfn_clear(vma);
+ vrm_stat_account(vrm, vrm->new_len);
+ if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP)))
+ dontunmap_complete(vrm, new_vma);
+ else
+ unmap_source_vma(vrm);
+
+ mm->hiwater_vm = hiwater_vm;
+
+ return err ? (unsigned long)err : vrm->new_addr;
+}
+
+/*
+ * The user has requested that the VMA be shrunk (i.e., old_len > new_len), so
+ * execute this, optionally dropping the mmap lock when we do so.
+ *
+ * In both cases this invalidates the VMA; however, if we don't drop the lock,
+ * we then load the correct VMA into vrm->vma afterwards.
+ */
+static unsigned long shrink_vma(struct vma_remap_struct *vrm,
+ bool drop_lock)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long unmap_start = vrm->addr + vrm->new_len;
+ unsigned long unmap_bytes = vrm->delta;
+ unsigned long res;
+ VMA_ITERATOR(vmi, mm, unmap_start);
+
+ VM_BUG_ON(vrm->remap_type != MREMAP_SHRINK);
+
+ res = do_vmi_munmap(&vmi, mm, unmap_start, unmap_bytes,
+ vrm->uf_unmap, drop_lock);
+ vrm->vma = NULL; /* Invalidated. */
+ if (res)
+ return res;
- if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
- /* We always clear VM_LOCKED[ONFAULT] on the old vma */
- vm_flags_clear(vma, VM_LOCKED_MASK);
+ /*
+ * If we've not dropped the lock, then we should reload the VMA to
+ * replace the invalidated VMA with the one that may have now been
+ * split.
+ */
+ if (drop_lock) {
+ vrm->mmap_locked = false;
+ } else {
+ vrm->vma = vma_lookup(mm, vrm->addr);
+ if (!vrm->vma)
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/*
+ * mremap_to() - remap a vma to a new location.
+ * Returns: The new address of the vma or an error.
+ */
+static unsigned long mremap_to(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long err;
+
+ if (vrm->flags & MREMAP_FIXED) {
/*
- * anon_vma links of the old vma is no longer needed after its page
- * table has been moved.
+ * In mremap_to(), the VMA is moved to the destination address, so the
+ * destination must be unmapped first; do_munmap() will check whether
+ * the destination is sealed.
*/
- if (new_vma != vma && vma->vm_start == old_addr &&
- vma->vm_end == (old_addr + old_len))
- unlink_anon_vmas(vma);
+ err = do_munmap(mm, vrm->new_addr, vrm->new_len,
+ vrm->uf_unmap_early);
+ vrm->vma = NULL; /* Invalidated. */
+ vrm->vmi_needs_invalidate = true;
+ if (err)
+ return err;
- /* Because we won't unmap we don't need to touch locked_vm */
- return new_addr;
+ /*
+ * If we remap a portion of a VMA elsewhere in the same VMA,
+ * this can invalidate the old VMA. Reset.
+ */
+ vrm->vma = vma_lookup(mm, vrm->addr);
+ if (!vrm->vma)
+ return -EFAULT;
}
- vma_iter_init(&vmi, mm, old_addr);
- if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
- /* OOM: unable to split vma, just get accounts right */
- if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
- vm_acct_memory(old_len >> PAGE_SHIFT);
- account_start = account_end = 0;
- }
+ if (vrm->remap_type == MREMAP_SHRINK) {
+ err = shrink_vma(vrm, /* drop_lock= */false);
+ if (err)
+ return err;
- if (vm_flags & VM_LOCKED) {
- mm->locked_vm += new_len >> PAGE_SHIFT;
- *locked = true;
+ /* Set up for the move now shrink has been executed. */
+ vrm->old_len = vrm->new_len;
}
- mm->hiwater_vm = hiwater_vm;
+ /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
+ if (vrm->flags & MREMAP_DONTUNMAP) {
+ vm_flags_t vm_flags = vrm->vma->vm_flags;
+ unsigned long pages = vrm->old_len >> PAGE_SHIFT;
- /* Restore VM_ACCOUNT if one or two pieces of vma left */
- if (account_start) {
- vma = vma_prev(&vmi);
- vm_flags_set(vma, VM_ACCOUNT);
+ if (!may_expand_vm(mm, vm_flags, pages))
+ return -ENOMEM;
}
- if (account_end) {
- vma = vma_next(&vmi);
- vm_flags_set(vma, VM_ACCOUNT);
- }
+ err = vrm_set_new_addr(vrm);
+ if (err)
+ return err;
+
+ return move_vma(vrm);
+}
- return new_addr;
+static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
+{
+ unsigned long end = vma->vm_end + delta;
+
+ if (end < vma->vm_end) /* overflow */
+ return 0;
+ if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
+ return 0;
+ if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
+ 0, MAP_FIXED) & ~PAGE_MASK)
+ return 0;
+ return 1;
+}
+
+/* Determine whether we are actually able to execute an in-place expansion. */
+static bool vrm_can_expand_in_place(struct vma_remap_struct *vrm)
+{
+ /* Number of bytes from vrm->addr to end of VMA. */
+ unsigned long suffix_bytes = vrm->vma->vm_end - vrm->addr;
+
+ /* If end of range aligns to end of VMA, we can just expand in-place. */
+ if (suffix_bytes != vrm->old_len)
+ return false;
+
+ /* Check whether this is feasible. */
+ if (!vma_expandable(vrm->vma, vrm->delta))
+ return false;
+
+ return true;
}
/*
- * resize_is_valid() - Ensure the vma can be resized to the new length at the give
- * address.
+ * We know we can expand the VMA in-place by delta pages, so do so.
*
- * @vma: The vma to resize
- * @addr: The old address
- * @old_len: The current size
- * @new_len: The desired size
- * @flags: The vma flags
+ * If we discover the VMA is locked, update mm_struct statistics accordingly and
+ * indicate so to the caller.
+ */
+static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = vrm->vma;
+ VMA_ITERATOR(vmi, mm, vma->vm_end);
+
+ if (!vrm_calc_charge(vrm))
+ return -ENOMEM;
+
+ /*
+ * vma_merge_extend() is called on the extension we are adding to the
+ * already existing vma; it will merge this extension with the existing
+ * vma (the expand operation itself) and possibly also with the next vma,
+ * if that becomes adjacent to the expanded vma and is otherwise
+ * compatible.
+ */
+ vma = vma_merge_extend(&vmi, vma, vrm->delta);
+ if (!vma) {
+ vrm_uncharge(vrm);
+ return -ENOMEM;
+ }
+ vrm->vma = vma;
+
+ vrm_stat_account(vrm, vrm->delta);
+
+ return 0;
+}
+
+static bool align_hugetlb(struct vma_remap_struct *vrm)
+{
+ struct hstate *h __maybe_unused = hstate_vma(vrm->vma);
+
+ vrm->old_len = ALIGN(vrm->old_len, huge_page_size(h));
+ vrm->new_len = ALIGN(vrm->new_len, huge_page_size(h));
+
+ /* addrs must be huge page aligned */
+ if (vrm->addr & ~huge_page_mask(h))
+ return false;
+ if (vrm->new_addr & ~huge_page_mask(h))
+ return false;
+
+ /*
+ * Don't allow remap expansion, because the underlying hugetlb
+ * reservation is not yet capable of handling split reservations.
+ */
+ if (vrm->new_len > vrm->old_len)
+ return false;
+
+ return true;
+}
+
+/*
+ * We are mremap()'ing without specifying a fixed address to move to, but are
+ * requesting that the VMA's size be increased.
*
- * Return 0 on success, error otherwise.
+ * Try to do so in-place; if this fails, then move the VMA to a new location to
+ * action the change.
+ */
+static unsigned long expand_vma(struct vma_remap_struct *vrm)
+{
+ unsigned long err;
+
+ /*
+ * [addr, old_len) spans precisely to the end of the VMA, so try to
+ * expand it in-place.
+ */
+ if (vrm_can_expand_in_place(vrm)) {
+ err = expand_vma_in_place(vrm);
+ if (err)
+ return err;
+
+ /* OK we're done! */
+ return vrm->addr;
+ }
+
+ /*
+ * We weren't able to just expand or shrink the area,
+ * we need to create a new one and move it.
+ */
+
+ /* We're not allowed to move the VMA, so error out. */
+ if (!(vrm->flags & MREMAP_MAYMOVE))
+ return -ENOMEM;
+
+ /* Find a new location to move the VMA to. */
+ err = vrm_set_new_addr(vrm);
+ if (err)
+ return err;
+
+ return move_vma(vrm);
+}
+
+/*
+ * Attempt to resize the VMA in-place, if we cannot, then move the VMA to the
+ * first available address to perform the operation.
+ */
+static unsigned long mremap_at(struct vma_remap_struct *vrm)
+{
+ unsigned long res;
+
+ switch (vrm->remap_type) {
+ case MREMAP_INVALID:
+ break;
+ case MREMAP_NO_RESIZE:
+ /* NO-OP CASE - resizing to the same size. */
+ return vrm->addr;
+ case MREMAP_SHRINK:
+ /*
+ * SHRINK CASE. Can always be done in-place.
+ *
+ * Simply unmap the shrunken portion of the VMA. This does all
+ * the needed commit accounting, and we indicate that the mmap
+ * lock should be dropped.
+ */
+ res = shrink_vma(vrm, /* drop_lock= */true);
+ if (res)
+ return res;
+
+ return vrm->addr;
+ case MREMAP_EXPAND:
+ return expand_vma(vrm);
+ }
+
+ /* Should not be possible. */
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+}
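A minimal userspace illustration of the MREMAP_SHRINK case dispatched above (the sizes are assumptions): shrinking needs no flags and simply unmaps the tail of the mapping in place.

/* Illustrative in-place shrink via mremap() (the MREMAP_SHRINK case above). */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t page = (size_t)sysconf(_SC_PAGESIZE);
	size_t old_len = 8 * page, new_len = 4 * page;
	void *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *q;

	if (p == MAP_FAILED)
		return 1;
	/* No flags needed: shrinking just unmaps the tail of the mapping. */
	q = mremap(p, old_len, new_len, 0);
	printf("shrunk in place: %s\n", q == p ? "yes" : "no");
	munmap(q, new_len);
	return 0;
}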
+
+/*
+ * Will this operation result in the VMA being expanded or moved and thus need
+ * to map a new portion of virtual address space?
*/
-static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr,
- unsigned long old_len, unsigned long new_len, unsigned long flags)
+static bool vrm_will_map_new(struct vma_remap_struct *vrm)
+{
+ if (vrm->remap_type == MREMAP_EXPAND)
+ return true;
+
+ if (vrm_implies_new_addr(vrm))
+ return true;
+
+ return false;
+}
+
+/* Does this remap ONLY move mappings? */
+static bool vrm_move_only(struct vma_remap_struct *vrm)
+{
+ if (!(vrm->flags & MREMAP_FIXED))
+ return false;
+
+ if (vrm->old_len != vrm->new_len)
+ return false;
+
+ return true;
+}
+
+static void notify_uffd(struct vma_remap_struct *vrm, bool failed)
{
struct mm_struct *mm = current->mm;
- unsigned long pgoff;
+
+ /* Regardless of success/failure, we always notify of any unmaps. */
+ userfaultfd_unmap_complete(mm, vrm->uf_unmap_early);
+ if (failed)
+ mremap_userfaultfd_fail(vrm->uf);
+ else
+ mremap_userfaultfd_complete(vrm->uf, vrm->addr,
+ vrm->new_addr, vrm->old_len);
+ userfaultfd_unmap_complete(mm, vrm->uf_unmap);
+}
+
+static bool vma_multi_allowed(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+
+ /*
+ * We can't support moving multiple uffd VMAs as notify requires
+ * mmap lock to be dropped.
+ */
+ if (userfaultfd_armed(vma))
+ return false;
+
+ /*
+ * Custom get unmapped area might result in MREMAP_FIXED not
+ * being obeyed.
+ */
+ if (!file || !file->f_op->get_unmapped_area)
+ return true;
+ /* Known good. */
+ if (vma_is_shmem(vma))
+ return true;
+ if (is_vm_hugetlb_page(vma))
+ return true;
+ if (file->f_op->get_unmapped_area == thp_get_unmapped_area)
+ return true;
+
+ return false;
+}
+
+static int check_prep_vma(struct vma_remap_struct *vrm)
+{
+ struct vm_area_struct *vma = vrm->vma;
+ struct mm_struct *mm = current->mm;
+ unsigned long addr = vrm->addr;
+ unsigned long old_len, new_len, pgoff;
+
+ if (!vma)
+ return -EFAULT;
+
+ /* If mseal()'d, mremap() is prohibited. */
+ if (vma_is_sealed(vma))
+ return -EPERM;
+
+ /* Align to hugetlb page size, if required. */
+ if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm))
+ return -EINVAL;
+
+ vrm_set_delta(vrm);
+ vrm->remap_type = vrm_remap_type(vrm);
+ /* For convenience, we set new_addr even if VMA won't move. */
+ if (!vrm_implies_new_addr(vrm))
+ vrm->new_addr = addr;
+
+ /* Below only meaningful if we expand or move a VMA. */
+ if (!vrm_will_map_new(vrm))
+ return 0;
+
+ old_len = vrm->old_len;
+ new_len = vrm->new_len;
/*
* !old_len is a special case where an attempt is made to 'duplicate'
@@ -883,21 +1688,45 @@ static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr,
* behavior. As a result, fail such attempts.
*/
if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
- pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
+ pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n",
+ current->comm, current->pid);
return -EINVAL;
}
- if ((flags & MREMAP_DONTUNMAP) &&
+ if ((vrm->flags & MREMAP_DONTUNMAP) &&
(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
return -EINVAL;
- /* We can't remap across vm area boundaries */
+ /*
+ * We permit crossing of boundaries for the range being unmapped due to
+ * a shrink.
+ */
+ if (vrm->remap_type == MREMAP_SHRINK)
+ old_len = new_len;
+
+ /*
+ * We can't remap across the end of VMAs, as another VMA may be
+ * adjacent:
+ *
+ * addr vma->vm_end
+ * |-----.----------|
+ * | . |
+ * |-----.----------|
+ * .<--------->xxx>
+ * old_len
+ *
+ * We also require that vma->vm_start <= addr < vma->vm_end.
+ */
if (old_len > vma->vm_end - addr)
return -EFAULT;
if (new_len == old_len)
return 0;
+ /* We are expanding and the VMA is mlock()'d so we need to populate. */
+ if (vma->vm_flags & VM_LOCKED)
+ vrm->populate_expand = true;
+
/* Need to be careful about a growing mapping */
pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
pgoff += vma->vm_pgoff;
@@ -907,49 +1736,67 @@ static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr,
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
return -EFAULT;
- if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len))
+ if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta))
return -EAGAIN;
- if (!may_expand_vm(mm, vma->vm_flags,
- (new_len - old_len) >> PAGE_SHIFT))
+ if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT))
return -ENOMEM;
return 0;
}
/*
- * mremap_to() - remap a vma to a new location
- * @addr: The old address
- * @old_len: The old size
- * @new_addr: The target address
- * @new_len: The new size
- * @locked: If the returned vma is locked (VM_LOCKED)
- * @flags: the mremap flags
- * @uf: The mremap userfaultfd context
- * @uf_unmap_early: The userfaultfd unmap early context
- * @uf_unmap: The userfaultfd unmap context
- *
- * Returns: The new address of the vma or an error.
+ * Are the parameters passed to mremap() valid? If so return 0, otherwise return
+ * error.
*/
-static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
- unsigned long new_addr, unsigned long new_len, bool *locked,
- unsigned long flags, struct vm_userfaultfd_ctx *uf,
- struct list_head *uf_unmap_early,
- struct list_head *uf_unmap)
+static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long ret;
- unsigned long map_flags = 0;
+ unsigned long addr = vrm->addr;
+ unsigned long flags = vrm->flags;
- if (offset_in_page(new_addr))
+ /* Ensure no unexpected flag values. */
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
+ return -EINVAL;
+
+ /* Start address must be page-aligned. */
+ if (offset_in_page(addr))
+ return -EINVAL;
+
+ /*
+ * We allow a zero old-len as a special case
+ * for DOS-emu "duplicate shm area" thing. But
+ * a zero new-len is nonsensical.
+ */
+ if (!vrm->new_len)
+ return -EINVAL;
+
+ /* Is the new length silly? */
+ if (vrm->new_len > TASK_SIZE)
+ return -EINVAL;
+
+ /* Remainder of checks are for cases with specific new_addr. */
+ if (!vrm_implies_new_addr(vrm))
+ return 0;
+
+ /* Is the new address silly? */
+ if (vrm->new_addr > TASK_SIZE - vrm->new_len)
+ return -EINVAL;
+
+ /* The new address must be page-aligned. */
+ if (offset_in_page(vrm->new_addr))
return -EINVAL;
- if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
+ /* A fixed address implies a move. */
+ if (!(flags & MREMAP_MAYMOVE))
return -EINVAL;
- /* Ensure the old/new locations do not overlap */
- if (addr + old_len > new_addr && new_addr + new_len > addr)
+ /* MREMAP_DONTUNMAP does not allow resizing in the process. */
+ if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len)
+ return -EINVAL;
+
+ /* Target VMA must not overlap source VMA. */
+ if (vrm_overlaps(vrm))
return -EINVAL;
/*
@@ -966,73 +1813,142 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
* Check whether current map count plus 2 still leads us to 4 maps below
* the threshold, otherwise return -ENOMEM here to be more safe.
*/
- if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
+ if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3)
return -ENOMEM;
- if (flags & MREMAP_FIXED) {
+ return 0;
+}
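The flag rules enforced above are visible from userspace. A minimal illustrative sketch (not part of this patch; MREMAP_DONTUNMAP requires Linux 5.7+ headers, and both calls below are expected to fail with EINVAL):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	size_t len = 4 * 4096;
	char *old = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (old == MAP_FAILED)
		return 1;

	/* MREMAP_FIXED without MREMAP_MAYMOVE: a fixed address implies a move. */
	void *p = mremap(old, len, len, MREMAP_FIXED, old + 0x100000);
	printf("FIXED without MAYMOVE: %s\n",
	       p == MAP_FAILED ? strerror(errno) : "unexpectedly succeeded");

	/* MREMAP_DONTUNMAP with a resize: old_len must equal new_len. */
	p = mremap(old, len, 2 * len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, NULL);
	printf("DONTUNMAP with resize: %s\n",
	       p == MAP_FAILED ? strerror(errno) : "unexpectedly succeeded");
	return 0;
}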
+
+static unsigned long remap_move(struct vma_remap_struct *vrm)
+{
+ struct vm_area_struct *vma;
+ unsigned long start = vrm->addr;
+ unsigned long end = vrm->addr + vrm->old_len;
+ unsigned long new_addr = vrm->new_addr;
+ unsigned long target_addr = new_addr;
+ unsigned long res = -EFAULT;
+ unsigned long last_end;
+ bool seen_vma = false;
+
+ VMA_ITERATOR(vmi, current->mm, start);
+
+ /*
+ * When moving VMAs we allow for batched moves across multiple VMAs,
+ * with all VMAs in the input range [addr, addr + old_len) being moved
+ * (and split as necessary).
+ */
+ for_each_vma_range(vmi, vma, end) {
+ /* Account for start, end not aligned with VMA start, end. */
+ unsigned long addr = max(vma->vm_start, start);
+ unsigned long len = min(end, vma->vm_end) - addr;
+ unsigned long offset, res_vma;
+ bool multi_allowed;
+
+ /* No gap permitted at the start of the range. */
+ if (!seen_vma && start < vma->vm_start)
+ return -EFAULT;
+
/*
- * In mremap_to().
- * VMA is moved to dst address, and munmap dst first.
- * do_munmap will check if dst is sealed.
+ * To sensibly move multiple VMAs, accounting for the fact that
+ * get_unmapped_area() may align even MAP_FIXED moves, we simply
+ * attempt to move such that the gaps between source VMAs remain
+ * consistent in destination VMAs, e.g.:
+ *
+ * X Y X Y
+ * <---> <-> <---> <->
+ * |-------| |-----| |-----| |-------| |-----| |-----|
+ * | A | | B | | C | ---> | A' | | B' | | C' |
+ * |-------| |-----| |-----| |-------| |-----| |-----|
+ * new_addr
+ *
+ * So we map B' at A'->vm_end + X, and C' at B'->vm_end + Y.
*/
- ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
- if (ret)
- return ret;
- }
+ offset = seen_vma ? vma->vm_start - last_end : 0;
+ last_end = vma->vm_end;
+
+ vrm->vma = vma;
+ vrm->addr = addr;
+ vrm->new_addr = target_addr + offset;
+ vrm->old_len = vrm->new_len = len;
+
+ multi_allowed = vma_multi_allowed(vma);
+ if (!multi_allowed) {
+ /* This is not the first VMA, abort immediately. */
+ if (seen_vma)
+ return -EFAULT;
+ /* This is the first VMA, but more follow, abort. */
+ if (vma->vm_end < end)
+ return -EFAULT;
+ }
- if (old_len > new_len) {
- ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
- if (ret)
- return ret;
- old_len = new_len;
- }
+ res_vma = check_prep_vma(vrm);
+ if (!res_vma)
+ res_vma = mremap_to(vrm);
+ if (IS_ERR_VALUE(res_vma))
+ return res_vma;
- vma = vma_lookup(mm, addr);
- if (!vma)
- return -EFAULT;
+ if (!seen_vma) {
+ VM_WARN_ON_ONCE(multi_allowed && res_vma != new_addr);
+ res = res_vma;
+ }
- ret = resize_is_valid(vma, addr, old_len, new_len, flags);
- if (ret)
- return ret;
+ /* mmap lock is only dropped on shrink. */
+ VM_WARN_ON_ONCE(!vrm->mmap_locked);
+ /* This is a move, no expand should occur. */
+ VM_WARN_ON_ONCE(vrm->populate_expand);
- /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
- if (flags & MREMAP_DONTUNMAP &&
- !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
- return -ENOMEM;
+ if (vrm->vmi_needs_invalidate) {
+ vma_iter_invalidate(&vmi);
+ vrm->vmi_needs_invalidate = false;
+ }
+ seen_vma = true;
+ target_addr = res_vma + vrm->new_len;
}
- if (flags & MREMAP_FIXED)
- map_flags |= MAP_FIXED;
+ return res;
+}
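The batching above can be exercised from userspace. An illustrative sketch (not part of this patch; it relies on the multi-VMA move support added here, so older kernels fail the final call with EFAULT):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	/* Three adjacent pages; changing the middle page's protection splits
	 * the mapping into three VMAs (RW | NONE | RW). */
	char *src = mmap(NULL, 3 * pg, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	/* Reserve a destination window; MREMAP_FIXED unmaps it first. */
	char *dst = mmap(NULL, 3 * pg, PROT_NONE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (src == MAP_FAILED || dst == MAP_FAILED)
		return 1;
	mprotect(src + pg, pg, PROT_NONE);

	/* One call moves all three VMAs; their relative layout (including any
	 * interior gaps, per the diagram above) is preserved at dst. */
	void *res = mremap(src, 3 * pg, 3 * pg,
			   MREMAP_MAYMOVE | MREMAP_FIXED, dst);
	printf("multi-VMA move: %s\n", res == MAP_FAILED ? "failed" : "ok");
	return 0;
}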
- if (vma->vm_flags & VM_MAYSHARE)
- map_flags |= MAP_SHARED;
+static unsigned long do_mremap(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long res;
+ bool failed;
- ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
- ((addr - vma->vm_start) >> PAGE_SHIFT),
- map_flags);
- if (IS_ERR_VALUE(ret))
- return ret;
+ vrm->old_len = PAGE_ALIGN(vrm->old_len);
+ vrm->new_len = PAGE_ALIGN(vrm->new_len);
- /* We got a new mapping */
- if (!(flags & MREMAP_FIXED))
- new_addr = ret;
+ res = check_mremap_params(vrm);
+ if (res)
+ return res;
- return move_vma(vma, addr, old_len, new_len, new_addr, locked, flags,
- uf, uf_unmap);
-}
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ vrm->mmap_locked = true;
-static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
-{
- unsigned long end = vma->vm_end + delta;
+ if (vrm_move_only(vrm)) {
+ res = remap_move(vrm);
+ } else {
+ vrm->vma = vma_lookup(current->mm, vrm->addr);
+ res = check_prep_vma(vrm);
+ if (res)
+ goto out;
- if (end < vma->vm_end) /* overflow */
- return 0;
- if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
- return 0;
- if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
- 0, MAP_FIXED) & ~PAGE_MASK)
- return 0;
- return 1;
+ /* Actually execute mremap. */
+ res = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm);
+ }
+
+out:
+ failed = IS_ERR_VALUE(res);
+
+ if (vrm->mmap_locked)
+ mmap_write_unlock(mm);
+
+ /* VMA mlock'd + was expanded, so populate the expanded region. */
+ if (!failed && vrm->populate_expand)
+ mm_populate(vrm->new_addr + vrm->old_len, vrm->delta);
+
+ notify_uffd(vrm, failed);
+ return res;
}
/*
@@ -1046,14 +1962,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
unsigned long, new_len, unsigned long, flags,
unsigned long, new_addr)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long ret = -EINVAL;
- bool locked = false;
struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
LIST_HEAD(uf_unmap_early);
LIST_HEAD(uf_unmap);
-
/*
* There is a deliberate asymmetry here: we strip the pointer tag
* from the old address but leave the new address alone. This is
@@ -1065,185 +1976,19 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
* See Documentation/arch/arm64/tagged-address-abi.rst for more
* information.
*/
- addr = untagged_addr(addr);
-
- if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
- return ret;
-
- if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
- return ret;
-
- /*
- * MREMAP_DONTUNMAP is always a move and it does not allow resizing
- * in the process.
- */
- if (flags & MREMAP_DONTUNMAP &&
- (!(flags & MREMAP_MAYMOVE) || old_len != new_len))
- return ret;
-
-
- if (offset_in_page(addr))
- return ret;
-
- old_len = PAGE_ALIGN(old_len);
- new_len = PAGE_ALIGN(new_len);
-
- /*
- * We allow a zero old-len as a special case
- * for DOS-emu "duplicate shm area" thing. But
- * a zero new-len is nonsensical.
- */
- if (!new_len)
- return ret;
-
- if (mmap_write_lock_killable(current->mm))
- return -EINTR;
- vma = vma_lookup(mm, addr);
- if (!vma) {
- ret = -EFAULT;
- goto out;
- }
-
- /* Don't allow remapping vmas when they have already been sealed */
- if (!can_modify_vma(vma)) {
- ret = -EPERM;
- goto out;
- }
-
- if (is_vm_hugetlb_page(vma)) {
- struct hstate *h __maybe_unused = hstate_vma(vma);
-
- old_len = ALIGN(old_len, huge_page_size(h));
- new_len = ALIGN(new_len, huge_page_size(h));
-
- /* addrs must be huge page aligned */
- if (addr & ~huge_page_mask(h))
- goto out;
- if (new_addr & ~huge_page_mask(h))
- goto out;
-
- /*
- * Don't allow remap expansion, because the underlying hugetlb
- * reservation is not yet capable to handle split reservation.
- */
- if (new_len > old_len)
- goto out;
- }
-
- if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
- ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked, flags, &uf, &uf_unmap_early,
- &uf_unmap);
- goto out;
- }
-
- /*
- * Always allow a shrinking remap: that just unmaps
- * the unnecessary pages..
- * do_vmi_munmap does all the needed commit accounting, and
- * unlocks the mmap_lock if so directed.
- */
- if (old_len >= new_len) {
- VMA_ITERATOR(vmi, mm, addr + new_len);
+ struct vma_remap_struct vrm = {
+ .addr = untagged_addr(addr),
+ .old_len = old_len,
+ .new_len = new_len,
+ .flags = flags,
+ .new_addr = new_addr,
- if (old_len == new_len) {
- ret = addr;
- goto out;
- }
+ .uf = &uf,
+ .uf_unmap_early = &uf_unmap_early,
+ .uf_unmap = &uf_unmap,
- ret = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len,
- &uf_unmap, true);
- if (ret)
- goto out;
-
- ret = addr;
- goto out_unlocked;
- }
+ .remap_type = MREMAP_INVALID, /* We set later. */
+ };
- /*
- * Ok, we need to grow..
- */
- ret = resize_is_valid(vma, addr, old_len, new_len, flags);
- if (ret)
- goto out;
-
- /* old_len exactly to the end of the area..
- */
- if (old_len == vma->vm_end - addr) {
- unsigned long delta = new_len - old_len;
-
- /* can we just expand the current mapping? */
- if (vma_expandable(vma, delta)) {
- long pages = delta >> PAGE_SHIFT;
- VMA_ITERATOR(vmi, mm, vma->vm_end);
- long charged = 0;
-
- if (vma->vm_flags & VM_ACCOUNT) {
- if (security_vm_enough_memory_mm(mm, pages)) {
- ret = -ENOMEM;
- goto out;
- }
- charged = pages;
- }
-
- /*
- * Function vma_merge_extend() is called on the
- * extension we are adding to the already existing vma,
- * vma_merge_extend() will merge this extension with the
- * already existing vma (expand operation itself) and
- * possibly also with the next vma if it becomes
- * adjacent to the expanded vma and otherwise
- * compatible.
- */
- vma = vma_merge_extend(&vmi, vma, delta);
- if (!vma) {
- vm_unacct_memory(charged);
- ret = -ENOMEM;
- goto out;
- }
-
- vm_stat_account(mm, vma->vm_flags, pages);
- if (vma->vm_flags & VM_LOCKED) {
- mm->locked_vm += pages;
- locked = true;
- new_addr = addr;
- }
- ret = addr;
- goto out;
- }
- }
-
- /*
- * We weren't able to just expand or shrink the area,
- * we need to create a new one and move it..
- */
- ret = -ENOMEM;
- if (flags & MREMAP_MAYMOVE) {
- unsigned long map_flags = 0;
- if (vma->vm_flags & VM_MAYSHARE)
- map_flags |= MAP_SHARED;
-
- new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
- vma->vm_pgoff +
- ((addr - vma->vm_start) >> PAGE_SHIFT),
- map_flags);
- if (IS_ERR_VALUE(new_addr)) {
- ret = new_addr;
- goto out;
- }
-
- ret = move_vma(vma, addr, old_len, new_len, new_addr,
- &locked, flags, &uf, &uf_unmap);
- }
-out:
- if (offset_in_page(ret))
- locked = false;
- mmap_write_unlock(current->mm);
- if (locked && new_len > old_len)
- mm_populate(new_addr + old_len, new_len - old_len);
-out_unlocked:
- userfaultfd_unmap_complete(mm, &uf_unmap_early);
- mremap_userfaultfd_complete(&uf, addr, ret, old_len);
- userfaultfd_unmap_complete(mm, &uf_unmap);
- return ret;
+ return do_mremap(&vrm);
}
diff --git a/mm/mseal.c b/mm/mseal.c
index 81d6e980e8a9..e5b205562d2e 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -11,148 +11,74 @@
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
-#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"
-static inline void set_vma_sealed(struct vm_area_struct *vma)
-{
- vm_flags_set(vma, VM_SEALED);
-}
-
-static bool is_madv_discard(int behavior)
-{
- switch (behavior) {
- case MADV_FREE:
- case MADV_DONTNEED:
- case MADV_DONTNEED_LOCKED:
- case MADV_REMOVE:
- case MADV_DONTFORK:
- case MADV_WIPEONFORK:
- case MADV_GUARD_INSTALL:
- return true;
- }
-
- return false;
-}
-
-static bool is_ro_anon(struct vm_area_struct *vma)
-{
- /* check anonymous mapping. */
- if (vma->vm_file || vma->vm_flags & VM_SHARED)
- return false;
-
- /*
- * check for non-writable:
- * PROT=RO or PKRU is not writeable.
- */
- if (!(vma->vm_flags & VM_WRITE) ||
- !arch_vma_access_permitted(vma, true, false, false))
- return true;
-
- return false;
-}
-
/*
- * Check if a vma is allowed to be modified by madvise.
+ * mseal() disallows an input range which contains unmapped ranges (VMA holes).
+ *
+ * It disallows unmapped regions from start to end whether they exist at the
+ * start, in the middle, or at the end of the range, or any combination thereof.
+ *
+ * This is because after sealing a range, there's nothing to stop memory mapping
+ * of ranges in the remaining gaps later, meaning that the user might then
+ * wrongly consider the entirety of the mseal()'d range to be sealed when it
+ * in fact isn't.
*/
-bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
-{
- if (!is_madv_discard(behavior))
- return true;
-
- if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
- return false;
-
- /* Allow by default. */
- return true;
-}
-
-static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
- struct vm_area_struct **prev, unsigned long start,
- unsigned long end, vm_flags_t newflags)
-{
- int ret = 0;
- vm_flags_t oldflags = vma->vm_flags;
-
- if (newflags == oldflags)
- goto out;
-
- vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
- goto out;
- }
-
- set_vma_sealed(vma);
-out:
- *prev = vma;
- return ret;
-}
/*
- * Check for do_mseal:
- * 1> start is part of a valid vma.
- * 2> end is part of a valid vma.
- * 3> No gap (unallocated address) between start and end.
- * 4> map is sealable.
+ * Does the [start, end) range contain any unmapped memory?
+ *
+ * We ensure that:
+ * - start is part of a valid VMA.
+ * - end is part of a valid VMA.
+ * - no gap (unallocated memory) exists between start and end.
*/
-static int check_mm_seal(unsigned long start, unsigned long end)
+static bool range_contains_unmapped(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
struct vm_area_struct *vma;
- unsigned long nstart = start;
-
+ unsigned long prev_end = start;
VMA_ITERATOR(vmi, current->mm, start);
- /* going through each vma to check. */
for_each_vma_range(vmi, vma, end) {
- if (vma->vm_start > nstart)
- /* unallocated memory found. */
- return -ENOMEM;
-
- if (vma->vm_end >= end)
- return 0;
+ if (vma->vm_start > prev_end)
+ return true;
- nstart = vma->vm_end;
+ prev_end = vma->vm_end;
}
- return -ENOMEM;
+ return prev_end < end;
}
-/*
- * Apply sealing.
- */
-static int apply_mm_seal(unsigned long start, unsigned long end)
+static int mseal_apply(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
- unsigned long nstart;
struct vm_area_struct *vma, *prev;
+ unsigned long curr_start = start;
+ VMA_ITERATOR(vmi, mm, start);
- VMA_ITERATOR(vmi, current->mm, start);
-
+ /* We know there are no gaps so this will be non-NULL. */
vma = vma_iter_load(&vmi);
- /*
- * Note: check_mm_seal should already checked ENOMEM case.
- * so vma should not be null, same for the other ENOMEM cases.
- */
prev = vma_prev(&vmi);
if (start > vma->vm_start)
prev = vma;
- nstart = start;
for_each_vma_range(vmi, vma, end) {
- int error;
- unsigned long tmp;
- vm_flags_t newflags;
+ unsigned long curr_end = MIN(vma->vm_end, end);
- newflags = vma->vm_flags | VM_SEALED;
- tmp = vma->vm_end;
- if (tmp > end)
- tmp = end;
- error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
- if (error)
- return error;
- nstart = vma_iter_end(&vmi);
+ if (!(vma->vm_flags & VM_SEALED)) {
+ vma = vma_modify_flags(&vmi, prev, vma,
+ curr_start, curr_end,
+ vma->vm_flags | VM_SEALED);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
+ vm_flags_set(vma, VM_SEALED);
+ }
+
+ prev = vma;
+ curr_start = curr_end;
}
return 0;
@@ -217,9 +143,9 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
unsigned long end;
struct mm_struct *mm = current->mm;
- ret = can_do_mseal(flags);
- if (ret)
- return ret;
+ /* Verify flags not set. */
+ if (flags)
+ return -EINVAL;
start = untagged_addr(start);
if (!PAGE_ALIGNED(start))
@@ -240,14 +166,10 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
if (mmap_write_lock_killable(mm))
return -EINTR;
- /*
- * First pass, this helps to avoid
- * partial sealing in case of error in input address range,
- * e.g. ENOMEM error.
- */
- ret = check_mm_seal(start, end);
- if (ret)
+ if (range_contains_unmapped(mm, start, end)) {
+ ret = -ENOMEM;
goto out;
+ }
/*
* Second pass, this should success, unless there are errors
@@ -255,10 +177,10 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
* reaching the max supported VMAs, however, those cases shall
* be rare.
*/
- ret = apply_mm_seal(start, end);
+ ret = mseal_apply(mm, start, end);
out:
- mmap_write_unlock(current->mm);
+ mmap_write_unlock(mm);
return ret;
}
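The hole check above is easy to observe from userspace. An illustrative sketch (not part of this patch; mseal() has no libc wrapper on many systems, so it is invoked via syscall(), and SYS_mseal needs Linux 6.10+ headers):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 3 * pg, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* Punch a hole: [p, p + 3 * pg) now contains an unmapped page. */
	munmap(p + pg, pg);

	/* Sealing across the hole is rejected with ENOMEM... */
	if (syscall(SYS_mseal, p, 3 * pg, 0) < 0)
		printf("seal across hole: %s\n", strerror(errno));

	/* ...while sealing a fully mapped sub-range succeeds. */
	if (syscall(SYS_mseal, p, pg, 0) == 0)
		printf("sealed the first page\n");
	return 0;
}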
diff --git a/mm/nommu.c b/mm/nommu.c
index 9cb6e99215e2..8b819fafd57b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -42,18 +42,11 @@
#include <asm/mmu_context.h>
#include "internal.h"
-void *high_memory;
-EXPORT_SYMBOL(high_memory);
-struct page *mem_map;
-unsigned long max_mapnr;
-EXPORT_SYMBOL(max_mapnr);
unsigned long highest_memmap_pfn;
-int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated;
-EXPORT_SYMBOL(mem_map);
/* list of mapped, potentially shareable regions */
static struct kmem_cache *vm_region_jar;
@@ -207,7 +200,23 @@ void *vmalloc_noprof(unsigned long size)
}
EXPORT_SYMBOL(vmalloc_noprof);
-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof);
+/*
+ * vmalloc_huge_node - allocate virtually contiguous memory, on a node
+ *
+ * @size: allocation size
+ * @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * Due to NOMMU implications the node argument and HUGE page attribute are
+ * ignored.
+ */
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
+{
+ return __vmalloc_noprof(size, gfp_mask);
+}
/*
* vzalloc - allocate virtually contiguous memory with zero fill
@@ -392,8 +401,22 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
return mm->brk = brk;
}
+static int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
+
+static const struct ctl_table nommu_table[] = {
+ {
+ .procname = "nr_trim_pages",
+ .data = &sysctl_nr_trim_pages,
+ .maxlen = sizeof(sysctl_nr_trim_pages),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+};
+
/*
- * initialise the percpu counter for VM and region record slabs
+ * initialise the percpu counter for VM and region record slabs, initialise VMA
+ * state.
*/
void __init mmap_init(void)
{
@@ -402,6 +425,8 @@ void __init mmap_init(void)
ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
+ register_sysctl_init("vm", nommu_table);
+ vma_state_init();
}
/*
@@ -620,22 +645,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
EXPORT_SYMBOL(find_vma);
/*
- * At least xtensa ends up having protection faults even with no
- * MMU.. No stack expansion, at least.
- */
-struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
- unsigned long addr, struct pt_regs *regs)
-{
- struct vm_area_struct *vma;
-
- mmap_read_lock(mm);
- vma = vma_lookup(mm, addr);
- if (!vma)
- mmap_read_unlock(mm);
- return vma;
-}
-
-/*
* expand a stack to a given address
* - not supported under NOMMU conditions
*/
@@ -710,7 +719,7 @@ static int validate_mmap_request(struct file *file,
if (file) {
/* files must support mmap */
- if (!file->f_op->mmap)
+ if (!can_mmap_file(file))
return -ENODEV;
/* work out if what we've got could possibly be shared
@@ -835,12 +844,12 @@ static int validate_mmap_request(struct file *file,
* we've determined that we can make the mapping, now translate what we
* now know into VMA flags
*/
-static unsigned long determine_vm_flags(struct file *file,
- unsigned long prot,
- unsigned long flags,
- unsigned long capabilities)
+static vm_flags_t determine_vm_flags(struct file *file,
+ unsigned long prot,
+ unsigned long flags,
+ unsigned long capabilities)
{
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags);
@@ -1191,7 +1200,7 @@ share:
setup_vma_to_mm(vma, current->mm);
current->mm->map_count++;
/* add the VMA to the tree */
- vma_iter_store(&vmi, vma);
+ vma_iter_store_new(&vmi, vma);
/* we flush the region from the icache only when the first executable
* mapping of it is made */
@@ -1356,7 +1365,7 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
setup_vma_to_mm(vma, mm);
setup_vma_to_mm(new, mm);
- vma_iter_store(vmi, new);
+ vma_iter_store_new(vmi, new);
mm->map_count++;
return 0;
@@ -1701,6 +1710,85 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
}
EXPORT_SYMBOL_GPL(access_process_vm);
+#ifdef CONFIG_BPF_SYSCALL
+/*
+ * Copy a string from another process's address space as given in mm.
+ * If there is any error return -EFAULT.
+ */
+static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len)
+{
+ unsigned long addr_end;
+ struct vm_area_struct *vma;
+ int ret = -EFAULT;
+
+ *(char *)buf = '\0';
+
+ if (mmap_read_lock_killable(mm))
+ return ret;
+
+ /* the access must start within one of the target process's mappings */
+ vma = find_vma(mm, addr);
+ if (!vma)
+ goto out;
+
+ if (check_add_overflow(addr, len, &addr_end))
+ goto out;
+
+ /* don't overrun this mapping */
+ if (addr_end > vma->vm_end)
+ len = vma->vm_end - addr;
+
+ /* only read mappings where it is permitted */
+ if (vma->vm_flags & VM_MAYREAD) {
+ ret = strscpy(buf, (char *)addr, len);
+ if (ret < 0)
+ ret = len - 1;
+ }
+
+out:
+ mmap_read_unlock(mm);
+ return ret;
+}
+
+/**
+ * copy_remote_vm_str - copy a string from another process's address space.
+ * @tsk: the task of the target address space
+ * @addr: start address to read from
+ * @buf: destination buffer
+ * @len: number of bytes to copy
+ * @gup_flags: flags modifying lookup behaviour (unused)
+ *
+ * The caller must hold a reference on @mm.
+ *
+ * Return: number of bytes copied from @addr (source) to @buf (destination),
+ * not including the trailing NUL. The buffer is always left NUL-terminated.
+ * On any error, return -EFAULT.
+ */
+int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ if (unlikely(len == 0))
+ return 0;
+
+ mm = get_task_mm(tsk);
+ if (!mm) {
+ *(char *)buf = '\0';
+ return -EFAULT;
+ }
+
+ ret = __copy_remote_vm_str(mm, addr, buf, len);
+
+ mmput(mm);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(copy_remote_vm_str);
+#endif /* CONFIG_BPF_SYSCALL */
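An illustrative in-kernel caller (not from this patch; the wrapper name is made up) showing how the exported helper is meant to be used:

/* Hypothetical caller: pull a NUL-terminated string out of another task. */
static int read_remote_string(struct task_struct *tsk, unsigned long uaddr,
			      char *buf, int buflen)
{
	/* Returns the copied length excluding the NUL, or -EFAULT. */
	int n = copy_remote_vm_str(tsk, uaddr, buf, buflen, 0);

	if (n < 0)
		return n;	/* no mm, bad address, or unreadable VMA */
	/* buf is guaranteed to be NUL-terminated, even on truncation. */
	return n;
}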
+
/**
* nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
* @inode: The inode to check
@@ -1804,3 +1892,11 @@ static int __meminit init_admin_reserve(void)
return 0;
}
subsys_initcall(init_admin_reserve);
+
+int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ mmap_write_lock(oldmm);
+ dup_mm_exe_file(mm, oldmm);
+ mmap_write_unlock(oldmm);
+ return 0;
+}
diff --git a/mm/numa.c b/mm/numa.c
index e2eec07707d1..7d5e06fe5bd4 100644
--- a/mm/numa.c
+++ b/mm/numa.c
@@ -13,7 +13,6 @@ void __init alloc_node_data(int nid)
{
const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
u64 nd_pa;
- void *nd;
int tnid;
/* Allocate node data. Try node-local memory and then any node. */
@@ -21,7 +20,6 @@ void __init alloc_node_data(int nid)
if (!nd_pa)
panic("Cannot allocate %zu bytes for node %d data\n",
nd_size, nid);
- nd = __va(nd_pa);
/* report and initialize */
pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
@@ -30,20 +28,14 @@ void __init alloc_node_data(int nid)
if (tnid != nid)
pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid);
- node_data[nid] = nd;
+ node_data[nid] = __va(nd_pa);
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
}
void __init alloc_offline_node_data(int nid)
{
pg_data_t *pgdat;
-
- pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES);
- if (!pgdat)
- panic("Cannot allocate %zuB for node %d.\n",
- sizeof(*pgdat), nid);
-
- node_data[nid] = pgdat;
+ node_data[nid] = memblock_alloc_or_panic(sizeof(*pgdat), SMP_CACHE_BYTES);
}
/* Stub functions: */
diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c
index 031fb9961bf7..703c8fa05048 100644
--- a/mm/numa_emulation.c
+++ b/mm/numa_emulation.c
@@ -8,11 +8,12 @@
#include <linux/memblock.h>
#include <linux/numa_memblks.h>
#include <asm/numa.h>
+#include <acpi/acpi_numa.h>
#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
-static int emu_nid_to_phys[MAX_NUMNODES];
+int emu_nid_to_phys[MAX_NUMNODES];
static char *emu_cmdline __initdata;
int __init numa_emu_cmdline(char *str)
@@ -72,7 +73,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
}
printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
- nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
+ nid, eb->start, eb->end - 1, (eb->end - eb->start) / SZ_1M);
return 0;
}
@@ -263,7 +264,7 @@ static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
if (size < min_size) {
pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
- size >> 20, min_size >> 20);
+ size / SZ_1M, min_size / SZ_1M);
size = min_size;
}
size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);
@@ -379,6 +380,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
int max_emu_nid, dfl_phys_nid;
int i, j, ret;
+ nodemask_t physnode_mask = numa_nodes_parsed;
if (!emu_cmdline)
goto no_emu;
@@ -395,7 +397,6 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
* split the system RAM into N fake nodes.
*/
if (strchr(emu_cmdline, 'U')) {
- nodemask_t physnode_mask = numa_nodes_parsed;
unsigned long n;
int nid = 0;
@@ -465,9 +466,6 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
*/
max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);
- /* commit */
- *numa_meminfo = ei;
-
/* Make sure numa_nodes_parsed only contains emulated nodes */
nodes_clear(numa_nodes_parsed);
for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
@@ -475,10 +473,21 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
ei.blk[i].nid != NUMA_NO_NODE)
node_set(ei.blk[i].nid, numa_nodes_parsed);
- numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys));
+ /* fix pxm_to_node_map[] and node_to_pxm_map[] to avoid collision
+ * with faked numa nodes, particularly during later memory hotplug
+ * handling, and also update numa_nodes_parsed accordingly.
+ */
+ ret = fix_pxm_node_maps(max_emu_nid);
+ if (ret < 0)
+ goto no_emu;
+
+ /* commit */
+ *numa_meminfo = ei;
+
+ numa_emu_update_cpu_to_node(emu_nid_to_phys, max_emu_nid + 1);
/* make sure all emulated nodes are mapped to a physical node */
- for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+ for (i = 0; i < max_emu_nid + 1; i++)
if (emu_nid_to_phys[i] == NUMA_NO_NODE)
emu_nid_to_phys[i] = dfl_phys_nid;
@@ -501,12 +510,34 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
numa_set_distance(i, j, dist);
}
}
+ for (i = 0; i < numa_distance_cnt; i++) {
+ for (j = 0; j < numa_distance_cnt; j++) {
+ int physi, physj;
+ u8 dist;
+
+ /* distance between fake nodes is already ok */
+ if (emu_nid_to_phys[i] != NUMA_NO_NODE &&
+ emu_nid_to_phys[j] != NUMA_NO_NODE)
+ continue;
+ if (emu_nid_to_phys[i] != NUMA_NO_NODE)
+ physi = emu_nid_to_phys[i];
+ else
+ physi = i - max_emu_nid;
+ if (emu_nid_to_phys[j] != NUMA_NO_NODE)
+ physj = emu_nid_to_phys[j];
+ else
+ physj = j - max_emu_nid;
+ dist = phys_dist[physi * numa_dist_cnt + physj];
+ numa_set_distance(i, j, dist);
+ }
+ }
/* free the copied physical distance table */
memblock_free(phys_dist, phys_size);
return;
no_emu:
+ numa_nodes_parsed = physnode_mask;
/* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
emu_nid_to_phys[i] = i;
diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c
index a3877e9bc878..5b009a9cd8b4 100644
--- a/mm/numa_memblks.c
+++ b/mm/numa_memblks.c
@@ -7,7 +7,7 @@
#include <linux/numa.h>
#include <linux/numa_memblks.h>
-static int numa_distance_cnt;
+int numa_distance_cnt;
static u8 *numa_distance;
nodemask_t numa_nodes_parsed __initdata;
@@ -76,7 +76,7 @@ static int __init numa_alloc_distance(void)
for (j = 0; j < cnt; j++)
numa_distance[i * cnt + j] = i == j ?
LOCAL_DISTANCE : REMOTE_DISTANCE;
- printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
+ pr_debug("NUMA: Initialized distance table, cnt=%d\n", cnt);
return 0;
}
@@ -201,6 +201,28 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
}
/**
+ * numa_add_reserved_memblk - Add one numa_memblk to numa_reserved_meminfo
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end: End address of the new memblk
+ *
+ * Add a new memblk to the numa_reserved_meminfo.
+ *
+ * Usage Case: numa_cleanup_meminfo() reconciles all numa_memblk instances
+ * against memblock_type information and moves any that intersect reserved
+ * ranges to numa_reserved_meminfo. However, when that information is known
+ * ahead of time, we use numa_add_reserved_memblk() to add the numa_memblk
+ * to numa_reserved_meminfo directly.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_add_reserved_memblk(int nid, u64 start, u64 end)
+{
+ return numa_add_memblk_to(nid, start, end, &numa_reserved_meminfo);
+}
+
+/**
* numa_cleanup_meminfo - Cleanup a numa_meminfo
* @mi: numa_meminfo to clean up
*
@@ -405,9 +427,9 @@ static int __init numa_register_meminfo(struct numa_meminfo *mi)
unsigned long pfn_align = node_map_pfn_alignment();
if (pfn_align && pfn_align < PAGES_PER_SECTION) {
- unsigned long node_align_mb = PFN_PHYS(pfn_align) >> 20;
+ unsigned long node_align_mb = PFN_PHYS(pfn_align) / SZ_1M;
- unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) >> 20;
+ unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) / SZ_1M;
pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n",
node_align_mb, sect_align_mb);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 044ebab2c941..25923cfec9c6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -563,7 +563,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm)
}
/*
- * Reaps the address space of the give task.
+ * Reaps the address space of the given task.
*
* Returns true on success and false if none or part of the address space
* has been reclaimed and the caller should retry later.
@@ -705,7 +705,7 @@ static void queue_oom_reaper(struct task_struct *tsk)
}
#ifdef CONFIG_SYSCTL
-static struct ctl_table vm_oom_kill_table[] = {
+static const struct ctl_table vm_oom_kill_table[] = {
{
.procname = "panic_on_oom",
.data = &sysctl_panic_on_oom,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d9861e42b2bd..3e248d1c3969 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -41,6 +41,7 @@
#include <trace/events/writeback.h>
#include "internal.h"
+#include "swap.h"
/*
* Sleep at most 200ms at a time in balance_dirty_pages().
@@ -120,29 +121,6 @@ EXPORT_SYMBOL(laptop_mode);
struct wb_domain global_wb_domain;
-/* consolidated parameters for balance_dirty_pages() and its subroutines */
-struct dirty_throttle_control {
-#ifdef CONFIG_CGROUP_WRITEBACK
- struct wb_domain *dom;
- struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */
-#endif
- struct bdi_writeback *wb;
- struct fprop_local_percpu *wb_completions;
-
- unsigned long avail; /* dirtyable */
- unsigned long dirty; /* file_dirty + write + nfs */
- unsigned long thresh; /* dirty threshold */
- unsigned long bg_thresh; /* dirty background threshold */
-
- unsigned long wb_dirty; /* per-wb counterparts */
- unsigned long wb_thresh;
- unsigned long wb_bg_thresh;
-
- unsigned long pos_ratio;
- bool freerun;
- bool dirty_exceeded;
-};
-
/*
* Length of period for aging writeout fractions of bdis. This is an
* arbitrarily chosen number. The longer the period, the slower fractions will
@@ -543,8 +521,8 @@ static int dirty_ratio_handler(const struct ctl_table *table, int write, void *b
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- writeback_set_ratelimit();
vm_dirty_bytes = 0;
+ writeback_set_ratelimit();
}
return ret;
}
@@ -630,7 +608,7 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc);
*/
static void writeout_period(struct timer_list *t)
{
- struct wb_domain *dom = from_timer(dom, t, period_timer);
+ struct wb_domain *dom = timer_container_of(dom, t, period_timer);
int miss_periods = (jiffies - dom->period_time) /
VM_COMPLETIONS_PERIOD_LEN;
@@ -663,7 +641,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom)
{
- del_timer_sync(&dom->period_timer);
+ timer_delete_sync(&dom->period_timer);
fprop_global_destroy(&dom->completions);
}
#endif
@@ -942,26 +920,25 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc,
wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio);
wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE);
- wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
- if (wb_thresh > wb_max_thresh)
- wb_thresh = wb_max_thresh;
/*
- * With strictlimit flag, the wb_thresh is treated as
- * a hard limit in balance_dirty_pages() and wb_position_ratio().
- * It's possible that wb_thresh is close to zero, not because
- * the device is slow, but because it has been inactive.
- * To prevent occasional writes from being blocked, we raise wb_thresh.
+ * It's very possible that wb_thresh is close to 0 not because the
+ * device is slow, but because it has remained inactive for a long time.
+ * Grant such devices a reasonably good (hopefully IO efficient)
+ * threshold, so that occasional writes won't be blocked and active
+ * writes can ramp up the threshold quickly.
*/
- if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
- unsigned long limit = hard_dirty_limit(dom, dtc->thresh);
- u64 wb_scale_thresh = 0;
-
- if (limit > dtc->dirty)
- wb_scale_thresh = (limit - dtc->dirty) / 100;
- wb_thresh = max(wb_thresh, min(wb_scale_thresh, wb_max_thresh / 4));
+ if (thresh > dtc->dirty) {
+ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT))
+ wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 100);
+ else
+ wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 8);
}
+ wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
+ if (wb_thresh > wb_max_thresh)
+ wb_thresh = wb_max_thresh;
+
return wb_thresh;
}
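To put rough numbers on the floor added above (illustrative values, not from the patch): with thresh = 80000 pages, dtc->dirty = 16000 and an idle device whose completion share would otherwise give a wb_thresh of only a few pages, the default branch raises wb_thresh to (80000 - 16000) / 8 = 8000 pages, while a BDI_CAP_STRICTLIMIT device gets the gentler (80000 - 16000) / 100 = 640 pages; either result is then clamped to wb_max_thresh.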
@@ -969,6 +946,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{
struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
+ domain_dirty_avail(&gdtc, true);
return __wb_calc_thresh(&gdtc, thresh);
}
@@ -1095,7 +1073,7 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
struct bdi_writeback *wb = dtc->wb;
unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
- unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
+ unsigned long limit = dtc->limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
unsigned long wb_thresh = dtc->wb_thresh;
unsigned long x_intercept;
unsigned long setpoint; /* dirty pages' target balance point */
@@ -1123,9 +1101,7 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
* such filesystems balance_dirty_pages always checks wb counters
* against wb limits. Even if global "nr_dirty" is under "freerun".
* This is especially important for fuse which sets bdi->max_ratio to
- * 1% by default. Without strictlimit feature, fuse writeback may
- * consume arbitrary amount of RAM because it is accounted in
- * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
+ * 1% by default.
*
* Here, in wb_position_ratio(), we calculate pos_ratio based on
* two values: wb_dirty and wb_thresh. Let's consider an example:
@@ -1145,12 +1121,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
long long wb_pos_ratio;
- if (dtc->wb_dirty < 8) {
- dtc->pos_ratio = min_t(long long, pos_ratio * 2,
- 2 << RATELIMIT_CALC_SHIFT);
- return;
- }
-
if (dtc->wb_dirty >= wb_thresh)
return;
@@ -1222,14 +1192,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
if (unlikely(wb_thresh > dtc->thresh))
wb_thresh = dtc->thresh;
/*
- * It's very possible that wb_thresh is close to 0 not because the
- * device is slow, but that it has remained inactive for long time.
- * Honour such devices a reasonable good (hopefully IO efficient)
- * threshold, so that the occasional writes won't be blocked and active
- * writes can rampup the threshold quickly.
- */
- wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
- /*
* scale global setpoint to wb's:
* wb_setpoint = setpoint * wb_thresh / thresh
*/
@@ -1484,17 +1446,10 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
* balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
* Hence, to calculate "step" properly, we have to use wb_dirty as
* "dirty" and wb_setpoint as "setpoint".
- *
- * We rampup dirty_ratelimit forcibly if wb_dirty is low because
- * it's possible that wb_thresh is close to zero due to inactivity
- * of backing device.
*/
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
dirty = dtc->wb_dirty;
- if (dtc->wb_dirty < 8)
- setpoint = dtc->wb_dirty + 1;
- else
- setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
+ setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
}
if (dirty < setpoint) {
@@ -1983,11 +1938,7 @@ free_running:
*/
if (pause < min_pause) {
trace_balance_dirty_pages(wb,
- sdtc->thresh,
- sdtc->bg_thresh,
- sdtc->dirty,
- sdtc->wb_thresh,
- sdtc->wb_dirty,
+ sdtc,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
@@ -2012,11 +1963,7 @@ free_running:
pause:
trace_balance_dirty_pages(wb,
- sdtc->thresh,
- sdtc->bg_thresh,
- sdtc->dirty,
- sdtc->wb_thresh,
- sdtc->wb_dirty,
+ sdtc,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
@@ -2254,7 +2201,7 @@ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int
void laptop_mode_timer_fn(struct timer_list *t)
{
struct backing_dev_info *backing_dev_info =
- from_timer(backing_dev_info, t, laptop_mode_wb_timer);
+ timer_container_of(backing_dev_info, t, laptop_mode_wb_timer);
wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
}
@@ -2281,7 +2228,7 @@ void laptop_sync_completion(void)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
- del_timer(&bdi->laptop_mode_wb_timer);
+ timer_delete(&bdi->laptop_mode_wb_timer);
rcu_read_unlock();
}
@@ -2319,7 +2266,7 @@ static int page_writeback_cpu_online(unsigned int cpu)
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
-static struct ctl_table vm_page_writeback_sysctls[] = {
+static const struct ctl_table vm_page_writeback_sysctls[] = {
{
.procname = "dirty_background_ratio",
.data = &dirty_background_ratio,
@@ -2616,11 +2563,11 @@ struct folio *writeback_iter(struct address_space *mapping,
if (!folio) {
/*
* To avoid deadlocks between range_cyclic writeback and callers
- * that hold pages in PageWriteback to aggregate I/O until
+ * that hold folios in writeback to aggregate I/O until
* the writeback iteration finishes, we do not loop back to the
- * start of the file. Doing so causes a page lock/page
+ * start of the file. Doing so causes a folio lock/folio
* writeback access order inversion - we should only ever lock
- * multiple pages in ascending page->index order, and looping
+ * multiple folios in ascending folio->index order, and looping
* back to the start of the file violates that rule and causes
* deadlocks.
*/
@@ -2673,27 +2620,6 @@ int write_cache_pages(struct address_space *mapping,
}
EXPORT_SYMBOL(write_cache_pages);
-static int writeback_use_writepage(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct folio *folio = NULL;
- struct blk_plug plug;
- int err;
-
- blk_start_plug(&plug);
- while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
- err = mapping->a_ops->writepage(&folio->page, wbc);
- if (err == AOP_WRITEPAGE_ACTIVATE) {
- folio_unlock(folio);
- err = 0;
- }
- mapping_set_error(mapping, err);
- }
- blk_finish_plug(&plug);
-
- return err;
-}
-
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
@@ -2704,14 +2630,11 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
wb = inode_to_wb_wbc(mapping->host, wbc);
wb_bandwidth_estimate_start(wb);
while (1) {
- if (mapping->a_ops->writepages) {
+ if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
- } else if (mapping->a_ops->writepage) {
- ret = writeback_use_writepage(mapping, wbc);
- } else {
+ else
/* deal with chardevs and other special files */
ret = 0;
- }
if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
break;
@@ -3130,6 +3053,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
int access_ret;
VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
XA_STATE(xas, &mapping->i_pages, folio_index(folio));
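With the ->writepage fallback removed from do_writepages() above, a filesystem that still wants simple folio-at-a-time writeback is expected to drive writeback_iter() from its own ->writepages. An illustrative sketch modelled on the deleted helper (myfs_write_folio() is a placeholder, not a real API; like the old ->writepage it is assumed to start I/O and unlock the folio itself):

static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	struct folio *folio = NULL;
	int err = 0;

	while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
		err = myfs_write_folio(folio, wbc);	/* placeholder */
		mapping_set_error(mapping, err);
	}
	return err;
}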
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0f9b359d67bb..09241bb7663e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -88,6 +88,9 @@ typedef int __bitwise fpi_t;
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+/* Free the page without taking locks. Rely on trylock only. */
+#define FPI_TRYLOCK ((__force fpi_t)BIT(2))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@ -273,6 +276,7 @@ int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
static int watermark_boost_factor __read_mostly = 15000;
static int watermark_scale_factor = 10;
+int defrag_mode;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -286,7 +290,8 @@ EXPORT_SYMBOL(nr_online_nodes);
#endif
static bool page_contains_unaccepted(struct page *page, unsigned int order);
-static bool cond_accept_memory(struct zone *zone, unsigned int order);
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags);
static bool __free_unaccepted(struct page *page);
int page_group_by_mobility_disabled __read_mostly;
@@ -348,81 +353,225 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
+static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit)
+{
+ return pb_bit > PB_migrate_end && pb_bit < __NR_PAGEBLOCK_BITS;
+}
+
+static __always_inline void
+get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
+ unsigned long **bitmap_word, unsigned long *bitidx)
+{
+ unsigned long *bitmap;
+ unsigned long word_bitidx;
+
+#ifdef CONFIG_MEMORY_ISOLATION
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 8);
+#else
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+#endif
+ BUILD_BUG_ON(__MIGRATE_TYPE_END >= (1 << PB_migratetype_bits));
+ VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+
+ bitmap = get_pageblock_bitmap(page, pfn);
+ *bitidx = pfn_to_bitidx(page, pfn);
+ word_bitidx = *bitidx / BITS_PER_LONG;
+ *bitidx &= (BITS_PER_LONG - 1);
+ *bitmap_word = &bitmap[word_bitidx];
+}
+
+
/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * __get_pfnblock_flags_mask - Return the requested group of flags for
+ * a pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @pfn: The target page frame number
* @mask: mask of bits that the caller is interested in
*
* Return: pageblock_bits flags
*/
-unsigned long get_pfnblock_flags_mask(const struct page *page,
- unsigned long pfn, unsigned long mask)
+static unsigned long __get_pfnblock_flags_mask(const struct page *page,
+ unsigned long pfn,
+ unsigned long mask)
{
- unsigned long *bitmap;
- unsigned long bitidx, word_bitidx;
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
unsigned long word;
- bitmap = get_pageblock_bitmap(page, pfn);
- bitidx = pfn_to_bitidx(page, pfn);
- word_bitidx = bitidx / BITS_PER_LONG;
- bitidx &= (BITS_PER_LONG-1);
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
/*
- * This races, without locks, with set_pfnblock_flags_mask(). Ensure
+ * This races, without locks, with set_pfnblock_migratetype(). Ensure
* a consistent read of the memory array, so that results, even though
* racy, are not corrupted.
*/
- word = READ_ONCE(bitmap[word_bitidx]);
+ word = READ_ONCE(*bitmap_word);
return (word >> bitidx) & mask;
}
-static __always_inline int get_pfnblock_migratetype(const struct page *page,
- unsigned long pfn)
+/**
+ * get_pfnblock_bit - Check if a standalone bit of a pageblock is set
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @pb_bit: pageblock bit to check
+ *
+ * Return: true if the bit is set, otherwise false
+ */
+bool get_pfnblock_bit(const struct page *page, unsigned long pfn,
+ enum pageblock_bits pb_bit)
{
- return get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
+
+ if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
+ return false;
+
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+
+ return test_bit(bitidx + pb_bit, bitmap_word);
}
/**
- * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * get_pfnblock_migratetype - Return the migratetype of a pageblock
* @page: The page within the block of interest
- * @flags: The flags to set
* @pfn: The target page frame number
- * @mask: mask of bits that the caller is interested in
+ *
+ * Return: The migratetype of the pageblock
+ *
+ * Use get_pfnblock_migratetype() if caller already has both @page and @pfn
+ * to save a call to page_to_pfn().
*/
-void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
- unsigned long pfn,
- unsigned long mask)
+__always_inline enum migratetype
+get_pfnblock_migratetype(const struct page *page, unsigned long pfn)
{
- unsigned long *bitmap;
- unsigned long bitidx, word_bitidx;
- unsigned long word;
+ unsigned long mask = MIGRATETYPE_AND_ISO_MASK;
+ unsigned long flags;
- BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
- BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
+ flags = __get_pfnblock_flags_mask(page, pfn, mask);
- bitmap = get_pageblock_bitmap(page, pfn);
- bitidx = pfn_to_bitidx(page, pfn);
- word_bitidx = bitidx / BITS_PER_LONG;
- bitidx &= (BITS_PER_LONG-1);
+#ifdef CONFIG_MEMORY_ISOLATION
+ if (flags & BIT(PB_migrate_isolate))
+ return MIGRATE_ISOLATE;
+#endif
+ return flags & MIGRATETYPE_MASK;
+}
- VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+/**
+ * __set_pfnblock_flags_mask - Set the requested group of flags for
+ * a pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @flags: The flags to set
+ * @mask: mask of bits that the caller is interested in
+ */
+static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+ unsigned long flags, unsigned long mask)
+{
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
+ unsigned long word;
+
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
mask <<= bitidx;
flags <<= bitidx;
- word = READ_ONCE(bitmap[word_bitidx]);
+ word = READ_ONCE(*bitmap_word);
do {
- } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
+ } while (!try_cmpxchg(bitmap_word, &word, (word & ~mask) | flags));
+}
+
+/**
+ * set_pfnblock_bit - Set a standalone bit of a pageblock
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @pb_bit: pageblock bit to set
+ */
+void set_pfnblock_bit(const struct page *page, unsigned long pfn,
+ enum pageblock_bits pb_bit)
+{
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
+
+ if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
+ return;
+
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+
+ set_bit(bitidx + pb_bit, bitmap_word);
+}
+
+/**
+ * clear_pfnblock_bit - Clear a standalone bit of a pageblock
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @pb_bit: pageblock bit to clear
+ */
+void clear_pfnblock_bit(const struct page *page, unsigned long pfn,
+ enum pageblock_bits pb_bit)
+{
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
+
+ if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
+ return;
+
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+
+ clear_bit(bitidx + pb_bit, bitmap_word);
+}
+
+/**
+ * set_pageblock_migratetype - Set the migratetype of a pageblock
+ * @page: The page within the block of interest
+ * @migratetype: migratetype to set
+ */
+static void set_pageblock_migratetype(struct page *page,
+ enum migratetype migratetype)
+{
+ if (unlikely(page_group_by_mobility_disabled &&
+ migratetype < MIGRATE_PCPTYPES))
+ migratetype = MIGRATE_UNMOVABLE;
+
+#ifdef CONFIG_MEMORY_ISOLATION
+ if (migratetype == MIGRATE_ISOLATE) {
+ VM_WARN_ONCE(1,
+ "Use set_pageblock_isolate() for pageblock isolation");
+ return;
+ }
+ VM_WARN_ONCE(get_pfnblock_bit(page, page_to_pfn(page),
+ PB_migrate_isolate),
+ "Use clear_pageblock_isolate() to unisolate pageblock");
+ /* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */
+#endif
+ __set_pfnblock_flags_mask(page, page_to_pfn(page),
+ (unsigned long)migratetype,
+ MIGRATETYPE_AND_ISO_MASK);
}
-void set_pageblock_migratetype(struct page *page, int migratetype)
+void __meminit init_pageblock_migratetype(struct page *page,
+ enum migratetype migratetype,
+ bool isolate)
{
+ unsigned long flags;
+
if (unlikely(page_group_by_mobility_disabled &&
migratetype < MIGRATE_PCPTYPES))
migratetype = MIGRATE_UNMOVABLE;
- set_pfnblock_flags_mask(page, (unsigned long)migratetype,
- page_to_pfn(page), MIGRATETYPE_MASK);
+ flags = migratetype;
+
+#ifdef CONFIG_MEMORY_ISOLATION
+ if (migratetype == MIGRATE_ISOLATE) {
+ VM_WARN_ONCE(
+ 1,
+ "Set isolate=true to isolate pageblock with a migratetype");
+ return;
+ }
+ if (isolate)
+ flags |= BIT(PB_migrate_isolate);
+#endif
+ __set_pfnblock_flags_mask(page, page_to_pfn(page), flags,
+ MIGRATETYPE_AND_ISO_MASK);
}
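An illustrative sketch of how the standalone-bit helpers above are driven (not from this patch; real callers sit in the page isolation code under the appropriate zone lock, and PB_migrate_isolate only exists under CONFIG_MEMORY_ISOLATION):

#ifdef CONFIG_MEMORY_ISOLATION
/* Toggle the isolation bit of the pageblock backing @page. */
static void example_toggle_pageblock_isolate(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);

	set_pfnblock_bit(page, pfn, PB_migrate_isolate);
	if (get_pfnblock_bit(page, pfn, PB_migrate_isolate))
		clear_pfnblock_bit(page, pfn, PB_migrate_isolate);
}
#endif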
#ifdef CONFIG_DEBUG_VM
@@ -508,9 +657,9 @@ out:
static inline unsigned int order_to_pindex(int migratetype, int order)
{
- bool __maybe_unused movable;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bool movable;
if (order > PAGE_ALLOC_COSTLY_ORDER) {
VM_BUG_ON(order != HPAGE_PMD_ORDER);
@@ -614,6 +763,10 @@ compaction_capture(struct capture_control *capc, struct page *page,
capc->cc->migratetype != MIGRATE_MOVABLE)
return false;
+ if (migratetype != capc->cc->migratetype)
+ trace_mm_page_alloc_extfrag(page, capc->cc->order, order,
+ capc->cc->migratetype, migratetype);
+
capc->page = page;
return true;
}
@@ -655,16 +808,20 @@ static inline void __add_to_free_list(struct page *page, struct zone *zone,
bool tail)
{
struct free_area *area = &zone->free_area[order];
+ int nr_pages = 1 << order;
VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
- "page type is %lu, passed migratetype is %d (nr=%d)\n",
- get_pageblock_migratetype(page), migratetype, 1 << order);
+ "page type is %d, passed migratetype is %d (nr=%d)\n",
+ get_pageblock_migratetype(page), migratetype, nr_pages);
if (tail)
list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
else
list_add(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
+
+ if (order >= pageblock_order && !is_migrate_isolate(migratetype))
+ __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
}
/*
@@ -676,24 +833,34 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int old_mt, int new_mt)
{
struct free_area *area = &zone->free_area[order];
+ int nr_pages = 1 << order;
/* Free page moving can fail, so it happens before the type update */
VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt,
- "page type is %lu, passed migratetype is %d (nr=%d)\n",
- get_pageblock_migratetype(page), old_mt, 1 << order);
+ "page type is %d, passed migratetype is %d (nr=%d)\n",
+ get_pageblock_migratetype(page), old_mt, nr_pages);
list_move_tail(&page->buddy_list, &area->free_list[new_mt]);
- account_freepages(zone, -(1 << order), old_mt);
- account_freepages(zone, 1 << order, new_mt);
+ account_freepages(zone, -nr_pages, old_mt);
+ account_freepages(zone, nr_pages, new_mt);
+
+ if (order >= pageblock_order &&
+ is_migrate_isolate(old_mt) != is_migrate_isolate(new_mt)) {
+ if (!is_migrate_isolate(old_mt))
+ nr_pages = -nr_pages;
+ __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
+ }
}
static inline void __del_page_from_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
+ int nr_pages = 1 << order;
+
VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
- "page type is %lu, passed migratetype is %d (nr=%d)\n",
- get_pageblock_migratetype(page), migratetype, 1 << order);
+ "page type is %d, passed migratetype is %d (nr=%d)\n",
+ get_pageblock_migratetype(page), migratetype, nr_pages);
/* clear reported state and update reported page count */
if (page_reported(page))
@@ -703,6 +870,9 @@ static inline void __del_page_from_free_list(struct page *page, struct zone *zon
__ClearPageBuddy(page);
set_page_private(page, 0);
zone->free_area[order].nr_free--;
+
+ if (order >= pageblock_order && !is_migrate_isolate(migratetype))
+ __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, -nr_pages);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -872,9 +1042,7 @@ static inline bool page_expected_state(struct page *page,
#ifdef CONFIG_MEMCG
page->memcg_data |
#endif
-#ifdef CONFIG_PAGE_POOL
- ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) |
-#endif
+ page_pool_page_is_pp(page) |
(page->flags & check_flags)))
return false;
@@ -901,26 +1069,18 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
-#ifdef CONFIG_PAGE_POOL
- if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE))
+ if (unlikely(page_pool_page_is_pp(page)))
bad_reason = "page_pool leak";
-#endif
return bad_reason;
}
-static void free_page_is_bad_report(struct page *page)
-{
- bad_page(page,
- page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
-}
-
static inline bool free_page_is_bad(struct page *page)
{
if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
return false;
/* Something has gone sideways, find it */
- free_page_is_bad_report(page);
+ bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
return true;
}
@@ -947,21 +1107,34 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
switch (page - head_page) {
case 1:
/* the first tail page: these may be in place of ->mapping */
- if (unlikely(folio_entire_mapcount(folio))) {
- bad_page(page, "nonzero entire_mapcount");
- goto out;
- }
if (unlikely(folio_large_mapcount(folio))) {
bad_page(page, "nonzero large_mapcount");
goto out;
}
- if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT) &&
+ unlikely(atomic_read(&folio->_nr_pages_mapped))) {
bad_page(page, "nonzero nr_pages_mapped");
goto out;
}
- if (unlikely(atomic_read(&folio->_pincount))) {
- bad_page(page, "nonzero pincount");
- goto out;
+ if (IS_ENABLED(CONFIG_MM_ID)) {
+ if (unlikely(folio->_mm_id_mapcount[0] != -1)) {
+ bad_page(page, "nonzero mm mapcount 0");
+ goto out;
+ }
+ if (unlikely(folio->_mm_id_mapcount[1] != -1)) {
+ bad_page(page, "nonzero mm mapcount 1");
+ goto out;
+ }
+ }
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
+ bad_page(page, "nonzero entire_mapcount");
+ goto out;
+ }
+ if (unlikely(atomic_read(&folio->_pincount))) {
+ bad_page(page, "nonzero pincount");
+ goto out;
+ }
}
break;
case 2:
@@ -970,7 +1143,22 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
bad_page(page, "on deferred list");
goto out;
}
+ if (!IS_ENABLED(CONFIG_64BIT)) {
+ if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
+ bad_page(page, "nonzero entire_mapcount");
+ goto out;
+ }
+ if (unlikely(atomic_read(&folio->_pincount))) {
+ bad_page(page, "nonzero pincount");
+ goto out;
+ }
+ }
break;
+ case 3:
+ /* the third tail page: hugetlb specifics overlap ->mappings */
+ if (IS_ENABLED(CONFIG_HUGETLB_PAGE))
+ break;
+ fallthrough;
default:
if (page->mapping != TAIL_MAPPING) {
bad_page(page, "corrupted mapping in tail page");
@@ -1041,6 +1229,79 @@ static void kernel_init_pages(struct page *page, int numpages)
kasan_enable_current();
}
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+void __clear_page_tag_ref(struct page *page)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ set_codetag_empty(&ref);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline
+void __pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr)
+{
+ if (mem_alloc_profiling_enabled())
+ __pgalloc_tag_add(page, task, nr);
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline
+void __pgalloc_tag_sub(struct page *page, unsigned int nr)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_sub(&ref, PAGE_SIZE * nr);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
+{
+ if (mem_alloc_profiling_enabled())
+ __pgalloc_tag_sub(page, nr);
+}
+
+/* When tag is not NULL, mem_alloc_profiling_enabled() is assumed to be true */
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
+{
+ if (tag)
+ this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
+}
+
+#else /* CONFIG_MEM_ALLOC_PROFILING */
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr) {}
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
+
+#endif /* CONFIG_MEM_ALLOC_PROFILING */
+
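The pgalloc_tag_add()/pgalloc_tag_sub() pair above follows a common kernel pattern: a tiny inline fast path that only tests mem_alloc_profiling_enabled(), and a noinline slow path that does the actual codetag work, so the disabled case costs a single predictable branch in the allocator hot path. A self-contained model of that structure, with hypothetical names (profiling_enabled, __do_account, account) standing in for the static key and tag machinery:

#include <stdbool.h>
#include <stddef.h>

/* Stand-in for the static-key check, e.g. mem_alloc_profiling_enabled(). */
static bool profiling_enabled;

/* Slow path: kept out of line so the fast path stays small. */
static __attribute__((noinline)) void __do_account(void *obj, size_t bytes)
{
	/* look up the per-callsite tag and charge 'bytes' to it ... */
	(void)obj;
	(void)bytes;
}

/* Fast path: inlined into every caller, a single branch when disabled. */
static inline void account(void *obj, size_t bytes)
{
	if (profiling_enabled)
		__do_account(obj, bytes);
}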
__always_inline bool free_pages_prepare(struct page *page,
unsigned int order)
{
@@ -1096,8 +1357,12 @@ __always_inline bool free_pages_prepare(struct page *page,
if (unlikely(order)) {
int i;
- if (compound)
+ if (compound) {
page[1].flags &= ~PAGE_FLAGS_SECOND;
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+ folio->_nr_pages = 0;
+#endif
+ }
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_page_prepare(page, page + i);
@@ -1110,11 +1375,14 @@ __always_inline bool free_pages_prepare(struct page *page,
(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
- if (PageMappingFlags(page)) {
- if (PageAnon(page))
- mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
- page->mapping = NULL;
+ if (folio_test_anon(folio)) {
+ mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
+ folio->mapping = NULL;
}
+ if (unlikely(page_has_type(page)))
+ /* Reset the page_type (which overlays _mapcount) */
+ page->page_type = UINT_MAX;
+
if (is_check_pages_enabled()) {
if (free_page_is_bad(page))
bad++;
@@ -1249,13 +1517,45 @@ static void split_large_buddy(struct zone *zone, struct page *page,
} while (1);
}
+static void add_page_to_zone_llist(struct zone *zone, struct page *page,
+ unsigned int order)
+{
+ /* Remember the order */
+ page->order = order;
+	/* Queue the page on the zone's deferred trylock_free_pages llist */
+ llist_add(&page->pcp_llist, &zone->trylock_free_pages);
+}
+
static void free_one_page(struct zone *zone, struct page *page,
unsigned long pfn, unsigned int order,
fpi_t fpi_flags)
{
+ struct llist_head *llhead;
unsigned long flags;
- spin_lock_irqsave(&zone->lock, flags);
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
+ add_page_to_zone_llist(zone, page, order);
+ return;
+ }
+ } else {
+ spin_lock_irqsave(&zone->lock, flags);
+ }
+
+ /* The lock succeeded. Process deferred pages. */
+ llhead = &zone->trylock_free_pages;
+ if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) {
+ struct llist_node *llnode;
+ struct page *p, *tmp;
+
+ llnode = llist_del_all(llhead);
+ llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
+ unsigned int p_order = p->order;
+
+ split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
+ __count_vm_events(PGFREE, 1 << p_order);
+ }
+ }
split_large_buddy(zone, page, pfn, order, fpi_flags);
spin_unlock_irqrestore(&zone->lock, flags);
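free_one_page() above implements a "trylock or defer" scheme: an FPI_TRYLOCK caller that cannot take zone->lock parks the page on the lock-free zone->trylock_free_pages llist, and the next caller that acquires the lock normally drains that list before doing its own free. A condensed sketch of the same pattern using the kernel's real spinlock and llist primitives; the struct and function names (deferred_ctx, work_item, do_work, submit_or_defer) are illustrative, not from the patch:

/* Illustrative pattern; mirrors free_one_page()/add_page_to_zone_llist(). */
struct deferred_ctx {
	spinlock_t lock;
	struct llist_head deferred;	/* items queued while the lock was busy */
};

struct work_item {
	struct llist_node llnode;
};

static void do_work(struct work_item *item)
{
	/* process one item; runs with ctx->lock held */
}

static void submit_or_defer(struct deferred_ctx *ctx, struct work_item *item,
			    bool trylock_only)
{
	struct llist_node *llnode;
	struct work_item *pos, *tmp;
	unsigned long flags;

	if (trylock_only) {
		if (!spin_trylock_irqsave(&ctx->lock, flags)) {
			/* Lock contended: defer to whoever takes it next. */
			llist_add(&item->llnode, &ctx->deferred);
			return;
		}
	} else {
		spin_lock_irqsave(&ctx->lock, flags);
		/* Holding the lock the slow way: drain deferred work first. */
		llnode = llist_del_all(&ctx->deferred);
		llist_for_each_entry_safe(pos, tmp, llnode, llnode)
			do_work(pos);
	}

	do_work(item);
	spin_unlock_irqrestore(&ctx->lock, flags);
}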
@@ -1295,12 +1595,6 @@ void __meminit __free_pages_core(struct page *page, unsigned int order,
set_page_count(p, 0);
}
- /*
- * Freeing the page with debug_pagealloc enabled will try to
- * unmap it; some archs don't like double-unmappings, so
- * map it first.
- */
- debug_pagealloc_map_pages(page, nr_pages);
adjust_managed_page_count(page, nr_pages);
} else {
for (loop = 0; loop < nr_pages; loop++, p++) {
@@ -1433,7 +1727,7 @@ static __always_inline void page_del_and_expand(struct zone *zone,
static void check_new_page_bad(struct page *page)
{
- if (unlikely(page->flags & __PG_HWPOISON)) {
+ if (unlikely(PageHWPoison(page))) {
/* Don't complain about hwpoisoned pages */
if (PageBuddy(page))
__ClearPageBuddy(page);
@@ -1508,7 +1802,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
int i;
set_page_private(page, 0);
- set_page_refcounted(page);
arch_alloc_page(page, order);
debug_pagealloc_map_pages(page, 1 << order);
@@ -1635,8 +1928,8 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
#endif
/*
- * Change the type of a block and move all its free pages to that
- * type's freelist.
+ * Move all free pages of a block to the new type's freelist. The caller is
+ * responsible for updating the pageblock's migratetype.
*/
static int __move_freepages_block(struct zone *zone, unsigned long start_pfn,
int old_mt, int new_mt)
@@ -1668,8 +1961,6 @@ static int __move_freepages_block(struct zone *zone, unsigned long start_pfn,
pages_moved += 1 << order;
}
- set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);
-
return pages_moved;
}
@@ -1714,7 +2005,7 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page,
* migration are movable. But we don't actually try
* isolating, as that would be expensive.
*/
- if (PageLRU(page) || __PageMovable(page))
+ if (PageLRU(page) || page_has_movable_ops(page))
(*num_movable)++;
pfn++;
}
@@ -1727,11 +2018,16 @@ static int move_freepages_block(struct zone *zone, struct page *page,
int old_mt, int new_mt)
{
unsigned long start_pfn;
+ int res;
if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
return -1;
- return __move_freepages_block(zone, start_pfn, old_mt, new_mt);
+ res = __move_freepages_block(zone, start_pfn, old_mt, new_mt);
+ set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);
+
+ return res;
}
#ifdef CONFIG_MEMORY_ISOLATION
@@ -1759,11 +2055,19 @@ static unsigned long find_large_buddy(unsigned long start_pfn)
return start_pfn;
}
+static inline void toggle_pageblock_isolate(struct page *page, bool isolate)
+{
+ if (isolate)
+ set_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate);
+ else
+ clear_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate);
+}
+
/**
- * move_freepages_block_isolate - move free pages in block for page isolation
+ * __move_freepages_block_isolate - move free pages in block for page isolation
* @zone: the zone
* @page: the pageblock page
- * @migratetype: migratetype to set on the pageblock
+ * @isolate: true to isolate the given pageblock, false to unisolate it
*
* This is similar to move_freepages_block(), but handles the special
* case encountered in page isolation, where the block of interest
@@ -1778,10 +2082,18 @@ static unsigned long find_large_buddy(unsigned long start_pfn)
*
* Returns %true if pages could be moved, %false otherwise.
*/
-bool move_freepages_block_isolate(struct zone *zone, struct page *page,
- int migratetype)
+static bool __move_freepages_block_isolate(struct zone *zone,
+ struct page *page, bool isolate)
{
unsigned long start_pfn, pfn;
+ int from_mt;
+ int to_mt;
+
+ if (isolate == get_pageblock_isolate(page)) {
+ VM_WARN_ONCE(1, "%s a pageblock that is already in that state",
+ isolate ? "Isolate" : "Unisolate");
+ return false;
+ }
if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
return false;
@@ -1798,7 +2110,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
del_page_from_free_list(buddy, zone, order,
get_pfnblock_migratetype(buddy, pfn));
- set_pageblock_migratetype(page, migratetype);
+ toggle_pageblock_isolate(page, isolate);
split_large_buddy(zone, buddy, pfn, order, FPI_NONE);
return true;
}
@@ -1809,16 +2121,38 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
del_page_from_free_list(page, zone, order,
get_pfnblock_migratetype(page, pfn));
- set_pageblock_migratetype(page, migratetype);
+ toggle_pageblock_isolate(page, isolate);
split_large_buddy(zone, page, pfn, order, FPI_NONE);
return true;
}
move:
- __move_freepages_block(zone, start_pfn,
- get_pfnblock_migratetype(page, start_pfn),
- migratetype);
+ /* Use MIGRATETYPE_MASK to get non-isolate migratetype */
+ if (isolate) {
+ from_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page),
+ MIGRATETYPE_MASK);
+ to_mt = MIGRATE_ISOLATE;
+ } else {
+ from_mt = MIGRATE_ISOLATE;
+ to_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page),
+ MIGRATETYPE_MASK);
+ }
+
+ __move_freepages_block(zone, start_pfn, from_mt, to_mt);
+ toggle_pageblock_isolate(pfn_to_page(start_pfn), isolate);
+
return true;
}
+
+bool pageblock_isolate_and_move_free_pages(struct zone *zone, struct page *page)
+{
+ return __move_freepages_block_isolate(zone, page, true);
+}
+
+bool pageblock_unisolate_and_move_free_pages(struct zone *zone, struct page *page)
+{
+ return __move_freepages_block_isolate(zone, page, false);
+}
+
#endif /* CONFIG_MEMORY_ISOLATION */
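With this change, isolation is no longer a migratetype passed in by the caller; it is a separate pageblock bit (PB_migrate_isolate) toggled together with the freelist move. A sketch of how the page-isolation side is expected to use the two new helpers; the locking shown (zone->lock held around the call, as the former move_freepages_block_isolate() callers did) is an assumption about code in mm/page_isolation.c that is outside this hunk, and the function name is illustrative:

/* Sketch: isolate one pageblock, do some work on it, then undo it. */
static int isolate_block_and_restore(struct zone *zone, struct page *block_page)
{
	unsigned long flags;
	bool ok;

	spin_lock_irqsave(&zone->lock, flags);
	ok = pageblock_isolate_and_move_free_pages(zone, block_page);
	spin_unlock_irqrestore(&zone->lock, flags);
	if (!ok)
		return -EBUSY;

	/* ... migrate or offline the pages in the block here ... */

	spin_lock_irqsave(&zone->lock, flags);
	pageblock_unisolate_and_move_free_pages(zone, block_page);
	spin_unlock_irqrestore(&zone->lock, flags);
	return 0;
}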
static void change_pageblock_range(struct page *pageblock_page,
@@ -1832,39 +2166,6 @@ static void change_pageblock_range(struct page *pageblock_page,
}
}
-/*
- * When we are falling back to another migratetype during allocation, try to
- * steal extra free pages from the same pageblocks to satisfy further
- * allocations, instead of polluting multiple pageblocks.
- *
- * If we are stealing a relatively large buddy page, it is likely there will
- * be more free pages in the pageblock, so try to steal them all. For
- * reclaimable and unmovable allocations, we steal regardless of page size,
- * as fragmentation caused by those allocations polluting movable pageblocks
- * is worse than movable allocations stealing from unmovable and reclaimable
- * pageblocks.
- */
-static bool can_steal_fallback(unsigned int order, int start_mt)
-{
- /*
- * Leaving this order check is intended, although there is
- * relaxed order check in next check. The reason is that
- * we can actually steal whole pageblock if this condition met,
- * but, below check doesn't guarantee it and that is just heuristic
- * so could be changed anytime.
- */
- if (order >= pageblock_order)
- return true;
-
- if (order >= pageblock_order / 2 ||
- start_mt == MIGRATE_RECLAIMABLE ||
- start_mt == MIGRATE_UNMOVABLE ||
- page_group_by_mobility_disabled)
- return true;
-
- return false;
-}
-
static inline bool boost_watermark(struct zone *zone)
{
unsigned long max_boost;
@@ -1903,30 +2204,93 @@ static inline bool boost_watermark(struct zone *zone)
}
/*
- * This function implements actual steal behaviour. If order is large enough, we
- * can claim the whole pageblock for the requested migratetype. If not, we check
- * the pageblock for constituent pages; if at least half of the pages are free
- * or compatible, we can still claim the whole block, so pages freed in the
- * future will be put on the correct free list. Otherwise, we isolate exactly
- * the order we need from the fallback block and leave its migratetype alone.
+ * When we are falling back to another migratetype during allocation, should we
+ * try to claim an entire block to satisfy further allocations, instead of
+ * polluting multiple pageblocks?
*/
-static struct page *
-steal_suitable_fallback(struct zone *zone, struct page *page,
- int current_order, int order, int start_type,
- unsigned int alloc_flags, bool whole_block)
+static bool should_try_claim_block(unsigned int order, int start_mt)
{
- int free_pages, movable_pages, alike_pages;
- unsigned long start_pfn;
- int block_type;
+ /*
+	 * Keeping this order check here is intentional, even though the next
+	 * check below is more relaxed: if this condition is met we can claim
+	 * the whole pageblock, whereas the check below is only a heuristic
+	 * that doesn't guarantee it and may change at any time.
+ */
+ if (order >= pageblock_order)
+ return true;
+
+ /*
+ * Above a certain threshold, always try to claim, as it's likely there
+ * will be more free pages in the pageblock.
+ */
+ if (order >= pageblock_order / 2)
+ return true;
+
+ /*
+	 * Unmovable/reclaimable allocations would cause permanent
+	 * fragmentation if they fell back to allocating from a movable block
+ * (polluting it), so we try to claim the whole block regardless of the
+ * allocation size. Later movable allocations can always steal from this
+ * block, which is less problematic.
+ */
+ if (start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE)
+ return true;
- block_type = get_pageblock_migratetype(page);
+ if (page_group_by_mobility_disabled)
+ return true;
/*
- * This can happen due to races and we want to prevent broken
- * highatomic accounting.
+	 * Movable pages won't cause permanent fragmentation, so for small
+	 * movable allocations it is enough to temporarily steal unmovable or
+	 * reclaimable pages closest to the requested size. After a while,
+	 * memory compaction may form large contiguous pages again, and the
+	 * next movable allocation may not need to steal at all.
*/
- if (is_migrate_highatomic(block_type))
- goto single_page;
+ return false;
+}
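As a concrete example of the heuristic above: assuming 4 KiB base pages and pageblock_order == 9 (a common x86-64 configuration, assumed here and not stated in the patch), pageblock_order / 2 is 4, so a movable request of order 0-3 returns false (steal at most a single page), while order >= 4, or any unmovable/reclaimable request, returns true (worth trying to claim the whole block). The standalone model below reproduces just that arithmetic and deliberately omits the page_group_by_mobility_disabled case:

#include <stdbool.h>
#include <stdio.h>

enum { MOVABLE, UNMOVABLE, RECLAIMABLE };
static const unsigned int pageblock_order = 9;	/* assumed, see lead-in */

static bool model_should_try_claim_block(unsigned int order, int start_mt)
{
	/* The kernel keeps a separate order >= pageblock_order check first. */
	if (order >= pageblock_order / 2)
		return true;
	if (start_mt == RECLAIMABLE || start_mt == UNMOVABLE)
		return true;
	return false;	/* small movable request: steal a single page instead */
}

int main(void)
{
	for (unsigned int order = 0; order <= 9; order++)
		printf("order %u: movable=%d unmovable=%d\n", order,
		       model_should_try_claim_block(order, MOVABLE),
		       model_should_try_claim_block(order, UNMOVABLE));
	return 0;
}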
+
+/*
+ * Check whether there is a suitable fallback freepage with the requested
+ * order. If claimable is true, this function returns fallback_mt only if
+ * whole-block claiming is worthwhile for this request, which helps reduce
+ * fragmentation from mixed-migratetype pages within a single pageblock.
+ */
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool claimable)
+{
+ int i;
+
+ if (claimable && !should_try_claim_block(order, migratetype))
+ return -2;
+
+ if (area->nr_free == 0)
+ return -1;
+
+ for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
+ int fallback_mt = fallbacks[migratetype][i];
+
+ if (!free_area_empty(area, fallback_mt))
+ return fallback_mt;
+ }
+
+ return -1;
+}
+
+/*
+ * This function implements actual block claiming behaviour. If order is large
+ * enough, we can claim the whole pageblock for the requested migratetype. If
+ * not, we check the pageblock for constituent pages; if at least half of the
+ * pages are free or compatible, we can still claim the whole block, so pages
+ * freed in the future will be put on the correct free list.
+ */
+static struct page *
+try_to_claim_block(struct zone *zone, struct page *page,
+ int current_order, int order, int start_type,
+ int block_type, unsigned int alloc_flags)
+{
+ int free_pages, movable_pages, alike_pages;
+ unsigned long start_pfn;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
@@ -1947,14 +2311,10 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
- /* We are not allowed to try stealing from the whole block */
- if (!whole_block)
- goto single_page;
-
/* moving whole block can fail due to zone boundary conditions */
if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages,
&movable_pages))
- goto single_page;
+ return NULL;
/*
* Determine how many pages are compatible with our allocation.
@@ -1984,204 +2344,23 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled) {
__move_freepages_block(zone, start_pfn, block_type, start_type);
+ set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
return __rmqueue_smallest(zone, order, start_type);
}
-single_page:
- page_del_and_expand(zone, page, order, current_order, block_type);
- return page;
-}
-
-/*
- * Check whether there is a suitable fallback freepage with requested order.
- * If only_stealable is true, this function returns fallback_mt only if
- * we can steal other freepages all together. This would help to reduce
- * fragmentation due to mixed migratetype pages in one pageblock.
- */
-int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool only_stealable, bool *can_steal)
-{
- int i;
- int fallback_mt;
-
- if (area->nr_free == 0)
- return -1;
-
- *can_steal = false;
- for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
- fallback_mt = fallbacks[migratetype][i];
- if (free_area_empty(area, fallback_mt))
- continue;
-
- if (can_steal_fallback(order, migratetype))
- *can_steal = true;
-
- if (!only_stealable)
- return fallback_mt;
-
- if (*can_steal)
- return fallback_mt;
- }
-
- return -1;
-}
-
-/*
- * Reserve the pageblock(s) surrounding an allocation request for
- * exclusive use of high-order atomic allocations if there are no
- * empty page blocks that contain a page with a suitable order
- */
-static void reserve_highatomic_pageblock(struct page *page, int order,
- struct zone *zone)
-{
- int mt;
- unsigned long max_managed, flags;
-
- /*
- * The number reserved as: minimum is 1 pageblock, maximum is
- * roughly 1% of a zone. But if 1% of a zone falls below a
- * pageblock size, then don't reserve any pageblocks.
- * Check is race-prone but harmless.
- */
- if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages)
- return;
- max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages);
- if (zone->nr_reserved_highatomic >= max_managed)
- return;
-
- spin_lock_irqsave(&zone->lock, flags);
-
- /* Recheck the nr_reserved_highatomic limit under the lock */
- if (zone->nr_reserved_highatomic >= max_managed)
- goto out_unlock;
-
- /* Yoink! */
- mt = get_pageblock_migratetype(page);
- /* Only reserve normal pageblocks (i.e., they can merge with others) */
- if (!migratetype_is_mergeable(mt))
- goto out_unlock;
-
- if (order < pageblock_order) {
- if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
- goto out_unlock;
- zone->nr_reserved_highatomic += pageblock_nr_pages;
- } else {
- change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
- zone->nr_reserved_highatomic += 1 << order;
- }
-
-out_unlock:
- spin_unlock_irqrestore(&zone->lock, flags);
-}
-
-/*
- * Used when an allocation is about to fail under memory pressure. This
- * potentially hurts the reliability of high-order allocations when under
- * intense memory pressure but failed atomic allocations should be easier
- * to recover from than an OOM.
- *
- * If @force is true, try to unreserve pageblocks even though highatomic
- * pageblock is exhausted.
- */
-static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
- bool force)
-{
- struct zonelist *zonelist = ac->zonelist;
- unsigned long flags;
- struct zoneref *z;
- struct zone *zone;
- struct page *page;
- int order;
- int ret;
-
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
- ac->nodemask) {
- /*
- * Preserve at least one pageblock unless memory pressure
- * is really high.
- */
- if (!force && zone->nr_reserved_highatomic <=
- pageblock_nr_pages)
- continue;
-
- spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order < NR_PAGE_ORDERS; order++) {
- struct free_area *area = &(zone->free_area[order]);
- int mt;
-
- page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
- if (!page)
- continue;
-
- mt = get_pageblock_migratetype(page);
- /*
- * In page freeing path, migratetype change is racy so
- * we can counter several free pages in a pageblock
- * in this loop although we changed the pageblock type
- * from highatomic to ac->migratetype. So we should
- * adjust the count once.
- */
- if (is_migrate_highatomic(mt)) {
- unsigned long size;
- /*
- * It should never happen but changes to
- * locking could inadvertently allow a per-cpu
- * drain to add pages to MIGRATE_HIGHATOMIC
- * while unreserving so be safe and watch for
- * underflows.
- */
- size = max(pageblock_nr_pages, 1UL << order);
- size = min(size, zone->nr_reserved_highatomic);
- zone->nr_reserved_highatomic -= size;
- }
-
- /*
- * Convert to ac->migratetype and avoid the normal
- * pageblock stealing heuristics. Minimally, the caller
- * is doing the work and needs the pages. More
- * importantly, if the block was always converted to
- * MIGRATE_UNMOVABLE or another type then the number
- * of pageblocks that cannot be completely freed
- * may increase.
- */
- if (order < pageblock_order)
- ret = move_freepages_block(zone, page, mt,
- ac->migratetype);
- else {
- move_to_free_list(page, zone, order, mt,
- ac->migratetype);
- change_pageblock_range(page, order,
- ac->migratetype);
- ret = 1;
- }
- /*
- * Reserving the block(s) already succeeded,
- * so this should not fail on zone boundaries.
- */
- WARN_ON_ONCE(ret == -1);
- if (ret > 0) {
- spin_unlock_irqrestore(&zone->lock, flags);
- return ret;
- }
- }
- spin_unlock_irqrestore(&zone->lock, flags);
- }
-
- return false;
+ return NULL;
}
/*
- * Try finding a free buddy page on the fallback list and put it on the free
- * list of requested migratetype, possibly along with other pages from the same
- * block, depending on fragmentation avoidance heuristics. Returns true if
- * fallback was found so that __rmqueue_smallest() can grab it.
+ * Try to allocate from some fallback migratetype by claiming the entire block,
+ * i.e. converting it to the allocation's start migratetype.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
*/
static __always_inline struct page *
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+__rmqueue_claim(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
@@ -2189,7 +2368,6 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
int min_order = order;
struct page *page;
int fallback_mt;
- bool can_steal;
/*
* Do not steal pages from freelists belonging to other pageblocks
@@ -2208,62 +2386,73 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &can_steal);
+ start_migratetype, true);
+
+ /* No block in that order */
if (fallback_mt == -1)
continue;
- /*
- * We cannot steal all free pages from the pageblock and the
- * requested migratetype is movable. In that case it's better to
- * steal and split the smallest available page instead of the
- * largest available page, because even if the next movable
- * allocation falls back into a different pageblock than this
- * one, it won't cause permanent fragmentation.
- */
- if (!can_steal && start_migratetype == MIGRATE_MOVABLE
- && current_order > order)
- goto find_smallest;
+ /* Advanced into orders too low to claim, abort */
+ if (fallback_mt == -2)
+ break;
- goto do_steal;
+ page = get_page_from_free_area(area, fallback_mt);
+ page = try_to_claim_block(zone, page, current_order, order,
+ start_migratetype, fallback_mt,
+ alloc_flags);
+ if (page) {
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
+ }
}
return NULL;
+}
+
+/*
+ * Try to steal a single page from some fallback migratetype. Leave the rest of
+ * the block as its current migratetype, potentially causing fragmentation.
+ */
+static __always_inline struct page *
+__rmqueue_steal(struct zone *zone, int order, int start_migratetype)
+{
+ struct free_area *area;
+ int current_order;
+ struct page *page;
+ int fallback_mt;
-find_smallest:
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &can_steal);
- if (fallback_mt != -1)
- break;
- }
-
- /*
- * This should not happen - we already found a suitable fallback
- * when looking for the largest page.
- */
- VM_BUG_ON(current_order > MAX_PAGE_ORDER);
-
-do_steal:
- page = get_page_from_free_area(area, fallback_mt);
-
- /* take off list, maybe claim block, expand remainder */
- page = steal_suitable_fallback(zone, page, current_order, order,
- start_migratetype, alloc_flags, can_steal);
+ start_migratetype, false);
+ if (fallback_mt == -1)
+ continue;
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, fallback_mt);
+ page = get_page_from_free_area(area, fallback_mt);
+ page_del_and_expand(zone, page, order, current_order, fallback_mt);
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
+ }
- return page;
+ return NULL;
}
+enum rmqueue_mode {
+ RMQUEUE_NORMAL,
+ RMQUEUE_CMA,
+ RMQUEUE_CLAIM,
+ RMQUEUE_STEAL,
+};
+
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
- unsigned int alloc_flags)
+ unsigned int alloc_flags, enum rmqueue_mode *mode)
{
struct page *page;
@@ -2282,16 +2471,48 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
}
}
- page = __rmqueue_smallest(zone, order, migratetype);
- if (unlikely(!page)) {
- if (alloc_flags & ALLOC_CMA)
+ /*
+ * First try the freelists of the requested migratetype, then try
+ * fallbacks modes with increasing levels of fragmentation risk.
+ *
+	 * The fallback logic is expensive and rmqueue_bulk() calls this
+	 * function in a loop with the zone->lock held, meaning the
+	 * freelists are not subject to any outside changes. Remember in
+	 * *mode where we found pay dirt, to save us the search on the
+	 * next call.
+ */
+ switch (*mode) {
+ case RMQUEUE_NORMAL:
+ page = __rmqueue_smallest(zone, order, migratetype);
+ if (page)
+ return page;
+ fallthrough;
+ case RMQUEUE_CMA:
+ if (alloc_flags & ALLOC_CMA) {
page = __rmqueue_cma_fallback(zone, order);
-
- if (!page)
- page = __rmqueue_fallback(zone, order, migratetype,
- alloc_flags);
+ if (page) {
+ *mode = RMQUEUE_CMA;
+ return page;
+ }
+ }
+ fallthrough;
+ case RMQUEUE_CLAIM:
+ page = __rmqueue_claim(zone, order, migratetype, alloc_flags);
+ if (page) {
+ /* Replenished preferred freelist, back to normal mode. */
+ *mode = RMQUEUE_NORMAL;
+ return page;
+ }
+ fallthrough;
+ case RMQUEUE_STEAL:
+ if (!(alloc_flags & ALLOC_NOFRAGMENT)) {
+ page = __rmqueue_steal(zone, order, migratetype);
+ if (page) {
+ *mode = RMQUEUE_STEAL;
+ return page;
+ }
+ }
}
- return page;
+ return NULL;
}
/*
@@ -2303,13 +2524,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
unsigned long flags;
int i;
- spin_lock_irqsave(&zone->lock, flags);
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags))
+ return 0;
+ } else {
+ spin_lock_irqsave(&zone->lock, flags);
+ }
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
- alloc_flags);
+ alloc_flags, &rmqm);
if (unlikely(page == NULL))
break;
@@ -2592,9 +2819,9 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
return high;
}
-static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
- struct page *page, int migratetype,
- unsigned int order)
+static void free_frozen_page_commit(struct zone *zone,
+ struct per_cpu_pages *pcp, struct page *page, int migratetype,
+ unsigned int order, fpi_t fpi_flags)
{
int high, batch;
int pindex;
@@ -2619,16 +2846,24 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
* stops will be drained from vmstat refresh context.
*/
if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
- free_high = (pcp->free_count >= batch &&
+ free_high = (pcp->free_count >= (batch + pcp->high_min / 2) &&
(pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
(!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
- pcp->count >= READ_ONCE(batch)));
+ pcp->count >= batch));
pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
}
if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
pcp->free_count += (1 << order);
+
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ /*
+ * Do not attempt to take a zone lock. Let pcp->count get
+ * over high mark temporarily.
+ */
+ return;
+ }
high = nr_pcp_high(pcp, zone, batch, free_high);
if (pcp->count >= high) {
free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
@@ -2643,7 +2878,8 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
/*
* Free a pcp page
*/
-void free_unref_page(struct page *page, unsigned int order)
+static void __free_frozen_pages(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
unsigned long __maybe_unused UP_flags;
struct per_cpu_pages *pcp;
@@ -2652,7 +2888,7 @@ void free_unref_page(struct page *page, unsigned int order)
int migratetype;
if (!pcp_allowed_order(order)) {
- __free_pages_ok(page, order, FPI_NONE);
+ __free_pages_ok(page, order, fpi_flags);
return;
}
@@ -2666,27 +2902,37 @@ void free_unref_page(struct page *page, unsigned int order)
* get those areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
+ zone = page_zone(page);
migratetype = get_pfnblock_migratetype(page, pfn);
if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(page_zone(page), page, pfn, order, FPI_NONE);
+ free_one_page(zone, page, pfn, order, fpi_flags);
return;
}
migratetype = MIGRATE_MOVABLE;
}
- zone = page_zone(page);
+ if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT)
+ && (in_nmi() || in_hardirq()))) {
+ add_page_to_zone_llist(zone, page, order);
+ return;
+ }
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
- free_unref_page_commit(zone, pcp, page, migratetype, order);
+ free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
pcp_spin_unlock(pcp);
} else {
- free_one_page(zone, page, pfn, order, FPI_NONE);
+ free_one_page(zone, page, pfn, order, fpi_flags);
}
pcp_trylock_finish(UP_flags);
}
+void free_frozen_pages(struct page *page, unsigned int order)
+{
+ __free_frozen_pages(page, order, FPI_NONE);
+}
+
/*
* Free a batch of folios
*/
@@ -2743,7 +2989,7 @@ void free_unref_folios(struct folio_batch *folios)
/*
* Free isolated pages directly to the
- * allocator, see comment in free_unref_page.
+ * allocator, see comment in free_frozen_pages.
*/
if (is_migrate_isolate(migratetype)) {
free_one_page(zone, &folio->page, pfn,
@@ -2774,8 +3020,8 @@ void free_unref_folios(struct folio_batch *folios)
migratetype = MIGRATE_MOVABLE;
trace_mm_page_free_batched(&folio->page);
- free_unref_page_commit(zone, pcp, &folio->page, migratetype,
- order);
+ free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
+ order, FPI_NONE);
}
if (pcp) {
@@ -2804,7 +3050,7 @@ void split_page(struct page *page, unsigned int order)
set_page_refcounted(page + i);
split_page_owner(page, order, 0);
pgalloc_tag_split(page_folio(page), order, 0);
- split_page_memcg(page, order, 0);
+ split_page_memcg(page, order);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -2906,11 +3152,18 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
do {
page = NULL;
- spin_lock_irqsave(&zone->lock, flags);
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags))
+ return NULL;
+ } else {
+ spin_lock_irqsave(&zone->lock, flags);
+ }
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
- page = __rmqueue(zone, order, migratetype, alloc_flags);
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
+
+ page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm);
/*
* If the allocation fails, allow OOM handling and
@@ -3059,7 +3312,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
/*
* Do not instrument rmqueue() with KMSAN. This function may call
- * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
+ * __msan_poison_alloca() through a call to set_pfnblock_migratetype().
* If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
* may call rmqueue() again, which will result in a deadlock.
*/
@@ -3094,6 +3347,142 @@ out:
return page;
}
+/*
+ * Reserve the pageblock(s) surrounding an allocation request for
+ * exclusive use of high-order atomic allocations if there are no
+ * empty page blocks that contain a page with a suitable order
+ */
+static void reserve_highatomic_pageblock(struct page *page, int order,
+ struct zone *zone)
+{
+ int mt;
+ unsigned long max_managed, flags;
+
+ /*
+ * The number reserved as: minimum is 1 pageblock, maximum is
+ * roughly 1% of a zone. But if 1% of a zone falls below a
+ * pageblock size, then don't reserve any pageblocks.
+ * Check is race-prone but harmless.
+ */
+ if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages)
+ return;
+ max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages);
+ if (zone->nr_reserved_highatomic >= max_managed)
+ return;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ /* Recheck the nr_reserved_highatomic limit under the lock */
+ if (zone->nr_reserved_highatomic >= max_managed)
+ goto out_unlock;
+
+ /* Yoink! */
+ mt = get_pageblock_migratetype(page);
+ /* Only reserve normal pageblocks (i.e., they can merge with others) */
+ if (!migratetype_is_mergeable(mt))
+ goto out_unlock;
+
+ if (order < pageblock_order) {
+ if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
+ goto out_unlock;
+ zone->nr_reserved_highatomic += pageblock_nr_pages;
+ } else {
+ change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
+ zone->nr_reserved_highatomic += 1 << order;
+ }
+
+out_unlock:
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/*
+ * Used when an allocation is about to fail under memory pressure. This
+ * potentially hurts the reliability of high-order allocations when under
+ * intense memory pressure but failed atomic allocations should be easier
+ * to recover from than an OOM.
+ *
+ * If @force is true, try to unreserve pageblocks even though highatomic
+ * pageblock is exhausted.
+ */
+static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
+ bool force)
+{
+ struct zonelist *zonelist = ac->zonelist;
+ unsigned long flags;
+ struct zoneref *z;
+ struct zone *zone;
+ struct page *page;
+ int order;
+ int ret;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
+ ac->nodemask) {
+ /*
+ * Preserve at least one pageblock unless memory pressure
+ * is really high.
+ */
+ if (!force && zone->nr_reserved_highatomic <=
+ pageblock_nr_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < NR_PAGE_ORDERS; order++) {
+ struct free_area *area = &(zone->free_area[order]);
+ unsigned long size;
+
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
+ if (!page)
+ continue;
+
+ size = max(pageblock_nr_pages, 1UL << order);
+ /*
+ * It should never happen but changes to
+ * locking could inadvertently allow a per-cpu
+ * drain to add pages to MIGRATE_HIGHATOMIC
+ * while unreserving so be safe and watch for
+ * underflows.
+ */
+ if (WARN_ON_ONCE(size > zone->nr_reserved_highatomic))
+ size = zone->nr_reserved_highatomic;
+ zone->nr_reserved_highatomic -= size;
+
+ /*
+ * Convert to ac->migratetype and avoid the normal
+ * pageblock stealing heuristics. Minimally, the caller
+ * is doing the work and needs the pages. More
+ * importantly, if the block was always converted to
+ * MIGRATE_UNMOVABLE or another type then the number
+ * of pageblocks that cannot be completely freed
+ * may increase.
+ */
+ if (order < pageblock_order)
+ ret = move_freepages_block(zone, page,
+ MIGRATE_HIGHATOMIC,
+ ac->migratetype);
+ else {
+ move_to_free_list(page, zone, order,
+ MIGRATE_HIGHATOMIC,
+ ac->migratetype);
+ change_pageblock_range(page, order,
+ ac->migratetype);
+ ret = 1;
+ }
+ /*
+ * Reserving the block(s) already succeeded,
+ * so this should not fail on zone boundaries.
+ */
+ WARN_ON_ONCE(ret == -1);
+ if (ret > 0) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+ return false;
+}
+
static inline long __zone_watermark_unusable_free(struct zone *z,
unsigned int order, unsigned int alloc_flags)
{
@@ -3251,18 +3640,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
return false;
}
-bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int highest_zoneidx)
-{
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
-
- if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
- free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
-
- return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
- free_pages);
-}
-
#ifdef CONFIG_NUMA
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
@@ -3297,6 +3674,11 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
*/
alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
+ if (defrag_mode) {
+ alloc_flags |= ALLOC_NOFRAGMENT;
+ return alloc_flags;
+ }
+
#ifdef CONFIG_ZONE_DMA32
if (!zone)
return alloc_flags;
@@ -3346,7 +3728,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
retry:
/*
* Scan zonelist, looking for a zone with enough free.
- * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
+ * See also cpuset_current_node_allowed() comment in kernel/cgroup/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
@@ -3388,7 +3770,7 @@ retry:
continue;
}
- if (no_fallback && nr_online_nodes > 1 &&
+ if (no_fallback && !defrag_mode && nr_online_nodes > 1 &&
zone != zonelist_zone(ac->preferred_zoneref)) {
int local_nid;
@@ -3404,7 +3786,7 @@ retry:
}
}
- cond_accept_memory(zone, order);
+ cond_accept_memory(zone, order, alloc_flags);
/*
* Detect whether the number of free pages is below high
@@ -3431,7 +3813,7 @@ check_alloc_wmark:
gfp_mask)) {
int ret;
- if (cond_accept_memory(zone, order))
+ if (cond_accept_memory(zone, order, alloc_flags))
goto try_this_zone;
/*
@@ -3484,7 +3866,7 @@ try_this_zone:
return page;
} else {
- if (cond_accept_memory(zone, order))
+ if (cond_accept_memory(zone, order, alloc_flags))
goto try_this_zone;
/* Try again if zone has deferred pages */
@@ -3499,7 +3881,7 @@ try_this_zone:
* It's possible on a UMA machine to get through all zones that are
* fragmented. If avoiding fragmentation, reset and try again.
*/
- if (no_fallback) {
+ if (no_fallback && !defrag_mode) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
@@ -3567,7 +3949,6 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
if (!page)
page = get_page_from_freelist(gfp_mask, order,
alloc_flags, ac);
-
return page;
}
@@ -3979,15 +4360,21 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
struct zone *zone;
pg_data_t *last_pgdat = NULL;
enum zone_type highest_zoneidx = ac->highest_zoneidx;
+ unsigned int reclaim_order;
+
+ if (defrag_mode)
+ reclaim_order = max(order, pageblock_order);
+ else
+ reclaim_order = order;
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
ac->nodemask) {
if (!managed_zone(zone))
continue;
- if (last_pgdat != zone->zone_pgdat) {
- wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
- last_pgdat = zone->zone_pgdat;
- }
+ if (last_pgdat == zone->zone_pgdat)
+ continue;
+ wakeup_kswapd(zone, gfp_mask, reclaim_order, highest_zoneidx);
+ last_pgdat = zone->zone_pgdat;
}
}
@@ -4021,14 +4408,14 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
alloc_flags |= ALLOC_NON_BLOCK;
- if (order > 0)
+ if (order > 0 && (alloc_flags & ALLOC_MIN_RESERVE))
alloc_flags |= ALLOC_HIGHATOMIC;
}
/*
* Ignore cpuset mems for non-blocking __GFP_HIGH (probably
* GFP_ATOMIC) rather than fail, see the comment for
- * cpuset_node_allowed().
+ * cpuset_current_node_allowed().
*/
if (alloc_flags & ALLOC_MIN_RESERVE)
alloc_flags &= ~ALLOC_CPUSET;
@@ -4037,6 +4424,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
+ if (defrag_mode)
+ alloc_flags |= ALLOC_NOFRAGMENT;
+
return alloc_flags;
}
@@ -4346,6 +4736,14 @@ restart:
}
retry:
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * infinite retries.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
+
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
@@ -4419,6 +4817,11 @@ retry:
&compaction_retries))
goto retry;
+ /* Reclaim/compaction failed to prevent the fallback */
+ if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT)) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
+ goto retry;
+ }
/*
* Deal with possible cpuset update races or zonelist updates to avoid
@@ -4512,7 +4915,12 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
might_alloc(gfp_mask);
- if (should_fail_alloc_page(gfp_mask, order))
+ /*
+ * Don't invoke should_fail logic, since it may call
+ * get_random_u32() and printk() which need to spin_lock.
+ */
+ if (!(*alloc_flags & ALLOC_TRYLOCK) &&
+ should_fail_alloc_page(gfp_mask, order))
return false;
*alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
@@ -4532,28 +4940,23 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
}
/*
- * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
+ * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
* @gfp: GFP flags for the allocation
* @preferred_nid: The preferred NUMA node ID to allocate from
* @nodemask: Set of nodes to allocate from, may be NULL
- * @nr_pages: The number of pages desired on the list or array
- * @page_list: Optional list to store the allocated pages
- * @page_array: Optional array to store the pages
+ * @nr_pages: The number of pages desired in the array
+ * @page_array: Array to store the pages
*
* This is a batched version of the page allocator that attempts to
- * allocate nr_pages quickly. Pages are added to page_list if page_list
- * is not NULL, otherwise it is assumed that the page_array is valid.
+ * allocate nr_pages quickly. Pages are added to the page_array.
*
- * For lists, nr_pages is the number of pages that should be allocated.
- *
- * For arrays, only NULL elements are populated with pages and nr_pages
+ * Note that only NULL elements are populated with pages and nr_pages
* is the maximum number of pages that will be stored in the array.
*
- * Returns the number of pages on the list or array.
+ * Returns the number of pages in the array.
*/
unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,
- struct list_head *page_list,
struct page **page_array)
{
struct page *page;
@@ -4571,7 +4974,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
* Skip populated array elements to determine if any pages need
* to be allocated before disabling IRQs.
*/
- while (page_array && nr_populated < nr_pages && page_array[nr_populated])
+ while (nr_populated < nr_pages && page_array[nr_populated])
nr_populated++;
/* No pages requested? */
@@ -4579,7 +4982,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
goto out;
/* Already populated array? */
- if (unlikely(page_array && nr_pages - nr_populated == 0))
+ if (unlikely(nr_pages - nr_populated == 0))
goto out;
/* Bulk allocator does not support memcg accounting. */
@@ -4624,7 +5027,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
goto failed;
}
- cond_accept_memory(zone, 0);
+ cond_accept_memory(zone, 0, alloc_flags);
retry_this_zone:
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
if (zone_watermark_fast(zone, 0, mark,
@@ -4633,7 +5036,7 @@ retry_this_zone:
break;
}
- if (cond_accept_memory(zone, 0))
+ if (cond_accept_memory(zone, 0, alloc_flags))
goto retry_this_zone;
/* Try again if zone has deferred pages */
@@ -4661,7 +5064,7 @@ retry_this_zone:
while (nr_populated < nr_pages) {
/* Skip existing pages */
- if (page_array && page_array[nr_populated]) {
+ if (page_array[nr_populated]) {
nr_populated++;
continue;
}
@@ -4679,11 +5082,8 @@ retry_this_zone:
nr_account++;
prep_new_page(page, 0, gfp, 0);
- if (page_list)
- list_add(&page->lru, page_list);
- else
- page_array[nr_populated] = page;
- nr_populated++;
+ set_page_refcounted(page);
+ page_array[nr_populated++] = page;
}
pcp_spin_unlock(pcp);
@@ -4700,14 +5100,8 @@ failed_irq:
failed:
page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask);
- if (page) {
- if (page_list)
- list_add(&page->lru, page_list);
- else
- page_array[nr_populated] = page;
- nr_populated++;
- }
-
+ if (page)
+ page_array[nr_populated++] = page;
goto out;
}
EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
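After this change the bulk allocator is array-only: the page_list variant is gone, nr_pages is simply the array size, and already-populated (non-NULL) slots are skipped. A minimal usage sketch; it calls the _noprof entry point directly with NUMA_NO_NODE and a NULL nodemask to stay within what this hunk defines, whereas real callers normally go through the alloc_pages_bulk*() alloc_hooks wrappers (an assumption about headers not shown here), and the helper name is illustrative:

/* Sketch: fill up to 'want' order-0 pages into an array; partial results are fine. */
static int grab_some_pages(struct page **pages, int want)
{
	int got = alloc_pages_bulk_noprof(GFP_KERNEL, NUMA_NO_NODE, NULL,
					  want, pages);

	/* got counts all non-NULL slots, including any that were pre-populated */
	return got;
}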
@@ -4715,8 +5109,8 @@ EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
/*
* This is the 'heart' of the zoned buddy allocator.
*/
-struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
- int preferred_nid, nodemask_t *nodemask)
+struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order,
+ int preferred_nid, nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
@@ -4769,7 +5163,7 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
out:
if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
- __free_pages(page, order);
+ free_frozen_pages(page, order);
page = NULL;
}
@@ -4778,6 +5172,18 @@ out:
return page;
}
+EXPORT_SYMBOL(__alloc_frozen_pages_noprof);
+
+struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
+ int preferred_nid, nodemask_t *nodemask)
+{
+ struct page *page;
+
+ page = __alloc_frozen_pages_noprof(gfp, order, preferred_nid, nodemask);
+ if (page)
+ set_page_refcounted(page);
+ return page;
+}
EXPORT_SYMBOL(__alloc_pages_noprof);
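The split above means __alloc_frozen_pages_noprof() returns pages with a refcount of zero, and __alloc_pages_noprof() is now just that plus set_page_refcounted(). A frozen page must not be released with put_page()/__free_pages(); it goes back through free_frozen_pages(). A short sketch using only the entry points defined in this hunk; real users are expected to go through alloc_frozen_pages()-style wrappers, which is an assumption about headers not shown here, and the function name is illustrative:

/* Sketch: allocate an order-2 frozen page block, use it, release it. */
static void frozen_page_roundtrip(void)
{
	struct page *page;

	page = __alloc_frozen_pages_noprof(GFP_KERNEL, 2, NUMA_NO_NODE, NULL);
	if (!page)
		return;

	/* page_count(page) == 0 here; no put_page()/__free_pages() allowed */

	free_frozen_pages(page, 2);
}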
struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
@@ -4811,6 +5217,24 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
}
EXPORT_SYMBOL(get_zeroed_page_noprof);
+static void ___free_pages(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
+{
+ /* get PageHead before we drop reference */
+ int head = PageHead(page);
+ /* get alloc tag in case the page is released by others */
+ struct alloc_tag *tag = pgalloc_tag_get(page);
+
+ if (put_page_testzero(page))
+ __free_frozen_pages(page, order, fpi_flags);
+ else if (!head) {
+ pgalloc_tag_sub_pages(tag, (1 << order) - 1);
+ while (order-- > 0)
+ __free_frozen_pages(page + (1 << order), order,
+ fpi_flags);
+ }
+}
+
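___free_pages() above keeps the long-standing __free_pages() behaviour for non-compound high-order pages: if put_page_testzero() fails because a speculative reference is still held, only page 0 is left to that reference holder and the rest of the block is split off and freed immediately, with pgalloc_tag_sub_pages() untagging those split-off pages up front. A worked trace for an assumed order-3 (8-page) non-compound allocation, written as a comment since it only restates the loop above:

/*
 * Worked example (order = 3, non-compound, racing speculative reference):
 *
 *   put_page_testzero(page) fails            -> someone else still holds a ref
 *   pgalloc_tag_sub_pages(tag, (1 << 3) - 1) -> untag the 7 pages freed now
 *   order = 2: __free_frozen_pages(page + 4, 2)  frees pages 4..7
 *   order = 1: __free_frozen_pages(page + 2, 1)  frees pages 2..3
 *   order = 0: __free_frozen_pages(page + 1, 0)  frees page  1
 *
 * Page 0 stays allocated until the speculative reference holder drops it.
 */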
/**
* __free_pages - Free pages allocated with alloc_pages().
* @page: The page pointer returned from alloc_pages().
@@ -4833,20 +5257,19 @@ EXPORT_SYMBOL(get_zeroed_page_noprof);
*/
void __free_pages(struct page *page, unsigned int order)
{
- /* get PageHead before we drop reference */
- int head = PageHead(page);
- struct alloc_tag *tag = pgalloc_tag_get(page);
-
- if (put_page_testzero(page))
- free_unref_page(page, order);
- else if (!head) {
- pgalloc_tag_sub_pages(tag, (1 << order) - 1);
- while (order-- > 0)
- free_unref_page(page + (1 << order), order);
- }
+ ___free_pages(page, order, FPI_NONE);
}
EXPORT_SYMBOL(__free_pages);
+/*
+ * Can be called while holding raw_spin_lock or from IRQ and NMI for any
+ * Can be called while holding a raw spinlock, or from IRQ and NMI context,
+ * for any page type (not only pages that came from alloc_pages_nolock()).
+void free_pages_nolock(struct page *page, unsigned int order)
+{
+ ___free_pages(page, order, FPI_TRYLOCK);
+}
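free_pages_nolock() maps onto FPI_TRYLOCK internally, so if neither the pcp lock nor zone->lock can be taken the page is parked on the zone's deferred llist (see free_one_page() earlier in this patch) instead of spinning or sleeping. A hedged sketch of a caller in a restricted context; the raw spinlock and the function name are illustrative and not part of the patch:

/* Sketch: release a previously allocated page from a restricted context. */
static DEFINE_RAW_SPINLOCK(demo_lock);	/* illustrative lock, not in the patch */

static void release_from_atomic_context(struct page *page)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&demo_lock, flags);
	/*
	 * __free_pages()/free_frozen_pages() may take zone->lock or the pcp
	 * lock; free_pages_nolock() only trylocks and defers on contention.
	 */
	free_pages_nolock(page, 0);
	raw_spin_unlock_irqrestore(&demo_lock, flags);
}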
+
void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
@@ -4867,7 +5290,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
split_page_owner(page, order, 0);
pgalloc_tag_split(page_folio(page), order, 0);
- split_page_memcg(page, order, 0);
+ split_page_memcg(page, order);
while (page < --last)
set_page_refcounted(last);
@@ -5162,13 +5585,6 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
zonerefs->zone_idx = 0;
}
-/*
- * Build zonelists ordered by zone and nodes within zones.
- * This results in conserving DMA zone[s] until all Normal memory is
- * exhausted, but results in overflowing to remote node while memory
- * may still exist in local DMA zone.
- */
-
static void build_zonelists(pg_data_t *pgdat)
{
static int node_order[MAX_NUMNODES];
@@ -5837,6 +6253,7 @@ static void calculate_totalreserve_pages(void)
}
}
totalreserve_pages = reserve_pages;
+ trace_mm_calculate_totalreserve_pages(totalreserve_pages);
}
/*
@@ -5866,6 +6283,8 @@ static void setup_per_zone_lowmem_reserve(void)
zone->lowmem_reserve[j] = 0;
else
zone->lowmem_reserve[j] = managed_pages / ratio;
+ trace_mm_setup_per_zone_lowmem_reserve(zone, upper_zone,
+ zone->lowmem_reserve[j]);
}
}
}
@@ -5929,6 +6348,7 @@ static void __setup_per_zone_wmarks(void)
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
+ trace_mm_setup_per_zone_wmarks(zone);
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -6175,7 +6595,7 @@ out:
return ret;
}
-static struct ctl_table page_alloc_sysctl_table[] = {
+static const struct ctl_table page_alloc_sysctl_table[] = {
{
.procname = "min_free_kbytes",
.data = &min_free_kbytes,
@@ -6202,6 +6622,15 @@ static struct ctl_table page_alloc_sysctl_table[] = {
.extra2 = SYSCTL_THREE_THOUSAND,
},
{
+ .procname = "defrag_mode",
+ .data = &defrag_mode,
+ .maxlen = sizeof(defrag_mode),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "percpu_pagelist_high_fraction",
.data = &percpu_pagelist_high_fraction,
.maxlen = sizeof(percpu_pagelist_high_fraction),
@@ -6265,14 +6694,9 @@ static void alloc_contig_dump_pages(struct list_head *page_list)
}
}
-/*
- * [start, end) must belong to a single zone.
- * @migratetype: using migratetype to filter the type of migration in
- * trace_mm_alloc_contig_migrate_range_info.
- */
-int __alloc_contig_migrate_range(struct compact_control *cc,
- unsigned long start, unsigned long end,
- int migratetype)
+/* [start, end) must belong to a single zone. */
+static int __alloc_contig_migrate_range(struct compact_control *cc,
+ unsigned long start, unsigned long end)
{
/* This function is based on compact_zone() from compaction.c. */
unsigned int nr_reclaimed;
@@ -6281,13 +6705,9 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
int ret = 0;
struct migration_target_control mtc = {
.nid = zone_to_nid(cc->zone),
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .gfp_mask = cc->gfp_mask,
.reason = MR_CONTIG_RANGE,
};
- struct page *page;
- unsigned long total_mapped = 0;
- unsigned long total_migrated = 0;
- unsigned long total_reclaimed = 0;
lru_cache_disable();
@@ -6313,22 +6733,9 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
&cc->migratepages);
cc->nr_migratepages -= nr_reclaimed;
- if (trace_mm_alloc_contig_migrate_range_info_enabled()) {
- total_reclaimed += nr_reclaimed;
- list_for_each_entry(page, &cc->migratepages, lru) {
- struct folio *folio = page_folio(page);
-
- total_mapped += folio_mapped(folio) *
- folio_nr_pages(folio);
- }
- }
-
ret = migrate_pages(&cc->migratepages, alloc_migration_target,
NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
- if (trace_mm_alloc_contig_migrate_range_info_enabled() && !ret)
- total_migrated += cc->nr_migratepages;
-
/*
* On -ENOMEM, migrate_pages() bails out right away. It is pointless
* to retry again over this error, so do the same here.
@@ -6344,14 +6751,10 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
putback_movable_pages(&cc->migratepages);
}
- trace_mm_alloc_contig_migrate_range_info(start, end, migratetype,
- total_migrated,
- total_reclaimed,
- total_mapped);
return (ret < 0) ? ret : 0;
}
-static void split_free_pages(struct list_head *list)
+static void split_free_pages(struct list_head *list, gfp_t gfp_mask)
{
int order;
@@ -6362,7 +6765,8 @@ static void split_free_pages(struct list_head *list)
list_for_each_entry_safe(page, next, &list[order], lru) {
int i;
- post_alloc_hook(page, order, __GFP_MOVABLE);
+ post_alloc_hook(page, order, gfp_mask);
+ set_page_refcounted(page);
if (!order)
continue;
@@ -6376,15 +6780,48 @@ static void split_free_pages(struct list_head *list)
}
}
+static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
+{
+ const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
+ const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN |
+ __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO;
+ const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
+
+ /*
+ * We are given the range to allocate; node, mobility and placement
+ * hints are irrelevant at this point. We'll simply ignore them.
+ */
+ gfp_mask &= ~(GFP_ZONEMASK | __GFP_RECLAIMABLE | __GFP_WRITE |
+ __GFP_HARDWALL | __GFP_THISNODE | __GFP_MOVABLE);
+
+ /*
+ * We only support most reclaim flags (but not NOFAIL/NORETRY), and
+ * selected action flags.
+ */
+ if (gfp_mask & ~(reclaim_mask | action_mask))
+ return -EINVAL;
+
+ /*
+ * Flags to control page compaction/migration/reclaim, to free up our
+ * page range. Migratable pages are movable, __GFP_MOVABLE is implied
+ * for them.
+ *
+ * Traditionally we always had __GFP_RETRY_MAYFAIL set, keep doing that
+ * to not degrade callers.
+ */
+ *gfp_cc_mask = (gfp_mask & (reclaim_mask | cc_action_mask)) |
+ __GFP_MOVABLE | __GFP_RETRY_MAYFAIL;
+ return 0;
+}
+
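As an illustrative sketch of the filtering above (the helper call is taken from the function just shown; the example function itself is hypothetical and not part of the patch), GFP_KERNEL-style masks pass while unsupported flags such as __GFP_NOFAIL are rejected:

static void __maybe_unused alloc_contig_gfp_mask_examples(void)
{
	gfp_t cc_gfp;

	/* Accepted: GFP_KERNEL carries only reclaim flags, __GFP_NOWARN is an action flag. */
	WARN_ON(__alloc_contig_verify_gfp_mask(GFP_KERNEL | __GFP_NOWARN, &cc_gfp) != 0);

	/* Rejected: __GFP_NOFAIL is neither a supported reclaim nor an action flag. */
	WARN_ON(__alloc_contig_verify_gfp_mask(GFP_KERNEL | __GFP_NOFAIL, &cc_gfp) != -EINVAL);
}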
/**
* alloc_contig_range() -- tries to allocate given range of pages
* @start: start PFN to allocate
* @end: one-past-the-last PFN to allocate
- * @migratetype: migratetype of the underlying pageblocks (either
- * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
- * in range must have the same migratetype and it must
- * be either of the two.
- * @gfp_mask: GFP mask to use during compaction
+ * @alloc_flags: how the range is being allocated; e.g. #ACR_FLAGS_CMA for a
+ *		CMA allocation, #ACR_FLAGS_NONE otherwise
+ * @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some
+ * action and reclaim modifiers are supported. Reclaim modifiers
+ * control allocation behavior during compaction/migration/reclaim.
*
* The PFN range does not have to be pageblock aligned. The PFN range must
* belong to a single zone.
@@ -6398,7 +6835,7 @@ static void split_free_pages(struct list_head *list)
* need to be freed with free_contig_range().
*/
int alloc_contig_range_noprof(unsigned long start, unsigned long end,
- unsigned migratetype, gfp_t gfp_mask)
+ acr_flags_t alloc_flags, gfp_t gfp_mask)
{
unsigned long outer_start, outer_end;
int ret = 0;
@@ -6410,10 +6847,16 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
.no_set_skip_hint = true,
- .gfp_mask = current_gfp_context(gfp_mask),
.alloc_contig = true,
};
INIT_LIST_HEAD(&cc.migratepages);
+ enum pb_isolate_mode mode = (alloc_flags & ACR_FLAGS_CMA) ?
+ PB_ISOLATE_MODE_CMA_ALLOC :
+ PB_ISOLATE_MODE_OTHER;
+
+ gfp_mask = current_gfp_context(gfp_mask);
+ if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask))
+ return -EINVAL;
/*
* What we do here is we mark all pageblocks in range as
@@ -6436,7 +6879,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
* put back to page allocator so that buddy can use them.
*/
- ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
+ ret = start_isolate_page_range(start, end, mode);
if (ret)
goto done;
@@ -6452,10 +6895,20 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
* allocated. So, if we fall through be sure to clear ret so that
* -EBUSY is not accidentally used or returned to caller.
*/
- ret = __alloc_contig_migrate_range(&cc, start, end, migratetype);
+ ret = __alloc_contig_migrate_range(&cc, start, end);
if (ret && ret != -EBUSY)
goto done;
- ret = 0;
+
+ /*
+ * When in-use hugetlb pages are migrated, they may simply be released
+ * back into the free hugepage pool instead of being returned to the
+ * buddy system. After the migration of in-use huge pages is completed,
+ * we will invoke replace_free_hugepage_folios() to ensure that these
+ * hugepages are properly released to the buddy system.
+ */
+ ret = replace_free_hugepage_folios(start, end);
+ if (ret)
+ goto done;
/*
* Pages from [start, end) are within a pageblock_nr_pages
@@ -6476,7 +6929,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
outer_start = find_large_buddy(start);
/* Make sure the range is really isolated. */
- if (test_pages_isolated(outer_start, end, 0)) {
+ if (test_pages_isolated(outer_start, end, mode)) {
ret = -EBUSY;
goto done;
}
@@ -6489,7 +6942,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
}
if (!(gfp_mask & __GFP_COMP)) {
- split_free_pages(cc.freepages);
+ split_free_pages(cc.freepages, gfp_mask);
/* Free head and tail (if any) */
if (start != outer_start)
@@ -6502,13 +6955,14 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
check_new_pages(head, order);
prep_new_page(head, order, gfp_mask, 0);
+ set_page_refcounted(head);
} else {
ret = -EINVAL;
WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n",
start, end, outer_start, outer_end);
}
done:
- undo_isolate_page_range(start, end, migratetype);
+ undo_isolate_page_range(start, end);
return ret;
}
EXPORT_SYMBOL(alloc_contig_range_noprof);
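A minimal caller sketch for the new calling convention (hypothetical helper; assumes the usual alloc_contig_range()/free_contig_range() wrappers and a PFN range that lies within one zone):

static int grab_and_release_range(unsigned long start_pfn, unsigned long nr_pages)
{
	int ret;

	/* GFP_KERNEL | __GFP_NOWARN passes __alloc_contig_verify_gfp_mask(). */
	ret = alloc_contig_range(start_pfn, start_pfn + nr_pages,
				 ACR_FLAGS_NONE, GFP_KERNEL | __GFP_NOWARN);
	if (ret)
		return ret;	/* commonly -EBUSY or -ENOMEM */

	/* ... use the now-allocated range ... */

	free_contig_range(start_pfn, nr_pages);
	return 0;
}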
@@ -6518,8 +6972,8 @@ static int __alloc_contig_pages(unsigned long start_pfn,
{
unsigned long end_pfn = start_pfn + nr_pages;
- return alloc_contig_range_noprof(start_pfn, end_pfn, MIGRATE_MOVABLE,
- gfp_mask);
+ return alloc_contig_range_noprof(start_pfn, end_pfn, ACR_FLAGS_NONE,
+ gfp_mask);
}
static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
@@ -6556,7 +7010,9 @@ static bool zone_spans_last_pfn(const struct zone *zone,
/**
* alloc_contig_pages() -- tries to find and allocate contiguous range of pages
* @nr_pages: Number of contiguous pages to allocate
- * @gfp_mask: GFP mask to limit search and used during compaction
+ * @gfp_mask: GFP mask. Node/zone/placement hints limit the search; only some
+ * action and reclaim modifiers are supported. Reclaim modifiers
+ * control allocation behavior during compaction/migration/reclaim.
* @nid: Target node
* @nodemask: Mask for other possible nodes
*
@@ -6873,9 +7329,6 @@ bool has_managed_dma(void)
#ifdef CONFIG_UNACCEPTED_MEMORY
-/* Counts number of zones with unaccepted pages. */
-static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
-
static bool lazy_accept = true;
static int __init accept_memory_parse(char *p)
@@ -6902,11 +7355,7 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order)
static void __accept_page(struct zone *zone, unsigned long *flags,
struct page *page)
{
- bool last;
-
list_del(&page->lru);
- last = list_empty(&zone->unaccepted_pages);
-
account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
__ClearPageUnaccepted(page);
@@ -6915,9 +7364,6 @@ static void __accept_page(struct zone *zone, unsigned long *flags,
accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER);
__free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
-
- if (last)
- static_branch_dec(&zones_with_unaccepted_pages);
}
void accept_page(struct page *page)
@@ -6954,24 +7400,31 @@ static bool try_to_accept_memory_one(struct zone *zone)
return true;
}
-static inline bool has_unaccepted_memory(void)
-{
- return static_branch_unlikely(&zones_with_unaccepted_pages);
-}
-
-static bool cond_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags)
{
- long to_accept;
+ long to_accept, wmark;
bool ret = false;
- if (!has_unaccepted_memory())
+ if (list_empty(&zone->unaccepted_pages))
return false;
- if (list_empty(&zone->unaccepted_pages))
+ /* Bailout, since try_to_accept_memory_one() needs to take a lock */
+ if (alloc_flags & ALLOC_TRYLOCK)
return false;
+ wmark = promo_wmark_pages(zone);
+
+ /*
+ * Watermarks have not been initialized yet.
+ *
+	 * Accept one MAX_ORDER page to ensure progress.
+ */
+ if (!wmark)
+ return try_to_accept_memory_one(zone);
+
/* How much to accept to get to promo watermark? */
- to_accept = promo_wmark_pages(zone) -
+ to_accept = wmark -
(zone_page_state(zone, NR_FREE_PAGES) -
__zone_watermark_unusable_free(zone, order, 0) -
zone_page_state(zone, NR_UNACCEPTED));
@@ -6990,22 +7443,17 @@ static bool __free_unaccepted(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long flags;
- bool first = false;
if (!lazy_accept)
return false;
spin_lock_irqsave(&zone->lock, flags);
- first = list_empty(&zone->unaccepted_pages);
list_add_tail(&page->lru, &zone->unaccepted_pages);
account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
__SetPageUnaccepted(page);
spin_unlock_irqrestore(&zone->lock, flags);
- if (first)
- static_branch_inc(&zones_with_unaccepted_pages);
-
return true;
}
@@ -7016,7 +7464,8 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order)
return false;
}
-static bool cond_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags)
{
return false;
}
@@ -7028,3 +7477,93 @@ static bool __free_unaccepted(struct page *page)
}
#endif /* CONFIG_UNACCEPTED_MEMORY */
+
+/**
+ * alloc_pages_nolock - opportunistic reentrant allocation from any context
+ * @nid: node to allocate from
+ * @order: allocation order size
+ *
+ * Allocates pages of a given order from the given node. This is safe to
+ * call from any context (from atomic, NMI, and also reentrant
+ * allocator -> tracepoint -> alloc_pages_nolock_noprof).
+ * Allocation is best effort and expected to fail easily, so nobody should
+ * rely on its success. Failures are not reported via warn_alloc().
+ * See the always-fail conditions below.
+ *
+ * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN.
+ * It means ENOMEM. There is no reason to call it again and expect !NULL.
+ */
+struct page *alloc_pages_nolock_noprof(int nid, unsigned int order)
+{
+ /*
+	 * Do not specify __GFP_DIRECT_RECLAIM, since direct reclaim is not allowed.
+ * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd
+ * is not safe in arbitrary context.
+ *
+ * These two are the conditions for gfpflags_allow_spinning() being true.
+ *
+	 * Specify __GFP_NOWARN since failing alloc_pages_nolock() is not a reason
+	 * to warn. Also, warning would trigger printk(), which is unsafe from
+	 * various contexts. We cannot use printk_deferred_enter() to mitigate,
+	 * since the running context is unknown.
+ *
+ * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below
+ * is safe in any context. Also zeroing the page is mandatory for
+ * BPF use cases.
+ *
+ * Though __GFP_NOMEMALLOC is not checked in the code path below,
+ * specify it here to highlight that alloc_pages_nolock()
+ * doesn't want to deplete reserves.
+ */
+ gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC
+ | __GFP_ACCOUNT;
+ unsigned int alloc_flags = ALLOC_TRYLOCK;
+ struct alloc_context ac = { };
+ struct page *page;
+
+ /*
+ * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
+ * unsafe in NMI. If spin_trylock() is called from hard IRQ the current
+ * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will
+ * mark the task as the owner of another rt_spin_lock which will
+	 * confuse PI logic, so return immediately if called from hard IRQ or
+ * NMI.
+ *
+	 * Note, the irqs_disabled() case is ok. This function can be called
+	 * from a raw_spin_lock_irqsave() region.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
+ return NULL;
+ if (!pcp_allowed_order(order))
+ return NULL;
+
+ /* Bailout, since _deferred_grow_zone() needs to take a lock */
+ if (deferred_pages_enabled())
+ return NULL;
+
+ if (nid == NUMA_NO_NODE)
+ nid = numa_node_id();
+
+ prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
+ &alloc_gfp, &alloc_flags);
+
+ /*
+ * Best effort allocation from percpu free list.
+	 * If it's empty, attempt to spin_trylock zone->lock.
+ */
+ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
+
+ /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
+
+ if (page)
+ set_page_refcounted(page);
+
+ if (memcg_kmem_online() && page &&
+ unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) {
+ free_pages_nolock(page, order);
+ page = NULL;
+ }
+ trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
+ kmsan_alloc_page(page, order, alloc_gfp);
+ return page;
+}
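A hedged usage sketch (hypothetical caller; assumes the alloc_pages_nolock()/free_pages_nolock() wrappers referenced in the comments above keep the signature shown here):

static int touch_scratch_page(void)
{
	/* Safe even from NMI or tracepoint context; no gfp flags to pass. */
	struct page *page = alloc_pages_nolock(NUMA_NO_NODE, 0);

	if (!page)
		return -ENOMEM;	/* best effort: NULL means ENOMEM, retrying won't help */

	/* The page is zeroed (__GFP_ZERO is implied); use it here. */

	free_pages_nolock(page, 0);
	return 0;
}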
diff --git a/mm/page_counter.c b/mm/page_counter.c
index b249d15af9dd..661e0f2a5127 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -121,6 +121,7 @@ bool page_counter_try_charge(struct page_counter *counter,
{
struct page_counter *c;
bool protection = track_protection(counter);
+ bool track_failcnt = counter->track_failcnt;
for (c = counter; c; c = c->parent) {
long new;
@@ -146,7 +147,8 @@ bool page_counter_try_charge(struct page_counter *counter,
* inaccuracy in the failcnt which is only used
* to report stats.
*/
- data_race(c->failcnt++);
+ if (track_failcnt)
+ data_race(c->failcnt++);
*fail = c;
goto failed;
}
@@ -288,7 +290,7 @@ int page_counter_memparse(const char *buf, const char *max,
}
-#ifdef CONFIG_MEMCG
+#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
/*
* This function calculates an individual page counter's effective
* protection which is derived from its own memory.min/low, its
@@ -460,4 +462,4 @@ void page_counter_calculate_protection(struct page_counter *root,
atomic_long_read(&parent->children_low_usage),
recursive_protection));
}
-#endif /* CONFIG_MEMCG */
+#endif /* CONFIG_MEMCG || CONFIG_CGROUP_DMEM */
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 641d93f6af4c..d7396a8970e5 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -369,25 +369,15 @@ static void __invalidate_page_ext(unsigned long pfn)
}
static int __meminit online_page_ext(unsigned long start_pfn,
- unsigned long nr_pages,
- int nid)
+ unsigned long nr_pages)
{
+ int nid = pfn_to_nid(start_pfn);
unsigned long start, end, pfn;
int fail = 0;
start = SECTION_ALIGN_DOWN(start_pfn);
end = SECTION_ALIGN_UP(start_pfn + nr_pages);
- if (nid == NUMA_NO_NODE) {
- /*
- * In this case, "nid" already exists and contains valid memory.
- * "start_pfn" passed to us is a pfn which is an arg for
- * online__pages(), and start_pfn should exist.
- */
- nid = pfn_to_nid(start_pfn);
- VM_BUG_ON(!node_online(nid));
- }
-
for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
fail = init_section_page_ext(pfn, nid);
if (!fail)
@@ -435,8 +425,7 @@ static int __meminit page_ext_callback(struct notifier_block *self,
switch (action) {
case MEM_GOING_ONLINE:
- ret = online_page_ext(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
+ ret = online_page_ext(mn->start_pfn, mn->nr_pages);
break;
case MEM_OFFLINE:
offline_page_ext(mn->start_pfn,
@@ -508,6 +497,19 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
#endif
/**
+ * page_ext_lookup() - Lookup a page extension for a PFN.
+ * @pfn: PFN of the page we're interested in.
+ *
+ * Must be called with RCU read lock taken and @pfn must be valid.
+ *
+ * Return: the page extension for @pfn, or NULL if no page_ext exists for it.
+ */
+struct page_ext *page_ext_lookup(unsigned long pfn)
+{
+ return lookup_page_ext(pfn_to_page(pfn));
+}
+
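A minimal sketch of the intended calling convention (hypothetical helper; the locking requirement comes from the kernel-doc above):

static bool pfn_has_page_ext(unsigned long pfn)
{
	struct page_ext *page_ext;
	bool ret;

	rcu_read_lock();			/* required by page_ext_lookup() */
	page_ext = page_ext_lookup(pfn);	/* @pfn must be valid */
	ret = page_ext != NULL;
	rcu_read_unlock();

	return ret;
}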
+/**
* page_ext_get() - Get the extended information for a page.
* @page: The page we're interested in.
*
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 3f7a203d35c6..d2423f30577e 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -86,7 +86,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
if (page_ref_sub_and_test(page, count))
- free_unref_page(page, compound_order(page));
+ free_frozen_pages(page, compound_order(page));
}
EXPORT_SYMBOL(__page_frag_cache_drain);
@@ -138,7 +138,7 @@ refill:
goto refill;
if (unlikely(encoded_page_decode_pfmemalloc(encoded_page))) {
- free_unref_page(page,
+ free_frozen_pages(page,
encoded_page_decode_order(encoded_page));
goto refill;
}
@@ -166,6 +166,6 @@ void page_frag_free(void *addr)
struct page *page = virt_to_head_page(addr);
if (unlikely(put_page_testzero(page)))
- free_unref_page(page, compound_order(page));
+ free_frozen_pages(page, compound_order(page));
}
EXPORT_SYMBOL(page_frag_free);
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 41ea77f22011..a82b340dc204 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -62,9 +62,14 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio,
/*
* For PTE-mapped THP, one sub page is referenced,
* the whole THP is referenced.
+ *
+ * PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages are "old" from a CPU perspective.
+ * The MMU notifier takes care of any device aspects.
*/
- if (ptep_clear_young_notify(vma, addr, pvmw.pte))
- referenced = true;
+ if (likely(pte_present(ptep_get(pvmw.pte))))
+ referenced |= ptep_test_and_clear_young(vma, addr, pvmw.pte);
+ referenced |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE);
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (pmdp_clear_young_notify(vma, addr, pvmw.pmd))
referenced = true;
@@ -112,7 +117,7 @@ static void page_idle_clear_pte_refs(struct folio *folio)
}
static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *attr, char *buf,
+ const struct bin_attribute *attr, char *buf,
loff_t pos, size_t count)
{
u64 *out = (u64 *)buf;
@@ -157,7 +162,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
}
static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
- struct bin_attribute *attr, char *buf,
+ const struct bin_attribute *attr, char *buf,
loff_t pos, size_t count)
{
const u64 *in = (u64 *)buf;
@@ -193,11 +198,11 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
return (char *)in - buf;
}
-static struct bin_attribute page_idle_bitmap_attr =
+static const struct bin_attribute page_idle_bitmap_attr =
__BIN_ATTR(bitmap, 0600,
page_idle_bitmap_read, page_idle_bitmap_write, 0);
-static struct bin_attribute *page_idle_bin_attrs[] = {
+static const struct bin_attribute *const page_idle_bin_attrs[] = {
&page_idle_bitmap_attr,
NULL,
};
diff --git a/mm/page_io.c b/mm/page_io.c
index 4b4ea8e49cf6..a2056a5ecb13 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -163,7 +163,6 @@ reprobe:
page_no = 1; /* force Empty message */
sis->max = page_no;
sis->pages = page_no - 1;
- sis->highest_bit = page_no - 1;
out:
return ret;
bad_bmap:
@@ -238,15 +237,13 @@ static void swap_zeromap_folio_clear(struct folio *folio)
* We may have stale swap cache pages in memory: notice
* them here and get rid of the unnecessary final write.
*/
-int swap_writepage(struct page *page, struct writeback_control *wbc)
+int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
{
- struct folio *folio = page_folio(page);
- int ret;
+ int ret = 0;
+
+ if (folio_free_swap(folio))
+ goto out_unlock;
- if (folio_free_swap(folio)) {
- folio_unlock(folio);
- return 0;
- }
/*
* Arch code may have to preserve more data than just the page
* contents, e.g. memory tags.
@@ -254,8 +251,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
ret = arch_prepare_to_swap(folio);
if (ret) {
folio_mark_dirty(folio);
- folio_unlock(folio);
- return ret;
+ goto out_unlock;
}
/*
@@ -266,28 +262,30 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
*/
if (is_folio_zero_filled(folio)) {
swap_zeromap_folio_set(folio);
- folio_unlock(folio);
- return 0;
- } else {
- /*
- * Clear bits this folio occupies in the zeromap to prevent
- * zero data being read in from any previous zero writes that
- * occupied the same swap entries.
- */
- swap_zeromap_folio_clear(folio);
+ goto out_unlock;
}
+
+ /*
+ * Clear bits this folio occupies in the zeromap to prevent zero data
+ * being read in from any previous zero writes that occupied the same
+ * swap entries.
+ */
+ swap_zeromap_folio_clear(folio);
+
if (zswap_store(folio)) {
count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
- folio_unlock(folio);
- return 0;
+ goto out_unlock;
}
if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) {
folio_mark_dirty(folio);
return AOP_WRITEPAGE_ACTIVATE;
}
- __swap_writepage(folio, wbc);
+ __swap_writepage(folio, swap_plug);
return 0;
+out_unlock:
+ folio_unlock(folio);
+ return ret;
}
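A hedged caller sketch (hypothetical helper): the swap_iocb plug batches SWP_FS_OPS writes and is flushed with swap_write_unplug(); the folio is assumed locked and in the swap cache, as swap_writeout() expects.

static int writeout_one_folio(struct folio *folio)
{
	struct swap_iocb *plug = NULL;
	int ret;

	ret = swap_writeout(folio, &plug);
	if (plug)
		swap_write_unplug(plug);
	return ret;
}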
static inline void count_swpout_vm_event(struct folio *folio)
@@ -373,9 +371,9 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
mempool_free(sio, sio_pool);
}
-static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc)
+static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug)
{
- struct swap_iocb *sio = NULL;
+ struct swap_iocb *sio = swap_plug ? *swap_plug : NULL;
struct swap_info_struct *sis = swp_swap_info(folio->swap);
struct file *swap_file = sis->swap_file;
loff_t pos = swap_dev_pos(folio->swap);
@@ -383,8 +381,6 @@ static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc
count_swpout_vm_event(folio);
folio_start_writeback(folio);
folio_unlock(folio);
- if (wbc->swap_plug)
- sio = *wbc->swap_plug;
if (sio) {
if (sio->iocb.ki_filp != swap_file ||
sio->iocb.ki_pos + sio->len != pos) {
@@ -403,22 +399,21 @@ static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc
bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0);
sio->len += folio_size(folio);
sio->pages += 1;
- if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) {
+ if (sio->pages == ARRAY_SIZE(sio->bvec) || !swap_plug) {
swap_write_unplug(sio);
sio = NULL;
}
- if (wbc->swap_plug)
- *wbc->swap_plug = sio;
+ if (swap_plug)
+ *swap_plug = sio;
}
static void swap_writepage_bdev_sync(struct folio *folio,
- struct writeback_control *wbc, struct swap_info_struct *sis)
+ struct swap_info_struct *sis)
{
struct bio_vec bv;
struct bio bio;
- bio_init(&bio, sis->bdev, &bv, 1,
- REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc));
+ bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP);
bio.bi_iter.bi_sector = swap_folio_sector(folio);
bio_add_folio_nofail(&bio, folio, folio_size(folio), 0);
@@ -433,13 +428,11 @@ static void swap_writepage_bdev_sync(struct folio *folio,
}
static void swap_writepage_bdev_async(struct folio *folio,
- struct writeback_control *wbc, struct swap_info_struct *sis)
+ struct swap_info_struct *sis)
{
struct bio *bio;
- bio = bio_alloc(sis->bdev, 1,
- REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
- GFP_NOIO);
+ bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP, GFP_NOIO);
bio->bi_iter.bi_sector = swap_folio_sector(folio);
bio->bi_end_io = end_swap_bio_write;
bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
@@ -451,7 +444,7 @@ static void swap_writepage_bdev_async(struct folio *folio,
submit_bio(bio);
}
-void __swap_writepage(struct folio *folio, struct writeback_control *wbc)
+void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
{
struct swap_info_struct *sis = swp_swap_info(folio->swap);
@@ -462,16 +455,16 @@ void __swap_writepage(struct folio *folio, struct writeback_control *wbc)
* is safe.
*/
if (data_race(sis->flags & SWP_FS_OPS))
- swap_writepage_fs(folio, wbc);
+ swap_writepage_fs(folio, swap_plug);
/*
 * ->flags can be updated non-atomically (scan_swap_map_slots),
* but that will never affect SWP_SYNCHRONOUS_IO, so the data_race
* is safe.
*/
else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO))
- swap_writepage_bdev_sync(folio, wbc, sis);
+ swap_writepage_bdev_sync(folio, sis);
else
- swap_writepage_bdev_async(folio, wbc, sis);
+ swap_writepage_bdev_async(folio, sis);
}
void swap_write_unplug(struct swap_iocb *sio)
@@ -639,11 +632,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
if (swap_read_folio_zeromap(folio)) {
folio_unlock(folio);
goto finish;
- } else if (zswap_load(folio)) {
- folio_unlock(folio);
- goto finish;
}
+ if (zswap_load(folio) != -ENOENT)
+ goto finish;
+
/* We have to read from slower devices. Increase zswap protection. */
zswap_folio_swapin(folio);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 7e04047977cf..f72b6cd38b95 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -21,9 +21,9 @@
* consequently belong to a single zone.
*
* PageLRU check without isolation or lru_lock could race so that
- * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
- * check without lock_page also may miss some movable non-lru pages at
- * race condition. So you can't expect this function should be exact.
+ * MIGRATE_MOVABLE block might include unmovable pages. Similarly, pages
+ * with movable_ops can only be identified some time after they were
+ * allocated. So you can't expect this function to be exact.
*
* Returns a page without holding a reference. If the caller wants to
* dereference that page (e.g., dumping), it has to make sure that it
@@ -31,7 +31,7 @@
*
*/
static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
- int migratetype, int flags)
+ enum pb_isolate_mode mode)
{
struct page *page = pfn_to_page(start_pfn);
struct zone *zone = page_zone(page);
@@ -46,7 +46,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
* isolate CMA pageblocks even when they are not movable in fact
* so consider them movable here.
*/
- if (is_migrate_cma(migratetype))
+ if (mode == PB_ISOLATE_MODE_CMA_ALLOC)
return NULL;
return page;
@@ -83,9 +83,16 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
unsigned int skip_pages;
if (PageHuge(page)) {
- if (!hugepage_migration_supported(folio_hstate(folio)))
+ struct hstate *h;
+
+ /*
+			 * The huge page may be freed, so we cannot
+			 * use folio_hstate() directly.
+ */
+ h = size_to_hstate(folio_size(folio));
+ if (h && !hugepage_migration_supported(h))
return page;
- } else if (!folio_test_lru(folio) && !__folio_test_movable(folio)) {
+ } else if (!folio_test_lru(folio)) {
return page;
}
@@ -110,7 +117,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
* The HWPoisoned page may be not in buddy system, and
* page_count() is not 0.
*/
- if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
+ if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageHWPoison(page))
continue;
/*
@@ -123,10 +130,10 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
* move these pages that still have a reference count > 0.
* (false negatives in this function only)
*/
- if ((flags & MEMORY_OFFLINE) && PageOffline(page))
+ if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageOffline(page))
continue;
- if (__PageMovable(page) || PageLRU(page))
+ if (PageLRU(page) || page_has_movable_ops(page))
continue;
/*
@@ -144,7 +151,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
* present in [start_pfn, end_pfn). The pageblock must intersect with
* [start_pfn, end_pfn).
*/
-static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags,
+static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode,
unsigned long start_pfn, unsigned long end_pfn)
{
struct zone *zone = page_zone(page);
@@ -179,9 +186,9 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
end_pfn);
unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
- migratetype, isol_flags);
+ mode);
if (!unmovable) {
- if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) {
+ if (!pageblock_isolate_and_move_free_pages(zone, page)) {
spin_unlock_irqrestore(&zone->lock, flags);
return -EBUSY;
}
@@ -191,7 +198,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
}
spin_unlock_irqrestore(&zone->lock, flags);
- if (isol_flags & REPORT_FAILURE) {
+ if (mode == PB_ISOLATE_MODE_MEM_OFFLINE) {
/*
* printk() with zone->lock held will likely trigger a
* lockdep splat, so defer it here.
@@ -202,7 +209,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
return -EBUSY;
}
-static void unset_migratetype_isolate(struct page *page, int migratetype)
+static void unset_migratetype_isolate(struct page *page)
{
struct zone *zone;
unsigned long flags;
@@ -255,10 +262,10 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
* Isolating this block already succeeded, so this
* should not fail on zone boundaries.
*/
- WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype));
+ WARN_ON_ONCE(!pageblock_unisolate_and_move_free_pages(zone, page));
} else {
- set_pageblock_migratetype(page, migratetype);
- __putback_isolated_page(page, order, migratetype);
+ clear_pageblock_isolate(page);
+ __putback_isolated_page(page, order, get_pageblock_migratetype(page));
}
zone->nr_isolate_pageblock--;
out:
@@ -285,12 +292,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* isolate_single_pageblock() -- tries to isolate a pageblock that might be
* within a free or in-use page.
* @boundary_pfn: pageblock-aligned pfn that a page might cross
- * @flags: isolation flags
- * @gfp_flags: GFP flags used for migrating pages
+ * @mode: isolation mode
* @isolate_before: isolate the pageblock before the boundary_pfn
* @skip_isolation: the flag to skip the pageblock isolation in second
* isolate_single_pageblock()
- * @migratetype: migrate type to set in error recovery.
*
* Free and in-use pages can be as big as MAX_PAGE_ORDER and contain more than one
* pageblock. When not all pageblocks within a page are isolated at the same
@@ -305,9 +310,9 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* either. The function handles this by splitting the free page or migrating
* the in-use page then splitting the free page.
*/
-static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
- gfp_t gfp_flags, bool isolate_before, bool skip_isolation,
- int migratetype)
+static int isolate_single_pageblock(unsigned long boundary_pfn,
+ enum pb_isolate_mode mode, bool isolate_before,
+ bool skip_isolation)
{
unsigned long start_pfn;
unsigned long isolate_pageblock;
@@ -333,12 +338,11 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
zone->zone_start_pfn);
if (skip_isolation) {
- int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
-
- VM_BUG_ON(!is_migrate_isolate(mt));
+ VM_BUG_ON(!get_pageblock_isolate(pfn_to_page(isolate_pageblock)));
} else {
- ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype,
- flags, isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+ ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock),
+ mode, isolate_pageblock,
+ isolate_pageblock + pageblock_nr_pages);
if (ret)
return ret;
@@ -378,7 +382,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
if (PageBuddy(page)) {
int order = buddy_order(page);
- /* move_freepages_block_isolate() handled this */
+ /* pageblock_isolate_and_move_free_pages() handled this */
VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn);
pfn += 1UL << order;
@@ -417,7 +421,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
* proper free and split handling for them.
*/
VM_WARN_ON_ONCE_PAGE(PageLRU(page), page);
- VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page);
+ VM_WARN_ON_ONCE_PAGE(page_has_movable_ops(page), page);
goto failed;
}
@@ -428,7 +432,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
failed:
/* restore the original migratetype */
if (!skip_isolation)
- unset_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype);
+ unset_migratetype_isolate(pfn_to_page(isolate_pageblock));
return -EBUSY;
}
@@ -436,16 +440,7 @@ failed:
* start_isolate_page_range() - mark page range MIGRATE_ISOLATE
* @start_pfn: The first PFN of the range to be isolated.
* @end_pfn: The last PFN of the range to be isolated.
- * @migratetype: Migrate type to set in error recovery.
- * @flags: The following flags are allowed (they can be combined in
- * a bit mask)
- * MEMORY_OFFLINE - isolate to offline (!allocate) memory
- * e.g., skip over PageHWPoison() pages
- * and PageOffline() pages.
- * REPORT_FAILURE - report details about the failure to
- * isolate the range
- * @gfp_flags: GFP flags used for migrating pages that sit across the
- * range boundaries.
+ * @mode: isolation mode
*
* Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
* the range will never be allocated. Any free pages and pages freed in the
@@ -478,7 +473,7 @@ failed:
* Return: 0 on success and -EBUSY if any part of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- int migratetype, int flags, gfp_t gfp_flags)
+ enum pb_isolate_mode mode)
{
unsigned long pfn;
struct page *page;
@@ -489,8 +484,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
bool skip_isolation = false;
/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
- ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false,
- skip_isolation, migratetype);
+ ret = isolate_single_pageblock(isolate_start, mode, false,
+ skip_isolation);
if (ret)
return ret;
@@ -498,10 +493,9 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
skip_isolation = true;
/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
- ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true,
- skip_isolation, migratetype);
+ ret = isolate_single_pageblock(isolate_end, mode, true, skip_isolation);
if (ret) {
- unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
+ unset_migratetype_isolate(pfn_to_page(isolate_start));
return ret;
}
@@ -510,12 +504,11 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
pfn < isolate_end - pageblock_nr_pages;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
- if (page && set_migratetype_isolate(page, migratetype, flags,
- start_pfn, end_pfn)) {
- undo_isolate_page_range(isolate_start, pfn, migratetype);
+ if (page && set_migratetype_isolate(page, mode, start_pfn,
+ end_pfn)) {
+ undo_isolate_page_range(isolate_start, pfn);
unset_migratetype_isolate(
- pfn_to_page(isolate_end - pageblock_nr_pages),
- migratetype);
+ pfn_to_page(isolate_end - pageblock_nr_pages));
return -EBUSY;
}
}
@@ -526,13 +519,10 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
* undo_isolate_page_range - undo effects of start_isolate_page_range()
* @start_pfn: The first PFN of the isolated range
* @end_pfn: The last PFN of the isolated range
- * @migratetype: New migrate type to set on the range
*
- * This finds every MIGRATE_ISOLATE page block in the given range
- * and switches it to @migratetype.
+ * This finds and unsets every MIGRATE_ISOLATE page block in the given range.
*/
-void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- int migratetype)
+void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
struct page *page;
@@ -545,7 +535,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
page = __first_valid_page(pfn, pageblock_nr_pages);
if (!page || !is_migrate_isolate_page(page))
continue;
- unset_migratetype_isolate(page, migratetype);
+ unset_migratetype_isolate(page);
}
}
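A hedged sketch of the typical isolate, test, undo sequence with the new mode-based interface (hypothetical memory-offline-style caller; the alignment and single-zone constraints from the kernel-doc above still apply):

static int probe_range_isolation(unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;

	ret = start_isolate_page_range(start_pfn, end_pfn,
				       PB_ISOLATE_MODE_MEM_OFFLINE);
	if (ret)
		return ret;

	/* 0 if every page in the range is free or isolated, -EBUSY otherwise. */
	ret = test_pages_isolated(start_pfn, end_pfn, PB_ISOLATE_MODE_MEM_OFFLINE);

	undo_isolate_page_range(start_pfn, end_pfn);
	return ret;
}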
/*
@@ -557,7 +547,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
*/
static unsigned long
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
- int flags)
+ enum pb_isolate_mode mode)
{
struct page *page;
@@ -570,11 +560,12 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
* simple way to verify that as VM_BUG_ON(), though.
*/
pfn += 1 << buddy_order(page);
- else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
+ else if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) &&
+ PageHWPoison(page))
/* A HWPoisoned page cannot be also PageBuddy */
pfn++;
- else if ((flags & MEMORY_OFFLINE) && PageOffline(page) &&
- !page_count(page))
+ else if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) &&
+ PageOffline(page) && !page_count(page))
/*
* The responsible driver agreed to skip PageOffline()
* pages when offlining memory by dropping its
@@ -592,11 +583,11 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
* test_pages_isolated - check if pageblocks in range are isolated
* @start_pfn: The first PFN of the isolated range
* @end_pfn: The first PFN *after* the isolated range
- * @isol_flags: Testing mode flags
+ * @mode: Testing mode
*
* This tests if all in the specified range are free.
*
- * If %MEMORY_OFFLINE is specified in @flags, it will consider
+ * If %PB_ISOLATE_MODE_MEM_OFFLINE is specified in @mode, it will consider
* poisoned and offlined pages free as well.
*
* Caller must ensure the requested range doesn't span zones.
@@ -604,7 +595,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
* Returns 0 if true, -EBUSY if one or more pages are in use.
*/
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
- int isol_flags)
+ enum pb_isolate_mode mode)
{
unsigned long pfn, flags;
struct page *page;
@@ -612,6 +603,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
int ret;
/*
+	 * Due to the deferred freeing of hugetlb folios, the hugepage folios may
+	 * not be released to the buddy system immediately. This can cause
+	 * PageBuddy() to fail in __test_page_isolated_in_pageblock(). To ensure
+	 * that the hugetlb folios are properly released back to the buddy
+	 * system, call wait_for_freed_hugetlb_folios() to wait for the release
+	 * to complete.
+ */
+ wait_for_freed_hugetlb_folios();
+
+ /*
* Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free
* pages are not aligned to pageblock_nr_pages.
* Then we just check migratetype first.
@@ -630,7 +631,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
/* Check all pages are free or marked as ISOLATED */
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
- pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags);
+ pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, mode);
spin_unlock_irqrestore(&zone->lock, flags);
ret = pfn < end_pfn ? -EBUSY : 0;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 2d6360eaccbb..589ec37c94aa 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -168,6 +168,9 @@ static void add_stack_record_to_list(struct stack_record *stack_record,
unsigned long flags;
struct stack *stack;
+ if (!gfpflags_allow_spinning(gfp_mask))
+ return;
+
set_current_in_page_owner();
stack = kmalloc(sizeof(*stack), gfp_nested_mask(gfp_mask));
if (!stack) {
@@ -229,17 +232,19 @@ static void dec_stack_record_count(depot_stack_handle_t handle,
handle);
}
-static inline void __update_page_owner_handle(struct page_ext *page_ext,
+static inline void __update_page_owner_handle(struct page *page,
depot_stack_handle_t handle,
unsigned short order,
gfp_t gfp_mask,
short last_migrate_reason, u64 ts_nsec,
pid_t pid, pid_t tgid, char *comm)
{
- int i;
+ struct page_ext_iter iter;
+ struct page_ext *page_ext;
struct page_owner *page_owner;
- for (i = 0; i < (1 << order); i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, 1 << order, page_ext, iter) {
page_owner = get_page_owner(page_ext);
page_owner->handle = handle;
page_owner->order = order;
@@ -252,20 +257,22 @@ static inline void __update_page_owner_handle(struct page_ext *page_ext,
sizeof(page_owner->comm));
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
- page_ext = page_ext_next(page_ext);
}
+ rcu_read_unlock();
}
-static inline void __update_page_owner_free_handle(struct page_ext *page_ext,
+static inline void __update_page_owner_free_handle(struct page *page,
depot_stack_handle_t handle,
unsigned short order,
pid_t pid, pid_t tgid,
u64 free_ts_nsec)
{
- int i;
+ struct page_ext_iter iter;
+ struct page_ext *page_ext;
struct page_owner *page_owner;
- for (i = 0; i < (1 << order); i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, 1 << order, page_ext, iter) {
page_owner = get_page_owner(page_ext);
/* Only __reset_page_owner() wants to clear the bit */
if (handle) {
@@ -275,8 +282,8 @@ static inline void __update_page_owner_free_handle(struct page_ext *page_ext,
page_owner->free_ts_nsec = free_ts_nsec;
page_owner->free_pid = current->pid;
page_owner->free_tgid = current->tgid;
- page_ext = page_ext_next(page_ext);
}
+ rcu_read_unlock();
}
void __reset_page_owner(struct page *page, unsigned short order)
@@ -293,11 +300,17 @@ void __reset_page_owner(struct page *page, unsigned short order)
page_owner = get_page_owner(page_ext);
alloc_handle = page_owner->handle;
+ page_ext_put(page_ext);
- handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
- __update_page_owner_free_handle(page_ext, handle, order, current->pid,
+ /*
+	 * Do not specify GFP_NOWAIT, so that gfpflags_allow_spinning() == false,
+	 * which prevents issues in stack_depot_save().
+	 * This is similar to the alloc_pages_nolock() gfp flags, but only used
+	 * to signal stack_depot to avoid spinlocks.
+ */
+ handle = save_stack(__GFP_NOWARN);
+ __update_page_owner_free_handle(page, handle, order, current->pid,
current->tgid, free_ts_nsec);
- page_ext_put(page_ext);
if (alloc_handle != early_handle)
/*
@@ -313,25 +326,19 @@ void __reset_page_owner(struct page *page, unsigned short order)
noinline void __set_page_owner(struct page *page, unsigned short order,
gfp_t gfp_mask)
{
- struct page_ext *page_ext;
u64 ts_nsec = local_clock();
depot_stack_handle_t handle;
handle = save_stack(gfp_mask);
-
- page_ext = page_ext_get(page);
- if (unlikely(!page_ext))
- return;
- __update_page_owner_handle(page_ext, handle, order, gfp_mask, -1,
+ __update_page_owner_handle(page, handle, order, gfp_mask, -1,
ts_nsec, current->pid, current->tgid,
current->comm);
- page_ext_put(page_ext);
inc_stack_record_count(handle, gfp_mask, 1 << order);
}
-void __set_page_owner_migrate_reason(struct page *page, int reason)
+void __folio_set_owner_migrate_reason(struct folio *folio, int reason)
{
- struct page_ext *page_ext = page_ext_get(page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
struct page_owner *page_owner;
if (unlikely(!page_ext))
@@ -344,44 +351,42 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
void __split_page_owner(struct page *page, int old_order, int new_order)
{
- int i;
- struct page_ext *page_ext = page_ext_get(page);
+ struct page_ext_iter iter;
+ struct page_ext *page_ext;
struct page_owner *page_owner;
- if (unlikely(!page_ext))
- return;
-
- for (i = 0; i < (1 << old_order); i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, 1 << old_order, page_ext, iter) {
page_owner = get_page_owner(page_ext);
page_owner->order = new_order;
- page_ext = page_ext_next(page_ext);
}
- page_ext_put(page_ext);
+ rcu_read_unlock();
}
void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
- int i;
- struct page_ext *old_ext;
- struct page_ext *new_ext;
+ struct page_ext *page_ext;
+ struct page_ext_iter iter;
struct page_owner *old_page_owner;
struct page_owner *new_page_owner;
depot_stack_handle_t migrate_handle;
- old_ext = page_ext_get(&old->page);
- if (unlikely(!old_ext))
+ page_ext = page_ext_get(&old->page);
+ if (unlikely(!page_ext))
return;
- new_ext = page_ext_get(&newfolio->page);
- if (unlikely(!new_ext)) {
- page_ext_put(old_ext);
+ old_page_owner = get_page_owner(page_ext);
+ page_ext_put(page_ext);
+
+ page_ext = page_ext_get(&newfolio->page);
+ if (unlikely(!page_ext))
return;
- }
- old_page_owner = get_page_owner(old_ext);
- new_page_owner = get_page_owner(new_ext);
+ new_page_owner = get_page_owner(page_ext);
+ page_ext_put(page_ext);
+
migrate_handle = new_page_owner->handle;
- __update_page_owner_handle(new_ext, old_page_owner->handle,
+ __update_page_owner_handle(&newfolio->page, old_page_owner->handle,
old_page_owner->order, old_page_owner->gfp_mask,
old_page_owner->last_migrate_reason,
old_page_owner->ts_nsec, old_page_owner->pid,
@@ -391,7 +396,7 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
* will be freed after migration. Keep them until then as they may be
* useful.
*/
- __update_page_owner_free_handle(new_ext, 0, old_page_owner->order,
+ __update_page_owner_free_handle(&newfolio->page, 0, old_page_owner->order,
old_page_owner->free_pid,
old_page_owner->free_tgid,
old_page_owner->free_ts_nsec);
@@ -400,14 +405,12 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
* for the new one and the old folio otherwise there will be an imbalance
* when subtracting those pages from the stack.
*/
- for (i = 0; i < (1 << new_page_owner->order); i++) {
+ rcu_read_lock();
+ for_each_page_ext(&old->page, 1 << new_page_owner->order, page_ext, iter) {
+ old_page_owner = get_page_owner(page_ext);
old_page_owner->handle = migrate_handle;
- old_ext = page_ext_next(old_ext);
- old_page_owner = get_page_owner(old_ext);
}
-
- page_ext_put(new_ext);
- page_ext_put(old_ext);
+ rcu_read_unlock();
}
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -507,7 +510,7 @@ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
rcu_read_lock();
memcg_data = READ_ONCE(page->memcg_data);
- if (!memcg_data)
+ if (!memcg_data || PageTail(page))
goto out_unlock;
if (memcg_data & MEMCG_DATA_OBJEXTS)
@@ -813,7 +816,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
goto ext_put_continue;
/* Found early allocated page */
- __update_page_owner_handle(page_ext, early_handle, 0, 0,
+ __update_page_owner_handle(page, early_handle, 0, 0,
-1, local_clock(), current->pid,
current->tgid, current->comm);
count++;
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 509c6ef8de40..4eeca782b888 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -62,24 +62,20 @@ static struct page_table_check *get_page_table_check(struct page_ext *page_ext)
*/
static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt)
{
+ struct page_ext_iter iter;
struct page_ext *page_ext;
struct page *page;
- unsigned long i;
bool anon;
if (!pfn_valid(pfn))
return;
page = pfn_to_page(pfn);
- page_ext = page_ext_get(page);
-
- if (!page_ext)
- return;
-
BUG_ON(PageSlab(page));
anon = PageAnon(page);
- for (i = 0; i < pgcnt; i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, pgcnt, page_ext, iter) {
struct page_table_check *ptc = get_page_table_check(page_ext);
if (anon) {
@@ -89,9 +85,8 @@ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt)
BUG_ON(atomic_read(&ptc->anon_map_count));
BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
}
- page_ext = page_ext_next(page_ext);
}
- page_ext_put(page_ext);
+ rcu_read_unlock();
}
/*
@@ -102,24 +97,20 @@ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt)
static void page_table_check_set(unsigned long pfn, unsigned long pgcnt,
bool rw)
{
+ struct page_ext_iter iter;
struct page_ext *page_ext;
struct page *page;
- unsigned long i;
bool anon;
if (!pfn_valid(pfn))
return;
page = pfn_to_page(pfn);
- page_ext = page_ext_get(page);
-
- if (!page_ext)
- return;
-
BUG_ON(PageSlab(page));
anon = PageAnon(page);
- for (i = 0; i < pgcnt; i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, pgcnt, page_ext, iter) {
struct page_table_check *ptc = get_page_table_check(page_ext);
if (anon) {
@@ -129,9 +120,8 @@ static void page_table_check_set(unsigned long pfn, unsigned long pgcnt,
BUG_ON(atomic_read(&ptc->anon_map_count));
BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0);
}
- page_ext = page_ext_next(page_ext);
}
- page_ext_put(page_ext);
+ rcu_read_unlock();
}
/*
@@ -140,24 +130,19 @@ static void page_table_check_set(unsigned long pfn, unsigned long pgcnt,
*/
void __page_table_check_zero(struct page *page, unsigned int order)
{
+ struct page_ext_iter iter;
struct page_ext *page_ext;
- unsigned long i;
BUG_ON(PageSlab(page));
- page_ext = page_ext_get(page);
-
- if (!page_ext)
- return;
-
- for (i = 0; i < (1ul << order); i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, 1 << order, page_ext, iter) {
struct page_table_check *ptc = get_page_table_check(page_ext);
BUG_ON(atomic_read(&ptc->anon_map_count));
BUG_ON(atomic_read(&ptc->file_map_count));
- page_ext = page_ext_next(page_ext);
}
- page_ext_put(page_ext);
+ rcu_read_unlock();
}
void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
@@ -196,9 +181,8 @@ EXPORT_SYMBOL(__page_table_check_pud_clear);
/* Whether the swap entry cached writable information */
static inline bool swap_cached_writable(swp_entry_t entry)
{
- return is_writable_device_exclusive_entry(entry) ||
- is_writable_device_private_entry(entry) ||
- is_writable_migration_entry(entry);
+ return is_writable_device_private_entry(entry) ||
+ is_writable_migration_entry(entry);
}
static inline void page_table_check_pte_flags(pte_t pte)
@@ -234,33 +218,39 @@ static inline void page_table_check_pmd_flags(pmd_t pmd)
WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd)));
}
-void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd)
+void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd,
+ unsigned int nr)
{
+ unsigned long stride = PMD_SIZE >> PAGE_SHIFT;
+ unsigned int i;
+
if (&init_mm == mm)
return;
page_table_check_pmd_flags(pmd);
- __page_table_check_pmd_clear(mm, *pmdp);
- if (pmd_user_accessible_page(pmd)) {
- page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT,
- pmd_write(pmd));
- }
+ for (i = 0; i < nr; i++)
+ __page_table_check_pmd_clear(mm, *(pmdp + i));
+ if (pmd_user_accessible_page(pmd))
+ page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd));
}
-EXPORT_SYMBOL(__page_table_check_pmd_set);
+EXPORT_SYMBOL(__page_table_check_pmds_set);
-void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud)
+void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud,
+ unsigned int nr)
{
+ unsigned long stride = PUD_SIZE >> PAGE_SHIFT;
+ unsigned int i;
+
if (&init_mm == mm)
return;
- __page_table_check_pud_clear(mm, *pudp);
- if (pud_user_accessible_page(pud)) {
- page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT,
- pud_write(pud));
- }
+ for (i = 0; i < nr; i++)
+ __page_table_check_pud_clear(mm, *(pudp + i));
+ if (pud_user_accessible_page(pud))
+ page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud));
}
-EXPORT_SYMBOL(__page_table_check_pud_set);
+EXPORT_SYMBOL(__page_table_check_puds_set);
void __page_table_check_pte_clear_range(struct mm_struct *mm,
unsigned long addr,
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 81839a9e74f1..e981a1a292d2 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -84,6 +84,7 @@ again:
* mapped at the @pvmw->pte
* @pvmw: page_vma_mapped_walk struct, includes a pair pte and pfn range
* for checking
+ * @pte_nr: the number of small pages described by @pvmw->pte.
*
* page_vma_mapped_walk() found a place where pfn range is *potentially*
* mapped. check_pte() has to validate this.
@@ -100,7 +101,7 @@ again:
* Otherwise, return false.
*
*/
-static bool check_pte(struct page_vma_mapped_walk *pvmw)
+static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr)
{
unsigned long pfn;
pte_t ptent = ptep_get(pvmw->pte);
@@ -111,8 +112,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
return false;
entry = pte_to_swp_entry(ptent);
- if (!is_migration_entry(entry) &&
- !is_device_exclusive_entry(entry))
+ if (!is_migration_entry(entry))
return false;
pfn = swp_offset_pfn(entry);
@@ -133,7 +133,11 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
pfn = pte_pfn(ptent);
}
- return (pfn - pvmw->pfn) < pvmw->nr_pages;
+ if ((pfn + pte_nr - 1) < pvmw->pfn)
+ return false;
+ if (pfn > (pvmw->pfn + pvmw->nr_pages - 1))
+ return false;
+ return true;
}
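The two early returns above are a standard interval-overlap test between the pages mapped by the PTE and the range tracked in @pvmw; an equivalent standalone form (hypothetical helper name) is:

/* [a, a + na) and [b, b + nb) overlap iff neither range ends before the other starts. */
static inline bool pfn_ranges_overlap(unsigned long a, unsigned long na,
				      unsigned long b, unsigned long nb)
{
	return (a + na - 1) >= b && a <= (b + nb - 1);
}

With a = pfn, na = pte_nr, b = pvmw->pfn and nb = pvmw->nr_pages this reduces to the checks in check_pte().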
/* Returns true if the two ranges overlap. Careful to not overflow. */
@@ -208,7 +212,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
return false;
pvmw->ptl = huge_pte_lock(hstate, mm, pvmw->pte);
- if (!check_pte(pvmw))
+ if (!check_pte(pvmw, pages_per_huge_page(hstate)))
return not_found(pvmw);
return true;
}
@@ -242,8 +246,7 @@ restart:
*/
pmde = pmdp_get_lockless(pvmw->pmd);
- if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) ||
- (pmd_present(pmde) && pmd_devmap(pmde))) {
+ if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
pvmw->ptl = pmd_lock(mm, pvmw->pmd);
pmde = *pvmw->pmd;
if (!pmd_present(pmde)) {
@@ -258,7 +261,7 @@ restart:
return not_found(pvmw);
return true;
}
- if (likely(pmd_trans_huge(pmde) || pmd_devmap(pmde))) {
+ if (likely(pmd_trans_huge(pmde))) {
if (pvmw->flags & PVMW_MIGRATION)
return not_found(pvmw);
if (!check_pmd(pmd_pfn(pmde), pvmw))
@@ -291,7 +294,7 @@ restart:
goto next_pte;
}
this_pte:
- if (check_pte(pvmw))
+ if (check_pte(pvmw, 1))
return true;
next_pte:
do {
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e478777c86e1..648038247a8d 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -143,8 +143,7 @@ again:
* We are ONLY installing, so avoid unnecessarily
* splitting a present huge page.
*/
- if (pmd_present(*pmd) &&
- (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+ if (pmd_present(*pmd) && pmd_trans_huge(*pmd))
continue;
}
@@ -210,8 +209,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
* We are ONLY installing, so avoid unnecessarily
* splitting a present huge page.
*/
- if (pud_present(*pud) &&
- (pud_trans_huge(*pud) || pud_devmap(*pud)))
+ if (pud_present(*pud) && pud_trans_huge(*pud))
continue;
}
@@ -422,7 +420,7 @@ static inline void process_mm_walk_lock(struct mm_struct *mm,
{
if (walk_lock == PGWALK_RDLOCK)
mmap_assert_locked(mm);
- else
+ else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY)
mmap_assert_write_locked(mm);
}
@@ -437,6 +435,9 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma,
case PGWALK_WRLOCK_VERIFY:
vma_assert_write_locked(vma);
break;
+ case PGWALK_VMA_RDLOCK_VERIFY:
+ vma_assert_locked(vma);
+ break;
case PGWALK_RDLOCK:
/* PGWALK_RDLOCK is handled by process_mm_walk_lock */
break;
@@ -585,8 +586,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
}
/**
- * walk_page_range_novma - walk a range of pagetables not backed by a vma
- * @mm: mm_struct representing the target process of page table walk
+ * walk_kernel_page_table_range - walk a range of kernel pagetables.
* @start: start address of the virtual address range
* @end: end address of the virtual address range
* @ops: operation to call during the walk
@@ -596,17 +596,61 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
* Similar to walk_page_range() but can walk any page tables even if they are
* not backed by VMAs. Because 'unusual' entries may be walked this function
* will also not lock the PTEs for the pte_entry() callback. This is useful for
- * walking the kernel pages tables or page tables for firmware.
+ * walking kernel page tables or page tables for firmware.
*
 * Note: Be careful when walking the kernel page tables; the caller may need to
* take other effective approaches (mmap lock may be insufficient) to prevent
* the intermediate kernel page tables belonging to the specified address range
* from being freed (e.g. memory hot-remove).
*/
-int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
+int walk_kernel_page_table_range(unsigned long start, unsigned long end,
+ const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
+{
+ struct mm_struct *mm = &init_mm;
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = mm,
+ .pgd = pgd,
+ .private = private,
+ .no_vma = true
+ };
+
+ if (start >= end)
+ return -EINVAL;
+ if (!check_ops_valid(ops))
+ return -EINVAL;
+
+ /*
+	 * Kernel intermediate page tables are usually not freed, so the mmap
+	 * read lock is sufficient. But there are some exceptions, e.g. memory
+	 * hot-remove, in which case the mmap lock is insufficient to prevent
+	 * the intermediate kernel page tables belonging to the specified
+	 * address range from being freed. The caller should take other
+	 * actions to prevent this race.
+ */
+ mmap_assert_locked(mm);
+
+ return walk_pgd_range(start, end, &walk);
+}
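A minimal caller sketch (hypothetical debugging helper; assumes the caller can take init_mm's mmap read lock and that nothing such as memory hot-remove can free the walked tables, per the comment above):

static int count_present_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *nr = walk->private;

	if (pte_present(ptep_get(pte)))
		(*nr)++;
	return 0;
}

static unsigned long count_kernel_ptes(unsigned long start, unsigned long end)
{
	static const struct mm_walk_ops ops = {
		.pte_entry	= count_present_pte,
	};
	unsigned long nr = 0;

	mmap_read_lock(&init_mm);
	walk_kernel_page_table_range(start, end, &ops, NULL, &nr);
	mmap_read_unlock(&init_mm);

	return nr;
}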
+
+/**
+ * walk_page_range_debug - walk a range of pagetables not backed by a vma
+ * @mm: mm_struct representing the target process of page table walk
+ * @start: start address of the virtual address range
+ * @end: end address of the virtual address range
+ * @ops: operation to call during the walk
+ * @pgd: pgd to walk if different from mm->pgd
+ * @private: private data for callbacks' usage
+ *
+ * Similar to walk_page_range() but can walk any page tables even if they are
+ * not backed by VMAs. Because 'unusual' entries may be walked this function
+ * will also not lock the PTEs for the pte_entry() callback.
+ *
+ * This is for debugging purposes ONLY.
+ */
+int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
- pgd_t *pgd,
- void *private)
+ pgd_t *pgd, void *private)
{
struct mm_walk walk = {
.ops = ops,
@@ -616,34 +660,24 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
.no_vma = true
};
+ /* For convenience, we allow traversal of kernel mappings. */
+ if (mm == &init_mm)
+ return walk_kernel_page_table_range(start, end, ops,
+ pgd, private);
if (start >= end || !walk.mm)
return -EINVAL;
if (!check_ops_valid(ops))
return -EINVAL;
/*
- * 1) For walking the user virtual address space:
- *
* The mmap lock protects the page walker from changes to the page
* tables during the walk. However a read lock is insufficient to
* protect those areas which don't have a VMA as munmap() detaches
* the VMAs before downgrading to a read lock and actually tearing
* down PTEs/page tables. In which case, the mmap write lock should
- * be hold.
- *
- * 2) For walking the kernel virtual address space:
- *
- * The kernel intermediate page tables usually do not be freed, so
- * the mmap map read lock is sufficient. But there are some exceptions.
- * E.g. memory hot-remove. In which case, the mmap lock is insufficient
- * to prevent the intermediate kernel pages tables belonging to the
- * specified address range from being freed. The caller should take
- * other actions to prevent this race.
+ * be held.
*/
- if (mm == &init_mm)
- mmap_assert_locked(walk.mm);
- else
- mmap_assert_write_locked(walk.mm);
+ mmap_assert_write_locked(mm);
return walk_pgd_range(start, end, &walk);
}
@@ -872,7 +906,7 @@ struct folio *folio_walk_start(struct folio_walk *fw,
* TODO: FW_MIGRATION support for PUD migration entries
* once there are relevant users.
*/
- if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) {
+ if (!pud_present(pud) || pud_special(pud)) {
spin_unlock(ptl);
goto not_found;
} else if (!pud_leaf(pud)) {
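For reference, a minimal sketch (not part of the patch) of how a caller might use the renamed walk_kernel_page_table_range() helper. The counting callback, the address range and the read-lock choice are illustrative assumptions; the locking rules are the ones documented in the hunk above.

#include <linux/mm.h>
#include <linux/pagewalk.h>

/* Count present leaf PTEs in a kernel VA range (illustrative only). */
static int demo_count_pte(pte_t *pte, unsigned long addr,
                          unsigned long next, struct mm_walk *walk)
{
        unsigned long *count = walk->private;

        if (pte_present(ptep_get(pte)))
                (*count)++;
        return 0;
}

static const struct mm_walk_ops demo_count_ops = {
        .pte_entry = demo_count_pte,
};

static unsigned long demo_count_kernel_ptes(unsigned long start, unsigned long end)
{
        unsigned long count = 0;

        /* The walker asserts that init_mm's mmap lock is held. */
        mmap_read_lock(&init_mm);
        walk_kernel_page_table_range(start, end, &demo_count_ops,
                                     /* pgd = */ NULL, &count);
        mmap_read_unlock(&init_mm);

        return count;
}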
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index dd3590dfc23d..9b9d5d6accae 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * mm/percpu-debug.c
*
* Copyright (C) 2017 Facebook Inc.
* Copyright (C) 2017 Dennis Zhou <dennis@kernel.org>
diff --git a/mm/percpu.c b/mm/percpu.c
index d8dd31a2e407..81462ce5866e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1359,10 +1359,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
/* allocate chunk */
alloc_size = struct_size(chunk, populated,
BITS_TO_LONGS(region_size >> PAGE_SHIFT));
- chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!chunk)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ chunk = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
INIT_LIST_HEAD(&chunk->list);
@@ -1374,24 +1371,14 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
region_bits = pcpu_chunk_map_bits(chunk);
alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
- chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!chunk->alloc_map)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ chunk->alloc_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
alloc_size =
BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
- chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!chunk->bound_map)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ chunk->bound_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
- chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!chunk->md_blocks)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
-
+ chunk->md_blocks = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
#ifdef NEED_PCPUOBJ_EXT
/* first chunk is free to use */
chunk->obj_exts = NULL;
@@ -1747,7 +1734,7 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
bool is_atomic;
bool do_warn;
struct obj_cgroup *objcg = NULL;
- static int warn_limit = 10;
+ static atomic_t warn_limit = ATOMIC_INIT(10);
struct pcpu_chunk *chunk, *next;
const char *err;
int slot, off, cpu, ret;
@@ -1758,7 +1745,7 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
gfp = current_gfp_context(gfp);
/* whitelisted flags that can be passed to the backing allocators */
pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
- is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
+ is_atomic = !gfpflags_allow_blocking(gfp);
do_warn = !(gfp & __GFP_NOWARN);
/*
@@ -1917,13 +1904,17 @@ fail_unlock:
fail:
trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
- if (do_warn && warn_limit) {
- pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
- size, align, is_atomic, err);
- if (!is_atomic)
- dump_stack();
- if (!--warn_limit)
- pr_info("limit reached, disable warning\n");
+ if (do_warn) {
+ int remaining = atomic_dec_if_positive(&warn_limit);
+
+ if (remaining >= 0) {
+ pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
+ size, align, is_atomic, err);
+ if (!is_atomic)
+ dump_stack();
+ if (remaining == 0)
+ pr_info("limit reached, disable warning\n");
+ }
}
if (is_atomic) {
@@ -2204,7 +2195,12 @@ static void pcpu_balance_workfn(struct work_struct *work)
* to grow other chunks. This then gives pcpu_reclaim_populated() time
* to move fully free chunks to the active list to be freed if
* appropriate.
+ *
+ * Enforce GFP_NOIO allocations because there are pcpu_alloc users
+ * constrained to GFP_NOIO/NOFS contexts, and they could form a lock
+ * dependency through pcpu_alloc_mutex.
*/
+ unsigned int flags = memalloc_noio_save();
mutex_lock(&pcpu_alloc_mutex);
spin_lock_irq(&pcpu_lock);
@@ -2215,6 +2211,7 @@ static void pcpu_balance_workfn(struct work_struct *work)
spin_unlock_irq(&pcpu_lock);
mutex_unlock(&pcpu_alloc_mutex);
+ memalloc_noio_restore(flags);
}
/**
@@ -2595,28 +2592,16 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
/* process group information and build config tables accordingly */
alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
- group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!group_offsets)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ group_offsets = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
- group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!group_sizes)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ group_sizes = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
- unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!unit_map)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ unit_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
- unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!unit_off)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ unit_off = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = UINT_MAX;
@@ -2685,12 +2670,9 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_free_slot = pcpu_sidelined_slot + 1;
pcpu_to_depopulate_slot = pcpu_free_slot + 1;
pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
- pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
+ pcpu_chunk_lists = memblock_alloc_or_panic(pcpu_nr_slots *
sizeof(pcpu_chunk_lists[0]),
SMP_CACHE_BYTES);
- if (!pcpu_chunk_lists)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]));
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_chunk_lists[i]);
@@ -3099,7 +3081,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
continue;
}
/* copy and return the unused part */
- memcpy(ptr, __per_cpu_load, ai->static_size);
+ memcpy(ptr, __per_cpu_start, ai->static_size);
pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
}
}
@@ -3130,7 +3112,7 @@ out_free:
#endif /* BUILD_EMBED_FIRST_CHUNK */
#ifdef BUILD_PAGE_FIRST_CHUNK
-#include <asm/pgalloc.h>
+#include <linux/pgalloc.h>
#ifndef P4D_TABLE_SIZE
#define P4D_TABLE_SIZE PAGE_SIZE
@@ -3155,25 +3137,19 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
pmd_t *pmd;
if (pgd_none(*pgd)) {
- p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
- if (!p4d)
- goto err_alloc;
- pgd_populate(&init_mm, pgd, p4d);
+ p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
+ pgd_populate_kernel(addr, pgd, p4d);
}
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
- pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
- if (!pud)
- goto err_alloc;
- p4d_populate(&init_mm, p4d, pud);
+ pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
+ p4d_populate_kernel(addr, p4d, pud);
}
pud = pud_offset(p4d, addr);
if (pud_none(*pud)) {
- pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
- if (!pmd)
- goto err_alloc;
+ pmd = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
pud_populate(&init_mm, pud, pmd);
}
@@ -3181,16 +3157,11 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
if (!pmd_present(*pmd)) {
pte_t *new;
- new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
- if (!new)
- goto err_alloc;
+ new = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
pmd_populate_kernel(&init_mm, pmd, new);
}
return;
-
-err_alloc:
- panic("%s: Failed to allocate memory\n", __func__);
}
/**
@@ -3237,10 +3208,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
sizeof(pages[0]));
- pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
- if (!pages)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- pages_size);
+ pages = memblock_alloc_or_panic(pages_size, SMP_CACHE_BYTES);
/* allocate pages */
j = 0;
@@ -3282,7 +3250,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t
flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size);
/* copy static data */
- memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
+ memcpy((void *)unit_addr, __per_cpu_start, ai->static_size);
}
/* we're ready, commit */
@@ -3391,7 +3359,7 @@ void __init setup_per_cpu_areas(void)
*/
unsigned long pcpu_nr_pages(void)
{
- return pcpu_nr_populated * pcpu_nr_units;
+ return data_race(READ_ONCE(pcpu_nr_populated)) * pcpu_nr_units;
}
/*
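The pcpu_balance_workfn() hunk above relies on the scoped-GFP mechanism; below is a minimal sketch of that pattern. The allocation site is illustrative only; memalloc_noio_save()/memalloc_noio_restore() are the real API being used.

#include <linux/sched/mm.h>
#include <linux/slab.h>

static void *demo_alloc_in_noio_scope(size_t size)
{
        unsigned int noio_flags = memalloc_noio_save();
        void *p;

        /*
         * Every allocation in this scope is implicitly limited to GFP_NOIO,
         * so no I/O can be started while locks shared with NOIO/NOFS
         * allocators (here: pcpu_alloc_mutex) are held.
         */
        p = kmalloc(size, GFP_KERNEL);

        memalloc_noio_restore(noio_flags);
        return p;
}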
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 5a882f2b10f9..567e2d084071 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -139,8 +139,7 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
- !pmd_devmap(*pmdp));
+ VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
@@ -153,7 +152,7 @@ pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
pud_t pud;
VM_BUG_ON(address & ~HPAGE_PUD_MASK);
- VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp));
+ VM_BUG_ON(!pud_trans_huge(*pudp));
pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
return pud;
@@ -293,7 +292,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
*pmdvalp = pmdval;
if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
goto nomap;
- if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval)))
+ if (unlikely(pmd_trans_huge(pmdval)))
goto nomap;
if (unlikely(pmd_bad(pmdval))) {
pmd_clear_bad(pmd);
diff --git a/mm/pt_reclaim.c b/mm/pt_reclaim.c
new file mode 100644
index 000000000000..7e9455a18aae
--- /dev/null
+++ b/mm/pt_reclaim.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/hugetlb.h>
+#include <asm-generic/tlb.h>
+#include <asm/pgalloc.h>
+
+#include "internal.h"
+
+bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
+ struct zap_details *details)
+{
+ return details && details->reclaim_pt && (end - start >= PMD_SIZE);
+}
+
+bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval)
+{
+ spinlock_t *pml = pmd_lockptr(mm, pmd);
+
+ if (!spin_trylock(pml))
+ return false;
+
+ *pmdval = pmdp_get_lockless(pmd);
+ pmd_clear(pmd);
+ spin_unlock(pml);
+
+ return true;
+}
+
+void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb,
+ pmd_t pmdval)
+{
+ pte_free_tlb(tlb, pmd_pgtable(pmdval), addr);
+ mm_dec_nr_ptes(mm);
+}
+
+void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+ struct mmu_gather *tlb)
+{
+ pmd_t pmdval;
+ spinlock_t *pml, *ptl = NULL;
+ pte_t *start_pte, *pte;
+ int i;
+
+ pml = pmd_lock(mm, pmd);
+ start_pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl);
+ if (!start_pte)
+ goto out_ptl;
+ if (ptl != pml)
+ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+
+ /* Check if this is an empty PTE page */
+ for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) {
+ if (!pte_none(ptep_get(pte)))
+ goto out_ptl;
+ }
+ pte_unmap(start_pte);
+
+ pmd_clear(pmd);
+
+ if (ptl != pml)
+ spin_unlock(ptl);
+ spin_unlock(pml);
+
+ free_pte(mm, addr, tlb, pmdval);
+
+ return;
+out_ptl:
+ if (start_pte)
+ pte_unmap_unlock(start_pte, ptl);
+ if (ptl != pml)
+ spin_unlock(pml);
+}
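An illustrative sketch (not from the patch) of how a zap path is expected to combine these helpers. The wrapper name and its caller are assumptions; only reclaim_pt_is_enabled() and try_to_free_pte() above are real.

static void demo_reclaim_pt_after_zap(struct mm_struct *mm,
                                      struct mmu_gather *tlb, pmd_t *pmd,
                                      unsigned long addr, unsigned long end,
                                      struct zap_details *details)
{
        /* Only worth trying when the caller opted in and zapped >= PMD_SIZE. */
        if (!reclaim_pt_is_enabled(addr, end, details))
                return;

        /* Drop the PTE page behind @pmd if all of its entries are now empty. */
        try_to_free_pte(mm, pmd, addr, tlb);
}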
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 106e1d66e9f9..b600c7f864b8 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -4,6 +4,7 @@
#include <linux/debugfs.h>
#include <linux/ptdump.h>
#include <linux/kasan.h>
+#include "internal.h"
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
/*
@@ -18,7 +19,7 @@ static inline int note_kasan_page_table(struct mm_walk *walk,
{
struct ptdump_state *st = walk->private;
- st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
+ st->note_page_pte(st, addr, kasan_early_shadow_pte[0]);
walk->action = ACTION_CONTINUE;
@@ -38,11 +39,11 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 0, pgd_val(val));
+ if (st->effective_prot_pgd)
+ st->effective_prot_pgd(st, val);
if (pgd_leaf(val)) {
- st->note_page(st, addr, 0, pgd_val(val));
+ st->note_page_pgd(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -61,11 +62,11 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 1, p4d_val(val));
+ if (st->effective_prot_p4d)
+ st->effective_prot_p4d(st, val);
if (p4d_leaf(val)) {
- st->note_page(st, addr, 1, p4d_val(val));
+ st->note_page_p4d(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -84,11 +85,11 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 2, pud_val(val));
+ if (st->effective_prot_pud)
+ st->effective_prot_pud(st, val);
if (pud_leaf(val)) {
- st->note_page(st, addr, 2, pud_val(val));
+ st->note_page_pud(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -106,10 +107,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 3, pmd_val(val));
+ if (st->effective_prot_pmd)
+ st->effective_prot_pmd(st, val);
if (pmd_leaf(val)) {
- st->note_page(st, addr, 3, pmd_val(val));
+ st->note_page_pmd(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -122,10 +123,10 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
struct ptdump_state *st = walk->private;
pte_t val = ptep_get_lockless(pte);
- if (st->effective_prot)
- st->effective_prot(st, 4, pte_val(val));
+ if (st->effective_prot_pte)
+ st->effective_prot_pte(st, val);
- st->note_page(st, addr, 4, pte_val(val));
+ st->note_page_pte(st, addr, val);
return 0;
}
@@ -134,9 +135,31 @@ static int ptdump_hole(unsigned long addr, unsigned long next,
int depth, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
-
- st->note_page(st, addr, depth, 0);
-
+ pte_t pte_zero = {0};
+ pmd_t pmd_zero = {0};
+ pud_t pud_zero = {0};
+ p4d_t p4d_zero = {0};
+ pgd_t pgd_zero = {0};
+
+ switch (depth) {
+ case 4:
+ st->note_page_pte(st, addr, pte_zero);
+ break;
+ case 3:
+ st->note_page_pmd(st, addr, pmd_zero);
+ break;
+ case 2:
+ st->note_page_pud(st, addr, pud_zero);
+ break;
+ case 1:
+ st->note_page_p4d(st, addr, p4d_zero);
+ break;
+ case 0:
+ st->note_page_pgd(st, addr, pgd_zero);
+ break;
+ default:
+ break;
+ }
return 0;
}
@@ -153,16 +176,18 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
{
const struct ptdump_range *range = st->range;
+ get_online_mems();
mmap_write_lock(mm);
while (range->start != range->end) {
- walk_page_range_novma(mm, range->start, range->end,
+ walk_page_range_debug(mm, range->start, range->end,
&ptdump_ops, pgd, st);
range++;
}
mmap_write_unlock(mm);
+ put_online_mems();
/* Flush out the last page */
- st->note_page(st, 0, -1, 0);
+ st->note_page_flush(st);
}
static int check_wx_show(struct seq_file *m, void *v)
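A hedged sketch of a dumper adapted to the per-level note_page_*() callbacks introduced above, replacing the old depth-based note_page(). The stub bodies and the address range are assumptions; the field names follow the calls visible in the hunks.

#include <linux/mm.h>
#include <linux/sizes.h>
#include <linux/ptdump.h>

static void demo_note_pgd(struct ptdump_state *st, unsigned long addr, pgd_t pgd) { }
static void demo_note_p4d(struct ptdump_state *st, unsigned long addr, p4d_t p4d) { }
static void demo_note_pud(struct ptdump_state *st, unsigned long addr, pud_t pud) { }
static void demo_note_pmd(struct ptdump_state *st, unsigned long addr, pmd_t pmd) { }

static void demo_note_pte(struct ptdump_state *st, unsigned long addr, pte_t pte)
{
        if (pte_present(pte))
                pr_info("PTE %#lx -> %#llx\n", addr, (u64)pte_val(pte));
}

static void demo_note_flush(struct ptdump_state *st) { }

static const struct ptdump_range demo_range[] = {
        { PAGE_OFFSET, PAGE_OFFSET + SZ_1G },   /* illustrative range */
        { 0, 0 },                               /* terminator */
};

static void demo_dump_kernel_ptes(void)
{
        struct ptdump_state st = {
                .note_page_pte   = demo_note_pte,
                .note_page_pmd   = demo_note_pmd,
                .note_page_pud   = demo_note_pud,
                .note_page_p4d   = demo_note_p4d,
                .note_page_pgd   = demo_note_pgd,
                .note_page_flush = demo_note_flush,
                .range           = demo_range,
        };

        /* The walk takes init_mm's mmap lock and pins memory hotplug itself. */
        ptdump_walk_pgd(&st, &init_mm, NULL);
}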
diff --git a/mm/readahead.c b/mm/readahead.c
index e151f4b13ca4..406756d34309 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,20 +158,10 @@ static void read_pages(struct readahead_control *rac)
if (aops->readahead) {
aops->readahead(rac);
- /*
- * Clean up the remaining folios. The sizes in ->ra
- * may be used to size the next readahead, so make sure
- * they accurately reflect what happened.
- */
+ /* Clean up the remaining folios. */
while ((folio = readahead_folio(rac)) != NULL) {
- unsigned long nr = folio_nr_pages(folio);
-
folio_get(folio);
- rac->ra->size -= nr;
- if (rac->ra->async_size >= nr) {
- rac->ra->async_size -= nr;
- filemap_remove_folio(folio);
- }
+ filemap_remove_folio(folio);
folio_unlock(folio);
folio_put(folio);
}
@@ -188,6 +178,18 @@ static void read_pages(struct readahead_control *rac)
BUG_ON(readahead_count(rac));
}
+static struct folio *ractl_alloc_folio(struct readahead_control *ractl,
+ gfp_t gfp_mask, unsigned int order)
+{
+ struct folio *folio;
+
+ folio = filemap_alloc_folio(gfp_mask, order);
+ if (folio && ractl->dropbehind)
+ __folio_set_dropbehind(folio);
+
+ return folio;
+}
+
/**
* page_cache_ra_unbounded - Start unchecked readahead.
* @ractl: Readahead control.
@@ -265,8 +267,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
continue;
}
- folio = filemap_alloc_folio(gfp_mask,
- mapping_min_folio_order(mapping));
+ folio = ractl_alloc_folio(ractl, gfp_mask,
+ mapping_min_folio_order(mapping));
if (!folio)
break;
@@ -436,7 +438,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
pgoff_t mark, unsigned int order, gfp_t gfp)
{
int err;
- struct folio *folio = filemap_alloc_folio(gfp, order);
+ struct folio *folio = ractl_alloc_folio(ractl, gfp, order);
if (!folio)
return -ENOMEM;
@@ -455,34 +457,32 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
}
void page_cache_ra_order(struct readahead_control *ractl,
- struct file_ra_state *ra, unsigned int new_order)
+ struct file_ra_state *ra)
{
struct address_space *mapping = ractl->mapping;
- pgoff_t index = readahead_index(ractl);
+ pgoff_t start = readahead_index(ractl);
+ pgoff_t index = start;
unsigned int min_order = mapping_min_folio_order(mapping);
pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
pgoff_t mark = index + ra->size - ra->async_size;
unsigned int nofs;
int err = 0;
gfp_t gfp = readahead_gfp_mask(mapping);
- unsigned int min_ra_size = max(4, mapping_min_folio_nrpages(mapping));
+ unsigned int new_order = ra->order;
- /*
- * Fallback when size < min_nrpages as each folio should be
- * at least min_nrpages anyway.
- */
- if (!mapping_large_folio_support(mapping) || ra->size < min_ra_size)
+ if (!mapping_large_folio_support(mapping)) {
+ ra->order = 0;
goto fallback;
+ }
limit = min(limit, index + ra->size - 1);
- if (new_order < mapping_max_folio_order(mapping))
- new_order += 2;
-
new_order = min(mapping_max_folio_order(mapping), new_order);
new_order = min_t(unsigned int, new_order, ilog2(ra->size));
new_order = max(new_order, min_order);
+ ra->order = new_order;
+
/* See comment in page_cache_ra_unbounded() */
nofs = memalloc_nofs_save();
filemap_invalidate_lock_shared(mapping);
@@ -516,12 +516,18 @@ void page_cache_ra_order(struct readahead_control *ractl,
/*
* If there were already pages in the page cache, then we may have
* left some gaps. Let the regular readahead code take care of this
- * situation.
+ * situation below.
*/
if (!err)
return;
fallback:
- do_page_cache_ra(ractl, ra->size, ra->async_size);
+ /*
+ * ->readahead() may have updated the readahead window size, so we have
+ * to check that there's still something to read.
+ */
+ if (ra->size > index - start)
+ do_page_cache_ra(ractl, ra->size - (index - start),
+ ra->async_size);
}
static unsigned long ractl_max_pages(struct readahead_control *ractl,
@@ -608,8 +614,9 @@ void page_cache_sync_ra(struct readahead_control *ractl,
ra->size = min(contig_count + req_count, max_pages);
ra->async_size = 1;
readit:
+ ra->order = 0;
ractl->_index = ra->start;
- page_cache_ra_order(ractl, ra, 0);
+ page_cache_ra_order(ractl, ra);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
@@ -619,8 +626,7 @@ void page_cache_async_ra(struct readahead_control *ractl,
unsigned long max_pages;
struct file_ra_state *ra = ractl->ra;
pgoff_t index = readahead_index(ractl);
- pgoff_t expected, start;
- unsigned int order = folio_order(folio);
+ pgoff_t expected, start, end, aligned_end, align;
/* no readahead */
if (!ra->ra_pages)
@@ -643,7 +649,7 @@ void page_cache_async_ra(struct readahead_control *ractl,
* Ramp up sizes, and push forward the readahead window.
*/
expected = round_down(ra->start + ra->size - ra->async_size,
- 1UL << order);
+ folio_nr_pages(folio));
if (index == expected) {
ra->start += ra->size;
/*
@@ -651,7 +657,6 @@ void page_cache_async_ra(struct readahead_control *ractl,
* the readahead window.
*/
ra->size = max(ra->size, get_next_ra_size(ra, max_pages));
- ra->async_size = ra->size;
goto readit;
}
@@ -672,18 +677,30 @@ void page_cache_async_ra(struct readahead_control *ractl,
ra->size = start - index; /* old async_size */
ra->size += req_count;
ra->size = get_next_ra_size(ra, max_pages);
- ra->async_size = ra->size;
readit:
+ ra->order += 2;
+ align = 1UL << min(ra->order, ffs(max_pages) - 1);
+ end = ra->start + ra->size;
+ aligned_end = round_down(end, align);
+ if (aligned_end > ra->start)
+ ra->size -= end - aligned_end;
+ ra->async_size = ra->size;
ractl->_index = ra->start;
- page_cache_ra_order(ractl, ra, order);
+ page_cache_ra_order(ractl, ra);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
+ struct file *file;
+ const struct inode *inode;
+
CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
- if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+ file = fd_file(f);
+ if (!(file->f_mode & FMODE_READ))
return -EBADF;
/*
@@ -691,9 +708,15 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
* that can execute readahead. If readahead is not possible
* on this file, then we must return -EINVAL.
*/
- if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
- (!S_ISREG(file_inode(fd_file(f))->i_mode) &&
- !S_ISBLK(file_inode(fd_file(f))->i_mode)))
+ if (!file->f_mapping)
+ return -EINVAL;
+ if (!file->f_mapping->a_ops)
+ return -EINVAL;
+
+ inode = file_inode(file);
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ return -EINVAL;
+ if (IS_ANON_FILE(inode))
return -EINVAL;
return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
@@ -754,7 +777,7 @@ void readahead_expand(struct readahead_control *ractl,
if (folio && !xa_is_value(folio))
return; /* Folio apparently present */
- folio = filemap_alloc_folio(gfp_mask, min_order);
+ folio = ractl_alloc_folio(ractl, gfp_mask, min_order);
if (!folio)
return;
@@ -783,7 +806,7 @@ void readahead_expand(struct readahead_control *ractl,
if (folio && !xa_is_value(folio))
return; /* Folio apparently present */
- folio = filemap_alloc_folio(gfp_mask, min_order);
+ folio = ractl_alloc_folio(ractl, gfp_mask, min_order);
if (!folio)
return;
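To make the new end-alignment in page_cache_async_ra() concrete, here is a worked example with illustrative numbers (all values in pages):

        ra->start = 256, ra->size = 100, ra->order = 2 + 2 = 4, max_pages = 512

        align       = 1UL << min(ra->order, ffs(max_pages) - 1)
                    = 1 << min(4, 9) = 16
        end         = ra->start + ra->size = 356
        aligned_end = round_down(end, align) = 352

Since aligned_end (352) is greater than ra->start (256), ra->size is reduced by end - aligned_end = 4, leaving ra->size = async_size = 96. The window now ends on a 16-page boundary, so page_cache_ra_order() can fill it with naturally aligned order-4 folios.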
diff --git a/mm/rmap.c b/mm/rmap.c
index c6c4d4ea29a7..568198e9efc2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -503,12 +503,12 @@ struct anon_vma *folio_get_anon_vma(const struct folio *folio)
rcu_read_lock();
anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
- if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
goto out;
if (!folio_mapped(folio))
goto out;
- anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+ anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON);
if (!atomic_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
goto out;
@@ -550,12 +550,12 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
retry:
rcu_read_lock();
anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
- if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
goto out;
if (!folio_mapped(folio))
goto out;
- anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+ anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON);
root_anon_vma = READ_ONCE(anon_vma->root);
if (down_read_trylock(&root_anon_vma->rwsem)) {
/*
@@ -672,7 +672,7 @@ void try_to_unmap_flush_dirty(void)
(TLB_FLUSH_BATCH_PENDING_MASK / 2)
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
- unsigned long uaddr)
+ unsigned long start, unsigned long end)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
int batch;
@@ -681,7 +681,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
if (!pte_accessible(mm, pteval))
return;
- arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr);
+ arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, start, end);
tlb_ubc->flush_required = true;
/*
@@ -746,7 +746,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
if (pending != flushed) {
- arch_flush_tlb_batched_pending(mm);
+ flush_tlb_mm(mm);
/*
* If the new TLB flushing is pending during flushing, leave
* mm->tlb_flush_batched as is, to avoid losing flushing.
@@ -757,7 +757,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
- unsigned long uaddr)
+ unsigned long start, unsigned long end)
{
}
@@ -774,7 +774,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
* @vma: The VMA we need to know the address in.
*
* Calculates the user virtual address of this page in the specified VMA.
- * It is the caller's responsibililty to check the page is actually
+ * It is the caller's responsibility to check the page is actually
* within the VMA. There may not currently be a PTE pointing at this
* page, but if a page fault occurs at this address, this is the page
* which will be accessed.
@@ -789,13 +789,13 @@ unsigned long page_address_in_vma(const struct folio *folio,
const struct page *page, const struct vm_area_struct *vma)
{
if (folio_test_anon(folio)) {
- struct anon_vma *page__anon_vma = folio_anon_vma(folio);
+ struct anon_vma *anon_vma = folio_anon_vma(folio);
/*
* Note: swapoff's unuse_vma() is more efficient with this
* check, and needs it to match anon_vma when KSM is active.
*/
- if (!vma->anon_vma || !page__anon_vma ||
- vma->anon_vma->root != page__anon_vma->root)
+ if (!vma->anon_vma || !anon_vma ||
+ vma->anon_vma->root != anon_vma->root)
return -EFAULT;
} else if (!vma->vm_file) {
return -EFAULT;
@@ -803,7 +803,7 @@ unsigned long page_address_in_vma(const struct folio *folio,
return -EFAULT;
}
- /* KSM folios don't reach here because of the !page__anon_vma check */
+ /* KSM folios don't reach here because of the !anon_vma check */
return vma_address(vma, page_pgoff(folio, page), 1);
}
@@ -839,7 +839,7 @@ out:
struct folio_referenced_arg {
int mapcount;
int referenced;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
struct mem_cgroup *memcg;
};
@@ -889,7 +889,7 @@ static bool folio_referenced_one(struct folio *folio,
if ((!atomic_read(&vma->vm_mm->mm_users) ||
check_stable_address_space(vma->vm_mm)) &&
folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_likely_mapped_shared(folio)) {
+ !folio_maybe_mapped_shared(folio)) {
pra->referenced = -1;
page_vma_mapped_walk_done(&pvmw);
return false;
@@ -984,7 +984,7 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
* the function bailed out due to rmap lock contention.
*/
int folio_referenced(struct folio *folio, int is_locked,
- struct mem_cgroup *memcg, unsigned long *vm_flags)
+ struct mem_cgroup *memcg, vm_flags_t *vm_flags)
{
bool we_locked = false;
struct folio_referenced_arg pra = {
@@ -1044,6 +1044,14 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
pte_t *pte = pvmw->pte;
pte_t entry = ptep_get(pte);
+ /*
+ * PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages are clean and not writable from a
+ * CPU perspective. The MMU notifier takes care of any
+ * device aspects.
+ */
+ if (!pte_present(entry))
+ continue;
if (!pte_dirty(entry) && !pte_write(entry))
continue;
@@ -1127,6 +1135,80 @@ int folio_mkclean(struct folio *folio)
}
EXPORT_SYMBOL_GPL(folio_mkclean);
+struct wrprotect_file_state {
+ int cleaned;
+ pgoff_t pgoff;
+ unsigned long pfn;
+ unsigned long nr_pages;
+};
+
+static bool mapping_wrprotect_range_one(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long address, void *arg)
+{
+ struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg;
+ struct page_vma_mapped_walk pvmw = {
+ .pfn = state->pfn,
+ .nr_pages = state->nr_pages,
+ .pgoff = state->pgoff,
+ .vma = vma,
+ .address = address,
+ .flags = PVMW_SYNC,
+ };
+
+ state->cleaned += page_vma_mkclean_one(&pvmw);
+
+ return true;
+}
+
+static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
+ pgoff_t pgoff_start, unsigned long nr_pages,
+ struct rmap_walk_control *rwc, bool locked);
+
+/**
+ * mapping_wrprotect_range() - Write-protect all mappings in a specified range.
+ *
+ * @mapping: The mapping whose reverse mapping should be traversed.
+ * @pgoff: The page offset at which @pfn is mapped within @mapping.
+ * @pfn: The PFN of the page mapped in @mapping at @pgoff.
+ * @nr_pages: The number of physically contiguous base pages spanned.
+ *
+ * Traverses the reverse mapping, finding all VMAs which contain a shared
+ * mapping of the pages in the specified range in @mapping, and write-protects
+ * them (that is, updates the page tables to mark the mappings read-only such
+ * that a write protection fault arises when the mappings are written to).
+ *
+ * The @pfn value need not refer to a folio, but rather can reference a kernel
+ * allocation which is mapped into userland. We therefore do not require that
+ * the page maps to a folio with a valid mapping or index field, rather the
+ * caller specifies these in @mapping and @pgoff.
+ *
+ * Return: the number of write-protected PTEs, or an error.
+ */
+int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
+ unsigned long pfn, unsigned long nr_pages)
+{
+ struct wrprotect_file_state state = {
+ .cleaned = 0,
+ .pgoff = pgoff,
+ .pfn = pfn,
+ .nr_pages = nr_pages,
+ };
+ struct rmap_walk_control rwc = {
+ .arg = (void *)&state,
+ .rmap_one = mapping_wrprotect_range_one,
+ .invalid_vma = invalid_mkclean_vma,
+ };
+
+ if (!mapping)
+ return 0;
+
+ __rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc,
+ /* locked = */false);
+
+ return state.cleaned;
+}
+EXPORT_SYMBOL_GPL(mapping_wrprotect_range);
+
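A small usage sketch for the new export: a hypothetical deferred-I/O style caller write-protecting its mapped buffer so that later writes fault and can be tracked. The function name, the pgoff value and the buffer identifiers are assumptions.

static int demo_wrprotect_mapped_buffer(struct address_space *mapping,
                                        unsigned long buf_pfn,
                                        unsigned long buf_nr_pages)
{
        int cleaned;

        /* The buffer is assumed to be mapped at file offset 0. */
        cleaned = mapping_wrprotect_range(mapping, /* pgoff = */ 0,
                                          buf_pfn, buf_nr_pages);
        if (cleaned < 0)
                return cleaned;

        pr_debug("write-protected %d PTEs\n", cleaned);
        return 0;
}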
/**
* pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
* [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
@@ -1160,8 +1242,8 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
}
static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
- struct page *page, int nr_pages, enum rmap_level level,
- int *nr_pmdmapped)
+ struct page *page, int nr_pages, struct vm_area_struct *vma,
+ enum rmap_level level, int *nr_pmdmapped)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
const int orig_nr_pages = nr_pages;
@@ -1176,6 +1258,16 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
break;
}
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ nr = folio_add_return_large_mapcount(folio, orig_nr_pages, vma);
+ if (nr == orig_nr_pages)
+ /* Was completely unmapped. */
+ nr = folio_large_nr_pages(folio);
+ else
+ nr = 0;
+ break;
+ }
+
do {
first += atomic_inc_and_test(&page->_mapcount);
} while (page++, --nr_pages > 0);
@@ -1184,15 +1276,34 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED)
nr = first;
- atomic_add(orig_nr_pages, &folio->_large_mapcount);
+ folio_add_large_mapcount(folio, orig_nr_pages, vma);
break;
case RMAP_LEVEL_PMD:
+ case RMAP_LEVEL_PUD:
first = atomic_inc_and_test(&folio->_entire_mapcount);
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ if (level == RMAP_LEVEL_PMD && first)
+ *nr_pmdmapped = folio_large_nr_pages(folio);
+ nr = folio_inc_return_large_mapcount(folio, vma);
+ if (nr == 1)
+ /* Was completely unmapped. */
+ nr = folio_large_nr_pages(folio);
+ else
+ nr = 0;
+ break;
+ }
+
if (first) {
nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) {
- *nr_pmdmapped = folio_nr_pages(folio);
- nr = *nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
+ nr_pages = folio_large_nr_pages(folio);
+ /*
+ * We only track PMD mappings of PMD-sized
+ * folios separately.
+ */
+ if (level == RMAP_LEVEL_PMD)
+ *nr_pmdmapped = nr_pages;
+ nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of a remove and another add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1201,7 +1312,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
nr = 0;
}
}
- atomic_inc(&folio->_large_mapcount);
+ folio_inc_large_mapcount(folio, vma);
break;
}
return nr;
@@ -1223,9 +1334,9 @@ void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma)
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_VMA(!anon_vma, vma);
- anon_vma += PAGE_MAPPING_ANON;
+ anon_vma += FOLIO_MAPPING_ANON;
/*
- * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
+ * Ensure that anon_vma and the FOLIO_MAPPING_ANON bit are written
* simultaneously, so a concurrent reader (eg folio_referenced()'s
* folio_test_anon()) will not see one without the other.
*/
@@ -1256,10 +1367,10 @@ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
/*
* page_idle does a lockless/optimistic rmap scan on folio->mapping.
* Make sure the compiler doesn't split the stores of anon_vma and
- * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
+ * the FOLIO_MAPPING_ANON type identifier, otherwise the rmap code
* could mistake the mapping for a struct address_space and crash.
*/
- anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ anon_vma = (void *) anon_vma + FOLIO_MAPPING_ANON;
WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma);
folio->index = linear_page_index(vma, address);
}
@@ -1322,7 +1433,7 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio,
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
+ nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
if (likely(!folio_test_ksm(folio)))
__page_check_anon_rmap(folio, page, vma, address);
@@ -1338,15 +1449,32 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio,
case RMAP_LEVEL_PMD:
SetPageAnonExclusive(page);
break;
+ case RMAP_LEVEL_PUD:
+ /*
+ * Keep the compiler happy, we don't support anonymous
+ * PUD mappings.
+ */
+ WARN_ON_ONCE(1);
+ break;
}
}
+
+ VM_WARN_ON_FOLIO(!folio_test_large(folio) && PageAnonExclusive(page) &&
+ atomic_read(&folio->_mapcount) > 0, folio);
for (i = 0; i < nr_pages; i++) {
struct page *cur_page = page + i;
- /* While PTE-mapping a THP we have a PMD and a PTE mapping. */
- VM_WARN_ON_FOLIO((atomic_read(&cur_page->_mapcount) > 0 ||
- (folio_test_large(folio) &&
- folio_entire_mapcount(folio) > 1)) &&
+ VM_WARN_ON_FOLIO(folio_test_large(folio) &&
+ folio_entire_mapcount(folio) > 1 &&
+ PageAnonExclusive(cur_page), folio);
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
+ continue;
+
+ /*
+ * While PTE-mapping a THP we have a PMD and a PTE
+ * mapping.
+ */
+ VM_WARN_ON_FOLIO(atomic_read(&cur_page->_mapcount) > 0 &&
PageAnonExclusive(cur_page), folio);
}
@@ -1426,14 +1554,11 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, rmap_t flags)
{
- const int nr = folio_nr_pages(folio);
const bool exclusive = flags & RMAP_EXCLUSIVE;
- int nr_pmdmapped = 0;
+ int nr = 1, nr_pmdmapped = 0;
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio);
- VM_BUG_ON_VMA(address < vma->vm_start ||
- address + (nr << PAGE_SHIFT) > vma->vm_end, vma);
/*
* VM_DROPPABLE mappings don't swap; instead they're just dropped when
@@ -1451,29 +1576,35 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
} else if (!folio_test_pmd_mappable(folio)) {
int i;
+ nr = folio_large_nr_pages(folio);
for (i = 0; i < nr; i++) {
struct page *page = folio_page(folio, i);
- /* increment count (starts at -1) */
- atomic_set(&page->_mapcount, 0);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ /* increment count (starts at -1) */
+ atomic_set(&page->_mapcount, 0);
if (exclusive)
SetPageAnonExclusive(page);
}
- /* increment count (starts at -1) */
- atomic_set(&folio->_large_mapcount, nr - 1);
- atomic_set(&folio->_nr_pages_mapped, nr);
+ folio_set_large_mapcount(folio, nr, vma);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ atomic_set(&folio->_nr_pages_mapped, nr);
} else {
+ nr = folio_large_nr_pages(folio);
/* increment count (starts at -1) */
atomic_set(&folio->_entire_mapcount, 0);
- /* increment count (starts at -1) */
- atomic_set(&folio->_large_mapcount, 0);
- atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
+ folio_set_large_mapcount(folio, 1, vma);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
if (exclusive)
SetPageAnonExclusive(&folio->page);
nr_pmdmapped = nr;
}
+ VM_WARN_ON_ONCE(address < vma->vm_start ||
+ address + (nr << PAGE_SHIFT) > vma->vm_end);
+
__folio_mod_stat(folio, nr, nr_pmdmapped);
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
}
@@ -1486,7 +1617,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio,
VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
+ nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
__folio_mod_stat(folio, nr, nr_pmdmapped);
/* See comments in folio_add_anon_rmap_*() */
@@ -1531,6 +1662,27 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
#endif
}
+/**
+ * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio
+ * @folio: The folio to add the mapping to
+ * @page: The first page to add
+ * @vma: The vm area in which the mapping is added
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
+ struct vm_area_struct *vma)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+#else
+ WARN_ON_ONCE(true);
+#endif
+}
+
static __always_inline void __folio_remove_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
enum rmap_level level)
@@ -1548,7 +1700,20 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
break;
}
- atomic_sub(nr_pages, &folio->_large_mapcount);
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ nr = folio_sub_return_large_mapcount(folio, nr_pages, vma);
+ if (!nr) {
+ /* Now completely unmapped. */
+ nr = folio_nr_pages(folio);
+ } else {
+ partially_mapped = nr < folio_large_nr_pages(folio) &&
+ !folio_entire_mapcount(folio);
+ nr = 0;
+ }
+ break;
+ }
+
+ folio_sub_large_mapcount(folio, nr_pages, vma);
do {
last += atomic_add_negative(-1, &page->_mapcount);
} while (page++, --nr_pages > 0);
@@ -1560,13 +1725,32 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
partially_mapped = nr && atomic_read(mapped);
break;
case RMAP_LEVEL_PMD:
- atomic_dec(&folio->_large_mapcount);
+ case RMAP_LEVEL_PUD:
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ last = atomic_add_negative(-1, &folio->_entire_mapcount);
+ if (level == RMAP_LEVEL_PMD && last)
+ nr_pmdmapped = folio_large_nr_pages(folio);
+ nr = folio_dec_return_large_mapcount(folio, vma);
+ if (!nr) {
+ /* Now completely unmapped. */
+ nr = folio_large_nr_pages(folio);
+ } else {
+ partially_mapped = last &&
+ nr < folio_large_nr_pages(folio);
+ nr = 0;
+ }
+ break;
+ }
+
+ folio_dec_large_mapcount(folio, vma);
last = atomic_add_negative(-1, &folio->_entire_mapcount);
if (last) {
nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
if (likely(nr < ENTIRELY_MAPPED)) {
- nr_pmdmapped = folio_nr_pages(folio);
- nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
+ nr_pages = folio_large_nr_pages(folio);
+ if (level == RMAP_LEVEL_PMD)
+ nr_pmdmapped = nr_pages;
+ nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of another remove and an add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1640,6 +1824,53 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
#endif
}
+/**
+ * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio
+ * @folio: The folio to remove the mapping from
+ * @page: The first page to remove
+ * @vma: The vm area from which the mapping is removed
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_remove_rmap_pud(struct folio *folio, struct page *page,
+ struct vm_area_struct *vma)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+#else
+ WARN_ON_ONCE(true);
+#endif
+}
+
+static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
+ struct page_vma_mapped_walk *pvmw,
+ enum ttu_flags flags, pte_t pte)
+{
+ unsigned long end_addr, addr = pvmw->address;
+ struct vm_area_struct *vma = pvmw->vma;
+ unsigned int max_nr;
+
+ if (flags & TTU_HWPOISON)
+ return 1;
+ if (!folio_test_large(folio))
+ return 1;
+
+ /* We may only batch within a single VMA and a single page table. */
+ end_addr = pmd_addr_end(addr, vma->vm_end);
+ max_nr = (end_addr - addr) >> PAGE_SHIFT;
+
+ /* We only support lazyfree batching for now ... */
+ if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
+ return 1;
+ if (pte_unused(pte))
+ return 1;
+
+ return folio_pte_batch(folio, pvmw->pte, pte, max_nr);
+}
+
/*
* @arg: enum ttu_flags will be passed to this argument
*/
@@ -1648,11 +1879,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ bool anon_exclusive, ret = true;
pte_t pteval;
struct page *subpage;
- bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
+ unsigned long nr_pages = 1, end_addr;
unsigned long pfn;
unsigned long hsz = 0;
@@ -1702,9 +1934,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
}
if (!pvmw.pte) {
- if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd,
- folio))
- goto walk_done;
+ if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
+ if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
+ goto walk_done;
+ /*
+ * unmap_huge_pmd_locked has either already marked
+ * the folio as swap-backed or decided to retain it
+ * due to GUP or speculative references.
+ */
+ goto walk_abort;
+ }
if (flags & TTU_SPLIT_HUGE_PMD) {
/*
@@ -1712,7 +1951,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* restart so we can process the PTE-mapped THP.
*/
split_huge_pmd_locked(vma, pvmw.address,
- pvmw.pmd, false, folio);
+ pvmw.pmd, false);
flags &= ~TTU_SPLIT_HUGE_PMD;
page_vma_mapped_walk_restart(&pvmw);
continue;
@@ -1722,7 +1961,18 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
- pfn = pte_pfn(ptep_get(pvmw.pte));
+ /*
+ * Handle PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages.
+ */
+ pteval = ptep_get(pvmw.pte);
+ if (likely(pte_present(pteval))) {
+ pfn = pte_pfn(pteval);
+ } else {
+ pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
+ }
+
subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
@@ -1778,24 +2028,31 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
hugetlb_vma_unlock_write(vma);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
- } else {
- flush_cache_page(vma, address, pfn);
- /* Nuke the page table entry. */
- if (should_defer_flush(mm, flags)) {
- /*
- * We clear the PTE but do not flush so potentially
- * a remote CPU could still be writing to the folio.
- * If the entry was previously clean then the
- * architecture must guarantee that a clear->dirty
- * transition on a cached TLB entry is written through
- * and traps if the PTE is unmapped.
- */
- pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ } else if (likely(pte_present(pteval))) {
+ nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
+ end_addr = address + nr_pages * PAGE_SIZE;
+ flush_cache_range(vma, address, end_addr);
- set_tlb_ubc_flush_pending(mm, pteval, address);
- } else {
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
- }
+ /* Nuke the page table entry. */
+ pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages);
+ /*
+ * We clear the PTE but do not flush so potentially
+ * a remote CPU could still be writing to the folio.
+ * If the entry was previously clean then the
+ * architecture must guarantee that a clear->dirty
+ * transition on a cached TLB entry is written through
+ * and traps if the PTE is unmapped.
+ */
+ if (should_defer_flush(mm, flags))
+ set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
+ else
+ flush_tlb_range(vma, address, end_addr);
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ } else {
+ pte_clear(mm, address, pvmw.pte);
}
/*
@@ -1805,10 +2062,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
- /* Set the dirty flag on the folio now the pte is gone. */
- if (pte_dirty(pteval))
- folio_mark_dirty(folio);
-
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
@@ -1822,8 +2075,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
-
- } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+ } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
+ !userfaultfd_armed(vma)) {
/*
* The guest indicated that the page content is of no
* interest anymore. Simply discard the pte, vmscan
@@ -1868,40 +2121,41 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
smp_rmb();
- /*
- * The only page refs must be one from isolation
- * plus the rmap(s) (dropped by discard:).
- */
- if (ref_count == 1 + map_count &&
- (!folio_test_dirty(folio) ||
- /*
- * Unlike MADV_FREE mappings, VM_DROPPABLE
- * ones can be dropped even if they've
- * been dirtied.
- */
- (vma->vm_flags & VM_DROPPABLE))) {
- dec_mm_counter(mm, MM_ANONPAGES);
- goto discard;
- }
-
- /*
- * If the folio was redirtied, it cannot be
- * discarded. Remap the page to page table.
- */
- set_pte_at(mm, address, pvmw.pte, pteval);
- /*
- * Unlike MADV_FREE mappings, VM_DROPPABLE ones
- * never get swap backed on failure to drop.
- */
- if (!(vma->vm_flags & VM_DROPPABLE))
+ if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
+ /*
+ * The folio was redirtied, either via the page table or a
+ * previously obtained GUP reference.
+ */
+ set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
folio_set_swapbacked(folio);
- goto walk_abort;
+ goto walk_abort;
+ } else if (ref_count != 1 + map_count) {
+ /*
+ * Additional reference. Could be a GUP reference or any
+ * speculative reference. GUP users must mark the folio
+ * dirty if there was a modification. This folio cannot be
+ * reclaimed right now either way, so act just like nothing
+ * happened.
+ * We'll come back here later and detect if the folio was
+ * dirtied when the additional reference is gone.
+ */
+ set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
+ goto walk_abort;
+ }
+ add_mm_counter(mm, MM_ANONPAGES, -nr_pages);
+ goto discard;
}
if (swap_duplicate(entry) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
goto walk_abort;
}
+
+ /*
+ * arch_unmap_one() is expected to be a NOP on
+ * architectures where we could have PFN swap PTEs,
+ * so we'll not check/care.
+ */
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
swap_free(entry);
set_pte_at(mm, address, pvmw.pte, pteval);
@@ -1926,10 +2180,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
swp_pte = swp_entry_to_pte(entry);
if (anon_exclusive)
swp_pte = pte_swp_mkexclusive(swp_pte);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ if (likely(pte_present(pteval))) {
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ if (pte_swp_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
set_pte_at(mm, address, pvmw.pte, swp_pte);
} else {
/*
@@ -1946,13 +2207,21 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter_file(folio));
}
discard:
- if (unlikely(folio_test_hugetlb(folio)))
+ if (unlikely(folio_test_hugetlb(folio))) {
hugetlb_remove_rmap(folio);
- else
- folio_remove_rmap_pte(folio, subpage, vma);
+ } else {
+ folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
+ }
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
- folio_put(folio);
+ folio_put_refs(folio, nr_pages);
+
+ /*
+ * If we are sure that we batched the entire folio and cleared
+ * all PTEs, we can just optimize and stop right here.
+ */
+ if (nr_pages == folio_nr_pages(folio))
+ goto walk_done;
continue;
walk_abort:
ret = false;
@@ -2013,9 +2282,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ bool anon_exclusive, writable, ret = true;
pte_t pteval;
struct page *subpage;
- bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
unsigned long pfn;
@@ -2031,13 +2300,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
pvmw.flags = PVMW_SYNC;
/*
- * unmap_page() in mm/huge_memory.c is the only user of migration with
- * TTU_SPLIT_HUGE_PMD and it wants to freeze.
- */
- if (flags & TTU_SPLIT_HUGE_PMD)
- split_huge_pmd_address(vma, address, true, folio);
-
- /*
* For THP, we have to assume the worse case ie pmd for invalidation.
* For hugetlb, it could be much worse if we need to do pud
* invalidation in the case of pmd sharing.
@@ -2062,9 +2324,16 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* PMD-mapped THP migration entry */
if (!pvmw.pte) {
+ if (flags & TTU_SPLIT_HUGE_PMD) {
+ split_huge_pmd_locked(vma, pvmw.address,
+ pvmw.pmd, true);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
subpage = folio_page(folio,
pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
@@ -2076,30 +2345,25 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
continue;
- }
#endif
+ }
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
- pfn = pte_pfn(ptep_get(pvmw.pte));
-
- if (folio_is_zone_device(folio)) {
- /*
- * Our PTE is a non-present device exclusive entry and
- * calculating the subpage as for the common case would
- * result in an invalid pointer.
- *
- * Since only PAGE_SIZE pages can currently be
- * migrated, just set it to page. This will need to be
- * changed when hugepage migrations to device private
- * memory are supported.
- */
- VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
- subpage = &folio->page;
+ /*
+ * Handle PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages.
+ */
+ pteval = ptep_get(pvmw.pte);
+ if (likely(pte_present(pteval))) {
+ pfn = pte_pfn(pteval);
} else {
- subpage = folio_page(folio, pfn - folio_pfn(folio));
+ pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
}
+
+ subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
PageAnonExclusive(subpage);
@@ -2155,7 +2419,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
}
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
- } else {
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ writable = pte_write(pteval);
+ } else if (likely(pte_present(pteval))) {
flush_cache_page(vma, address, pfn);
/* Nuke the page table entry. */
if (should_defer_flush(mm, flags)) {
@@ -2169,58 +2436,27 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
*/
pteval = ptep_get_and_clear(mm, address, pvmw.pte);
- set_tlb_ubc_flush_pending(mm, pteval, address);
+ set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
} else {
pteval = ptep_clear_flush(vma, address, pvmw.pte);
}
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ writable = pte_write(pteval);
+ } else {
+ pte_clear(mm, address, pvmw.pte);
+ writable = is_writable_device_private_entry(pte_to_swp_entry(pteval));
}
- /* Set the dirty flag on the folio now the pte is gone. */
- if (pte_dirty(pteval))
- folio_mark_dirty(folio);
+ VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) &&
+ !anon_exclusive, folio);
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
- if (folio_is_device_private(folio)) {
- unsigned long pfn = folio_pfn(folio);
- swp_entry_t entry;
- pte_t swp_pte;
-
- if (anon_exclusive)
- WARN_ON_ONCE(folio_try_share_anon_rmap_pte(folio,
- subpage));
+ if (PageHWPoison(subpage)) {
+ VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio);
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- entry = pte_to_swp_entry(pteval);
- if (is_writable_device_private_entry(entry))
- entry = make_writable_migration_entry(pfn);
- else if (anon_exclusive)
- entry = make_readable_exclusive_migration_entry(pfn);
- else
- entry = make_readable_migration_entry(pfn);
- swp_pte = swp_entry_to_pte(entry);
-
- /*
- * pteval maps a zone device page and is therefore
- * a swap pte.
- */
- if (pte_swp_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_swp_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
- trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
- folio_order(folio));
- /*
- * No need to invalidate here it will synchronize on
- * against the special swap migration pte.
- */
- } else if (PageHWPoison(subpage)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (folio_test_hugetlb(folio)) {
hugetlb_count_sub(folio_nr_pages(folio), mm);
@@ -2230,8 +2466,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
-
- } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+ } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
+ !userfaultfd_armed(vma)) {
/*
* The guest indicated that the page content is of no
* interest anymore. Simply discard the pte, vmscan
@@ -2247,6 +2483,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
swp_entry_t entry;
pte_t swp_pte;
+ /*
+ * arch_unmap_one() is expected to be a NOP on
+ * architectures where we could have PFN swap PTEs,
+ * so we'll not check/care.
+ */
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
if (folio_test_hugetlb(folio))
set_huge_pte_at(mm, address, pvmw.pte,
@@ -2257,8 +2498,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
page_vma_mapped_walk_done(&pvmw);
break;
}
- VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
- !anon_exclusive, subpage);
/* See folio_try_share_anon_rmap_pte(): clear PTE first. */
if (folio_test_hugetlb(folio)) {
@@ -2283,7 +2522,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
- if (pte_write(pteval))
+ if (writable)
entry = make_writable_migration_entry(
page_to_pfn(subpage));
else if (anon_exclusive)
@@ -2292,15 +2531,23 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
else
entry = make_readable_migration_entry(
page_to_pfn(subpage));
- if (pte_young(pteval))
- entry = make_migration_entry_young(entry);
- if (pte_dirty(pteval))
- entry = make_migration_entry_dirty(entry);
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ if (likely(pte_present(pteval))) {
+ if (pte_young(pteval))
+ entry = make_migration_entry_young(entry);
+ if (pte_dirty(pteval))
+ entry = make_migration_entry_dirty(entry);
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
if (folio_test_hugetlb(folio))
set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
hsz);
@@ -2375,190 +2622,139 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
}
#ifdef CONFIG_DEVICE_PRIVATE
-struct make_exclusive_args {
- struct mm_struct *mm;
- unsigned long address;
- void *owner;
- bool valid;
-};
-
-static bool page_make_device_exclusive_one(struct folio *folio,
- struct vm_area_struct *vma, unsigned long address, void *priv)
+/**
+ * make_device_exclusive() - Mark a page for exclusive use by a device
+ * @mm: mm_struct of associated target process
+ * @addr: the virtual address to mark for exclusive device access
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
+ * @foliop: folio pointer will be stored here on success.
+ *
+ * This function looks up the page mapped at the given address, grabs a
+ * folio reference, locks the folio and replaces the PTE with a special
+ * device-exclusive PFN swap entry, preventing access through the process
+ * page tables. The function will return with the folio locked and referenced.
+ *
+ * On fault, the device-exclusive entries are replaced with the original PTE
+ * under folio lock, after calling MMU notifiers.
+ *
+ * Only anonymous non-hugetlb folios are supported and the VMA must have
+ * write permissions such that we can fault in the anonymous page writable
+ * in order to mark it exclusive. The caller must hold the mmap_lock in read
+ * mode.
+ *
+ * A driver using this to program access from a device must use an MMU notifier
+ * critical section to hold a device-specific lock during programming. Once
+ * programming is complete, it should drop the folio lock and reference, after
+ * which point CPU access to the page will revoke the exclusive access.
+ *
+ * Notes:
+ * #. This function always operates on individual PTEs mapping individual
+ * pages. PMD-sized THPs are first remapped to be mapped by PTEs before
+ * the conversion happens on a single PTE corresponding to @addr.
+ * #. While concurrent access through the process page tables is prevented,
+ * concurrent access through other page references (e.g., earlier GUP
+ * invocation) is not handled and not supported.
+ * #. Device-exclusive entries are considered "clean" and "old" by core-mm.
+ * Device drivers must update the folio state when informed by MMU
+ * notifiers.
+ *
+ * Returns: pointer to the mapped page on success, otherwise an ERR_PTR()
+ * encoding a negative error.
+ */
+struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
+ void *owner, struct folio **foliop)
{
- struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
- struct make_exclusive_args *args = priv;
- pte_t pteval;
- struct page *subpage;
- bool ret = true;
struct mmu_notifier_range range;
+ struct folio *folio, *fw_folio;
+ struct vm_area_struct *vma;
+ struct folio_walk fw;
+ struct page *page;
swp_entry_t entry;
pte_t swp_pte;
- pte_t ptent;
-
- mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
- vma->vm_mm, address, min(vma->vm_end,
- address + folio_size(folio)),
- args->owner);
- mmu_notifier_invalidate_range_start(&range);
-
- while (page_vma_mapped_walk(&pvmw)) {
- /* Unexpected PMD-mapped THP? */
- VM_BUG_ON_FOLIO(!pvmw.pte, folio);
-
- ptent = ptep_get(pvmw.pte);
- if (!pte_present(ptent)) {
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
-
- subpage = folio_page(folio,
- pte_pfn(ptent) - folio_pfn(folio));
- address = pvmw.address;
+ int ret;
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(ptent));
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
-
- /* Set the dirty flag on the folio now the pte is gone. */
- if (pte_dirty(pteval))
- folio_mark_dirty(folio);
-
- /*
- * Check that our target page is still mapped at the expected
- * address.
- */
- if (args->mm == mm && args->address == address &&
- pte_write(pteval))
- args->valid = true;
-
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- if (pte_write(pteval))
- entry = make_writable_device_exclusive_entry(
- page_to_pfn(subpage));
- else
- entry = make_readable_device_exclusive_entry(
- page_to_pfn(subpage));
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
-
- set_pte_at(mm, address, pvmw.pte, swp_pte);
+ mmap_assert_locked(mm);
+ addr = PAGE_ALIGN_DOWN(addr);
- /*
- * There is a reference on the page for the swap entry which has
- * been removed, so shouldn't take another.
- */
- folio_remove_rmap_pte(folio, subpage, vma);
+ /*
+ * Fault in the page writable and try to lock it; note that if the
+ * address would already be marked for exclusive use by a device,
+ * the GUP call would undo that first by triggering a fault.
+ *
+ * If any other device would already map this page exclusively, the
+ * fault will trigger a conversion to an ordinary
+ * (non-device-exclusive) PTE and issue a MMU_NOTIFY_EXCLUSIVE.
+ */
+retry:
+ page = get_user_page_vma_remote(mm, addr,
+ FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
+ &vma);
+ if (IS_ERR(page))
+ return page;
+ folio = page_folio(page);
+
+ if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
+ folio_put(folio);
+ return ERR_PTR(-EOPNOTSUPP);
}
- mmu_notifier_invalidate_range_end(&range);
-
- return ret;
-}
-
-/**
- * folio_make_device_exclusive - Mark the folio exclusively owned by a device.
- * @folio: The folio to replace page table entries for.
- * @mm: The mm_struct where the folio is expected to be mapped.
- * @address: Address where the folio is expected to be mapped.
- * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
- *
- * Tries to remove all the page table entries which are mapping this
- * folio and replace them with special device exclusive swap entries to
- * grant a device exclusive access to the folio.
- *
- * Context: Caller must hold the folio lock.
- * Return: false if the page is still mapped, or if it could not be unmapped
- * from the expected address. Otherwise returns true (success).
- */
-static bool folio_make_device_exclusive(struct folio *folio,
- struct mm_struct *mm, unsigned long address, void *owner)
-{
- struct make_exclusive_args args = {
- .mm = mm,
- .address = address,
- .owner = owner,
- .valid = false,
- };
- struct rmap_walk_control rwc = {
- .rmap_one = page_make_device_exclusive_one,
- .done = folio_not_mapped,
- .anon_lock = folio_lock_anon_vma_read,
- .arg = &args,
- };
+ ret = folio_lock_killable(folio);
+ if (ret) {
+ folio_put(folio);
+ return ERR_PTR(ret);
+ }
/*
- * Restrict to anonymous folios for now to avoid potential writeback
- * issues.
+ * Inform secondary MMUs that we are going to convert this PTE to
+ * device-exclusive, such that they unmap it now. Note that the
+ * caller must filter this event out to prevent livelocks.
*/
- if (!folio_test_anon(folio))
- return false;
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
+ mm, addr, addr + PAGE_SIZE, owner);
+ mmu_notifier_invalidate_range_start(&range);
- rmap_walk(folio, &rwc);
+ /*
+ * Let's do a second walk and make sure we still find the same page
+ * mapped writable. Note that any page of an anonymous folio can
+ * only be mapped writable using exactly one PTE ("exclusive"), so
+ * there cannot be other mappings.
+ */
+ fw_folio = folio_walk_start(&fw, vma, addr, 0);
+ if (fw_folio != folio || fw.page != page ||
+ fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) {
+ if (fw_folio)
+ folio_walk_end(&fw, vma);
+ mmu_notifier_invalidate_range_end(&range);
+ folio_unlock(folio);
+ folio_put(folio);
+ goto retry;
+ }
- return args.valid && !folio_mapcount(folio);
-}
+ /* Nuke the page table entry so we get the uptodate dirty bit. */
+ flush_cache_page(vma, addr, page_to_pfn(page));
+ fw.pte = ptep_clear_flush(vma, addr, fw.ptep);
-/**
- * make_device_exclusive_range() - Mark a range for exclusive use by a device
- * @mm: mm_struct of associated target process
- * @start: start of the region to mark for exclusive device access
- * @end: end address of region
- * @pages: returns the pages which were successfully marked for exclusive access
- * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
- *
- * Returns: number of pages found in the range by GUP. A page is marked for
- * exclusive access only if the page pointer is non-NULL.
- *
- * This function finds ptes mapping page(s) to the given address range, locks
- * them and replaces mappings with special swap entries preventing userspace CPU
- * access. On fault these entries are replaced with the original mapping after
- * calling MMU notifiers.
- *
- * A driver using this to program access from a device must use a mmu notifier
- * critical section to hold a device specific lock during programming. Once
- * programming is complete it should drop the page lock and reference after
- * which point CPU access to the page will revoke the exclusive access.
- */
-int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, struct page **pages,
- void *owner)
-{
- long npages = (end - start) >> PAGE_SHIFT;
- long i;
-
- npages = get_user_pages_remote(mm, start, npages,
- FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
- pages, NULL);
- if (npages < 0)
- return npages;
-
- for (i = 0; i < npages; i++, start += PAGE_SIZE) {
- struct folio *folio = page_folio(pages[i]);
- if (PageTail(pages[i]) || !folio_trylock(folio)) {
- folio_put(folio);
- pages[i] = NULL;
- continue;
- }
+ /* Set the dirty flag on the folio now the PTE is gone. */
+ if (pte_dirty(fw.pte))
+ folio_mark_dirty(folio);
- if (!folio_make_device_exclusive(folio, mm, start, owner)) {
- folio_unlock(folio);
- folio_put(folio);
- pages[i] = NULL;
- }
- }
-
- return npages;
+ /*
+ * Store the pfn of the page in a special device-exclusive PFN swap PTE.
+ * do_swap_page() will trigger the conversion back while holding the
+ * folio lock.
+ */
+ entry = make_device_exclusive_entry(page_to_pfn(page));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(fw.pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+	/* The pte is writable; uffd-wp does not apply. */
+ set_pte_at(mm, addr, fw.ptep, swp_pte);
+
+ folio_walk_end(&fw, vma);
+ mmu_notifier_invalidate_range_end(&range);
+ *foliop = folio;
+ return page;
}
-EXPORT_SYMBOL_GPL(make_device_exclusive_range);
+EXPORT_SYMBOL_GPL(make_device_exclusive);
#endif
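As a rough illustration of the calling convention documented above, a driver-side user of make_device_exclusive() could look like the sketch below. The wrapper name, the owner token and the commented-out device-programming step are assumptions for illustration; only the locking and reference rules come from the kernel-doc.

/*
 * Illustrative sketch only: my_driver_grab_exclusive() and the
 * device-programming step are hypothetical.
 */
static int my_driver_grab_exclusive(struct mm_struct *mm, unsigned long addr,
				    void *owner_token)
{
	struct folio *folio;
	struct page *page;

	mmap_read_lock(mm);
	page = make_device_exclusive(mm, addr, owner_token, &folio);
	if (IS_ERR(page)) {
		mmap_read_unlock(mm);
		return PTR_ERR(page);
	}

	/*
	 * Program the device under a driver-specific lock; the driver's MMU
	 * notifier must filter out MMU_NOTIFY_EXCLUSIVE events carrying
	 * owner_token so it does not tear down the entry it just requested.
	 */
	/* my_program_device_pte(page); */

	folio_unlock(folio);
	folio_put(folio);
	mmap_read_unlock(mm);
	return 0;
}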
void __put_anon_vma(struct anon_vma *anon_vma)
@@ -2653,35 +2849,37 @@ static void rmap_walk_anon(struct folio *folio,
anon_vma_unlock_read(anon_vma);
}
-/*
- * rmap_walk_file - do something to file page using the object-based rmap method
- * @folio: the folio to be handled
- * @rwc: control variable according to each walk type
- * @locked: caller holds relevant rmap lock
+/**
+ * __rmap_walk_file() - Traverse the reverse mapping for a file-backed mapping
+ * of a page mapped within a specified page cache object at a specified offset.
*
- * Find all the mappings of a folio using the mapping pointer and the vma chains
- * contained in the address_space struct it points to.
+ * @folio:		Either the folio whose mappings to traverse, or if NULL,
+ *			the callbacks specified in @rwc must be configured so
+ *			that they can look up the mappings correctly on their own.
+ * @mapping: The page cache object whose mapping VMAs we intend to
+ * traverse. If @folio is non-NULL, this should be equal to
+ * folio_mapping(folio).
+ * @pgoff_start: The offset within @mapping of the page which we are
+ * looking up. If @folio is non-NULL, this should be equal
+ * to folio_pgoff(folio).
+ * @nr_pages: The number of pages mapped by the mapping. If @folio is
+ * non-NULL, this should be equal to folio_nr_pages(folio).
+ * @rwc: The reverse mapping walk control object describing how
+ * the traversal should proceed.
+ * @locked: Is the @mapping already locked? If not, we acquire the
+ * lock.
*/
-static void rmap_walk_file(struct folio *folio,
- struct rmap_walk_control *rwc, bool locked)
+static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
+ pgoff_t pgoff_start, unsigned long nr_pages,
+ struct rmap_walk_control *rwc, bool locked)
{
- struct address_space *mapping = folio_mapping(folio);
- pgoff_t pgoff_start, pgoff_end;
+ pgoff_t pgoff_end = pgoff_start + nr_pages - 1;
struct vm_area_struct *vma;
- /*
- * The page lock not only makes sure that page->mapping cannot
- * suddenly be NULLified by truncation, it makes sure that the
- * structure at mapping cannot be freed and reused yet,
- * so we can safely take mapping->i_mmap_rwsem.
- */
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio);
+ VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio);
+ VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio);
- if (!mapping)
- return;
-
- pgoff_start = folio_pgoff(folio);
- pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
if (!locked) {
if (i_mmap_trylock_read(mapping))
goto lookup;
@@ -2696,8 +2894,7 @@ static void rmap_walk_file(struct folio *folio,
lookup:
vma_interval_tree_foreach(vma, &mapping->i_mmap,
pgoff_start, pgoff_end) {
- unsigned long address = vma_address(vma, pgoff_start,
- folio_nr_pages(folio));
+ unsigned long address = vma_address(vma, pgoff_start, nr_pages);
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
@@ -2710,12 +2907,38 @@ lookup:
if (rwc->done && rwc->done(folio))
goto done;
}
-
done:
if (!locked)
i_mmap_unlock_read(mapping);
}
+/*
+ * rmap_walk_file - do something to file page using the object-based rmap method
+ * @folio: the folio to be handled
+ * @rwc: control variable according to each walk type
+ * @locked: caller holds relevant rmap lock
+ *
+ * Find all the mappings of a folio using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ */
+static void rmap_walk_file(struct folio *folio,
+ struct rmap_walk_control *rwc, bool locked)
+{
+ /*
+ * The folio lock not only makes sure that folio->mapping cannot
+ * suddenly be NULLified by truncation, it makes sure that the structure
+ * at mapping cannot be freed and reused yet, so we can safely take
+ * mapping->i_mmap_rwsem.
+ */
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (!folio->mapping)
+ return;
+
+ __rmap_walk_file(folio, folio->mapping, folio->index,
+ folio_nr_pages(folio), rwc, locked);
+}
+
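To illustrate the folio == NULL case described in the kernel-doc above, a hypothetical in-file caller could configure the walk control so its callback relies only on the VMA and address; both function names below are made up and not part of the patch.

/* Illustrative sketch; note_one_mapping()/count_file_mappings() are hypothetical. */
static bool note_one_mapping(struct folio *folio, struct vm_area_struct *vma,
			     unsigned long addr, void *arg)
{
	/* @folio is NULL for a folio-less walk; rely on @vma and @addr. */
	(*(unsigned long *)arg)++;
	return true;		/* keep walking */
}

static unsigned long count_file_mappings(struct address_space *mapping,
					 pgoff_t pgoff, unsigned long nr_pages)
{
	unsigned long nr_mapped = 0;
	struct rmap_walk_control rwc = {
		.rmap_one = note_one_mapping,
		.arg = &nr_mapped,
	};

	__rmap_walk_file(NULL, mapping, pgoff, nr_pages, &rwc, false);
	return nr_mapped;
}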
void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
{
if (unlikely(folio_test_ksm(folio)))
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
index 6d783436951f..e7173fcd210c 100644
--- a/mm/rodata_test.c
+++ b/mm/rodata_test.c
@@ -12,7 +12,8 @@
#include <linux/mm.h>
#include <asm/sections.h>
-static const int rodata_test_data = 0xC3;
+#define TEST_VALUE 0xC3
+static const int rodata_test_data = TEST_VALUE;
void rodata_test(void)
{
@@ -20,7 +21,7 @@ void rodata_test(void)
/* test 1: read the value */
/* If this test fails, some previous testrun has clobbered the state */
- if (!rodata_test_data) {
+ if (unlikely(READ_ONCE(rodata_test_data) != TEST_VALUE)) {
pr_err("test 1 fails (start data)\n");
return;
}
@@ -33,7 +34,7 @@ void rodata_test(void)
}
/* test 3: check the value hasn't changed */
- if (rodata_test_data == zero) {
+ if (unlikely(READ_ONCE(rodata_test_data) != TEST_VALUE)) {
pr_err("test data was changed\n");
return;
}
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 399552814fd0..b59350daffe3 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -54,7 +54,6 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
pgoff_t offset = vmf->pgoff;
gfp_t gfp = vmf->gfp_mask;
unsigned long addr;
- struct page *page;
struct folio *folio;
vm_fault_t ret;
int err;
@@ -65,16 +64,15 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
filemap_invalidate_lock_shared(mapping);
retry:
- page = find_lock_page(mapping, offset);
- if (!page) {
+ folio = filemap_lock_folio(mapping, offset);
+ if (IS_ERR(folio)) {
folio = folio_alloc(gfp | __GFP_ZERO, 0);
if (!folio) {
ret = VM_FAULT_OOM;
goto out;
}
- page = &folio->page;
- err = set_direct_map_invalid_noflush(page);
+ err = set_direct_map_invalid_noflush(folio_page(folio, 0));
if (err) {
folio_put(folio);
ret = vmf_error(err);
@@ -84,13 +82,13 @@ retry:
__folio_mark_uptodate(folio);
err = filemap_add_folio(mapping, folio, offset, gfp);
if (unlikely(err)) {
- folio_put(folio);
/*
* If a split of large page was required, it
* already happened when we marked the page invalid
* which guarantees that this call won't fail
*/
- set_direct_map_default_noflush(page);
+ set_direct_map_default_noflush(folio_page(folio, 0));
+ folio_put(folio);
if (err == -EEXIST)
goto retry;
@@ -98,11 +96,11 @@ retry:
goto out;
}
- addr = (unsigned long)page_address(page);
+ addr = (unsigned long)folio_address(folio);
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
}
- vmf->page = page;
+ vmf->page = folio_file_page(folio, vmf->pgoff);
ret = VM_FAULT_LOCKED;
out:
@@ -120,18 +118,18 @@ static int secretmem_release(struct inode *inode, struct file *file)
return 0;
}
-static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
+static int secretmem_mmap_prepare(struct vm_area_desc *desc)
{
- unsigned long len = vma->vm_end - vma->vm_start;
+ const unsigned long len = desc->end - desc->start;
- if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
+ if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
return -EINVAL;
- if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
+ if (!mlock_future_ok(desc->mm, desc->vm_flags | VM_LOCKED, len))
return -EAGAIN;
- vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP);
- vma->vm_ops = &secretmem_vm_ops;
+ desc->vm_flags |= VM_LOCKED | VM_DONTDUMP;
+ desc->vm_ops = &secretmem_vm_ops;
return 0;
}
@@ -143,7 +141,7 @@ bool vma_is_secretmem(struct vm_area_struct *vma)
static const struct file_operations secretmem_fops = {
.release = secretmem_release,
- .mmap = secretmem_mmap,
+ .mmap_prepare = secretmem_mmap_prepare,
};
static int secretmem_migrate_folio(struct address_space *mapping,
@@ -154,7 +152,7 @@ static int secretmem_migrate_folio(struct address_space *mapping,
static void secretmem_free_folio(struct folio *folio)
{
- set_direct_map_default_noflush(&folio->page);
+ set_direct_map_default_noflush(folio_page(folio, 0));
folio_zero_segment(folio, 0, folio_size(folio));
}
@@ -195,21 +193,13 @@ static struct file *secretmem_file_create(unsigned long flags)
struct file *file;
struct inode *inode;
const char *anon_name = "[secretmem]";
- const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name));
- int err;
- inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
+ inode = anon_inode_make_secure_inode(secretmem_mnt->mnt_sb, anon_name, NULL);
if (IS_ERR(inode))
return ERR_CAST(inode);
- err = security_inode_init_security_anon(inode, &qname, NULL);
- if (err) {
- file = ERR_PTR(err);
- goto err_free_inode;
- }
-
file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
- O_RDWR, &secretmem_fops);
+ O_RDWR | O_LARGEFILE, &secretmem_fops);
if (IS_ERR(file))
goto err_free_inode;
@@ -223,6 +213,8 @@ static struct file *secretmem_file_create(unsigned long flags)
inode->i_mode |= S_IFREG;
inode->i_size = 0;
+ atomic_inc(&secretmem_users);
+
return file;
err_free_inode:
@@ -256,9 +248,6 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
goto err_put_fd;
}
- file->f_flags |= O_LARGEFILE;
-
- atomic_inc(&secretmem_users);
fd_install(fd, file);
return fd;
@@ -269,7 +258,15 @@ err_put_fd:
static int secretmem_init_fs_context(struct fs_context *fc)
{
- return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM;
+ struct pseudo_fs_context *ctx;
+
+ ctx = init_pseudo(fc, SECRETMEM_MAGIC);
+ if (!ctx)
+ return -ENOMEM;
+
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
+ return 0;
}
static struct file_system_type secretmem_fs = {
@@ -287,9 +284,6 @@ static int __init secretmem_init(void)
if (IS_ERR(secretmem_mnt))
return PTR_ERR(secretmem_mnt);
- /* prevent secretmem mappings from ever getting PROT_EXEC */
- secretmem_mnt->mnt_flags |= MNT_NOEXEC;
-
return 0;
}
fs_initcall(secretmem_init);
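The userspace side of memfd_secret() is unchanged by the refactoring above; a minimal usage sketch, assuming the libc headers expose SYS_memfd_secret, looks like this.

#include <sys/syscall.h>
#include <sys/mman.h>
#include <unistd.h>

static int secret_area(void **out, size_t len)
{
	int fd = syscall(SYS_memfd_secret, 0);	/* flags: 0 or FD_CLOEXEC */

	if (fd < 0)
		return -1;
	if (ftruncate(fd, len) < 0) {
		close(fd);
		return -1;
	}
	/* Must be MAP_SHARED; the pages are dropped from the kernel direct map. */
	*out = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);
	return *out == MAP_FAILED ? -1 : 0;
}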
diff --git a/mm/shmem.c b/mm/shmem.c
index e10d6e092462..3b0356a4c344 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -86,7 +86,6 @@ static struct vfsmount *shm_mnt __ro_after_init;
#include "internal.h"
-#define BLOCKS_PER_PAGE (PAGE_SIZE/512)
#define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT)
/* Pretend that each entry is of this size in directory's i_size */
@@ -99,7 +98,7 @@ static struct vfsmount *shm_mnt __ro_after_init;
#define SHORT_SYMLINK_LEN 128
/*
- * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * shmem_fallocate communicates with shmem_fault or shmem_writeout via
* inode->i_private (with i_rwsem making sure that it has only one user at
* a time): we would prefer not to enlarge the shmem inode just for that.
*/
@@ -108,7 +107,7 @@ struct shmem_falloc {
pgoff_t start; /* start of range currently being fallocated */
pgoff_t next; /* the next page offset to be fallocated */
pgoff_t nr_falloced; /* how many new pages have been fallocated */
- pgoff_t nr_unswapped; /* how often writepage refused to swap out */
+ pgoff_t nr_unswapped; /* how often writeout refused to swap out */
};
struct shmem_options {
@@ -132,8 +131,7 @@ struct shmem_options {
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
-#define SHMEM_SEEN_NOSWAP 16
-#define SHMEM_SEEN_QUOTA 32
+#define SHMEM_SEEN_QUOTA 16
};
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -293,7 +291,7 @@ bool vma_is_shmem(struct vm_area_struct *vma)
}
static LIST_HEAD(shmem_swaplist);
-static DEFINE_MUTEX(shmem_swaplist_mutex);
+static DEFINE_SPINLOCK(shmem_swaplist_lock);
#ifdef CONFIG_TMPFS_QUOTA
@@ -433,10 +431,13 @@ static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
*
* But normally info->alloced == inode->i_mapping->nrpages + info->swapped
* So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
+ *
+ * Return: true if swapped was incremented from 0, for shmem_writeout().
*/
-static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
+static bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ bool first_swapped = false;
long freed;
spin_lock(&info->lock);
@@ -447,12 +448,15 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
/*
* Special case: whereas normally shmem_recalc_inode() is called
* after i_mapping->nrpages has already been adjusted (up or down),
- * shmem_writepage() has to raise swapped before nrpages is lowered -
+ * shmem_writeout() has to raise swapped before nrpages is lowered -
* to stop a racing shmem_recalc_inode() from thinking that a page has
* been freed. Compensate here, to avoid the need for a followup call.
*/
- if (swapped > 0)
+ if (swapped > 0) {
+ if (info->swapped == swapped)
+ first_swapped = true;
freed += swapped;
+ }
if (freed > 0)
info->alloced -= freed;
spin_unlock(&info->lock);
@@ -460,6 +464,7 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
/* The quota case may block */
if (freed > 0)
shmem_inode_unacct_blocks(inode, freed);
+ return first_swapped;
}
bool shmem_charge(struct inode *inode, long pages)
@@ -506,15 +511,27 @@ static int shmem_replace_entry(struct address_space *mapping,
/*
* Sometimes, before we decide whether to proceed or to fail, we must check
- * that an entry was not already brought back from swap by a racing thread.
+ * that an entry was not already brought back or split by a racing thread.
*
* Checking folio is not enough: by the time a swapcache folio is locked, it
* might be reused, and again be swapcache, using the same swap as before.
+ * Returns the swap entry's order if it is still present, else returns -1.
*/
-static bool shmem_confirm_swap(struct address_space *mapping,
- pgoff_t index, swp_entry_t swap)
+static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index,
+ swp_entry_t swap)
{
- return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
+ XA_STATE(xas, &mapping->i_pages, index);
+ int ret = -1;
+ void *entry;
+
+ rcu_read_lock();
+ do {
+ entry = xas_load(&xas);
+ if (entry == swp_to_radix_entry(swap))
+ ret = xas_get_order(&xas);
+ } while (xas_retry(&xas, entry));
+ rcu_read_unlock();
+ return ret;
}
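A hypothetical caller would interpret the new return value roughly as below; the wrapper name is made up, and the real callers appear later in this patch.

/* Illustrative sketch; recheck_swap_entry() is a hypothetical wrapper. */
static int recheck_swap_entry(struct address_space *mapping, pgoff_t index,
			      swp_entry_t swap)
{
	int order = shmem_confirm_swap(mapping, index, swap);

	if (order < 0)
		return -EEXIST;	/* entry was replaced or freed by a racer */
	/*
	 * Otherwise the swap entry is still present and covers 1 << order
	 * pages starting at round_down(index, 1 << order).
	 */
	return order;
}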
/*
@@ -526,9 +543,9 @@ static bool shmem_confirm_swap(struct address_space *mapping,
* enables huge pages for the mount;
* SHMEM_HUGE_WITHIN_SIZE:
* only allocate huge pages if the page will be fully within i_size,
- * also respect fadvise()/madvise() hints;
+ * also respect madvise() hints;
* SHMEM_HUGE_ADVISE:
- * only allocate huge pages if requested with fadvise()/madvise();
+ * only allocate huge pages if requested with madvise();
*/
#define SHMEM_HUGE_NEVER 0
@@ -553,38 +570,119 @@ static bool shmem_confirm_swap(struct address_space *mapping,
/* ifdef here to avoid bloating shmem.o when not necessary */
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
+static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER;
-static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
- loff_t write_end, bool shmem_huge_force,
- unsigned long vm_flags)
+/**
+ * shmem_mapping_size_orders - Get allowable folio orders for the given file size.
+ * @mapping: Target address_space.
+ * @index: The page index.
+ * @write_end: end of a write, could extend inode size.
+ *
+ * This returns huge orders for folios (when supported) based on the file size
+ * which the mapping currently allows at the given index. The index is relevant
+ * due to alignment considerations the mapping might have. The returned orders
+ * may correspond to sizes smaller than the one passed.
+ *
+ * Return: The orders.
+ */
+static inline unsigned int
+shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end)
{
+ unsigned int order;
+ size_t size;
+
+ if (!mapping_large_folio_support(mapping) || !write_end)
+ return 0;
+
+ /* Calculate the write size based on the write_end */
+ size = write_end - (index << PAGE_SHIFT);
+ order = filemap_get_order(size);
+ if (!order)
+ return 0;
+
+ /* If we're not aligned, allocate a smaller folio */
+ if (index & ((1UL << order) - 1))
+ order = __ffs(index);
+
+ order = min_t(size_t, order, MAX_PAGECACHE_ORDER);
+ return order > 0 ? BIT(order + 1) - 1 : 0;
+}
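As a worked example of the mask computed above, assuming filemap_get_order() maps a 2 MiB write to order 9 and MAX_PAGECACHE_ORDER is at least 9, the index-alignment and mask steps reduce to the standalone arithmetic below (the MAX_PAGECACHE_ORDER clamp is omitted).

#include <assert.h>

/* Mirrors the index-alignment and mask steps above, outside the kernel. */
static unsigned int size_orders(unsigned long index, unsigned int order)
{
	if (order && (index & ((1UL << order) - 1)))
		order = __builtin_ctzl(index);		/* like __ffs(index) */
	return order > 0 ? (1U << (order + 1)) - 1 : 0;
}

int main(void)
{
	assert(size_orders(0, 9) == 0x3ff);	/* aligned: orders 0..9 allowed */
	assert(size_orders(3, 9) == 0);		/* index 3: clamped to order 0 */
	assert(size_orders(8, 9) == 0xf);	/* index 8: orders up to 3 */
	return 0;
}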
+
+static unsigned int shmem_get_orders_within_size(struct inode *inode,
+ unsigned long within_size_orders, pgoff_t index,
+ loff_t write_end)
+{
+ pgoff_t aligned_index;
+ unsigned long order;
loff_t i_size;
- if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
- return false;
+ order = highest_order(within_size_orders);
+ while (within_size_orders) {
+ aligned_index = round_up(index + 1, 1 << order);
+ i_size = max(write_end, i_size_read(inode));
+ i_size = round_up(i_size, PAGE_SIZE);
+ if (i_size >> PAGE_SHIFT >= aligned_index)
+ return within_size_orders;
+
+ order = next_order(&within_size_orders, order);
+ }
+
+ return 0;
+}
+
+static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
+ loff_t write_end, bool shmem_huge_force,
+ struct vm_area_struct *vma,
+ vm_flags_t vm_flags)
+{
+ unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ?
+ 0 : BIT(HPAGE_PMD_ORDER);
+ unsigned long within_size_orders;
+
if (!S_ISREG(inode->i_mode))
- return false;
+ return 0;
if (shmem_huge == SHMEM_HUGE_DENY)
- return false;
+ return 0;
if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
- return true;
+ return maybe_pmd_order;
+ /*
+ * The huge order allocation for anon shmem is controlled through
+ * the mTHP interface, so we still use PMD-sized huge order to
+ * check whether global control is enabled.
+ *
+ * For tmpfs mmap()'s huge order, we still use PMD-sized order to
+ * allocate huge pages due to lack of a write size hint.
+ *
+	 * Otherwise, tmpfs derives the highest order hint from the size of
+	 * the write and fallocate paths, then tries each allowable huge
+	 * order.
+ */
switch (SHMEM_SB(inode->i_sb)->huge) {
case SHMEM_HUGE_ALWAYS:
- return true;
+ if (vma)
+ return maybe_pmd_order;
+
+ return shmem_mapping_size_orders(inode->i_mapping, index, write_end);
case SHMEM_HUGE_WITHIN_SIZE:
- index = round_up(index + 1, HPAGE_PMD_NR);
- i_size = max(write_end, i_size_read(inode));
- i_size = round_up(i_size, PAGE_SIZE);
- if (i_size >> PAGE_SHIFT >= index)
- return true;
+ if (vma)
+ within_size_orders = maybe_pmd_order;
+ else
+ within_size_orders = shmem_mapping_size_orders(inode->i_mapping,
+ index, write_end);
+
+ within_size_orders = shmem_get_orders_within_size(inode, within_size_orders,
+ index, write_end);
+ if (within_size_orders > 0)
+ return within_size_orders;
+
fallthrough;
case SHMEM_HUGE_ADVISE:
if (vm_flags & VM_HUGEPAGE)
- return true;
+ return maybe_pmd_order;
fallthrough;
default:
- return false;
+ return 0;
}
}
@@ -779,11 +877,12 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
return 0;
}
-static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
- loff_t write_end, bool shmem_huge_force,
- unsigned long vm_flags)
+static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
+ loff_t write_end, bool shmem_huge_force,
+ struct vm_area_struct *vma,
+ vm_flags_t vm_flags)
{
- return false;
+ return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -803,7 +902,9 @@ static int shmem_add_to_page_cache(struct folio *folio,
pgoff_t index, void *expected, gfp_t gfp)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
- long nr = folio_nr_pages(folio);
+ unsigned long nr = folio_nr_pages(folio);
+ swp_entry_t iter, swap;
+ void *entry;
VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -815,14 +916,25 @@ static int shmem_add_to_page_cache(struct folio *folio,
gfp &= GFP_RECLAIM_MASK;
folio_throttle_swaprate(folio, gfp);
+ swap = radix_to_swp_entry(expected);
do {
+ iter = swap;
xas_lock_irq(&xas);
- if (expected != xas_find_conflict(&xas)) {
- xas_set_err(&xas, -EEXIST);
- goto unlock;
+ xas_for_each_conflict(&xas, entry) {
+ /*
+ * The range must either be empty, or filled with
+ * expected swap entries. Shmem swap entries are never
+			 * partially freed without splitting both the entry
+			 * and the folio, so there shouldn't be any holes.
+ */
+ if (!expected || entry != swp_to_radix_entry(iter)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ iter.val += 1 << xas_get_order(&xas);
}
- if (expected && xas_find_conflict(&xas)) {
+ if (expected && iter.val - nr != swap.val) {
xas_set_err(&xas, -EEXIST);
goto unlock;
}
@@ -1180,7 +1292,7 @@ static int shmem_getattr(struct mnt_idmap *idmap,
STATX_ATTR_NODUMP);
generic_fillattr(idmap, request_mask, inode, stat);
- if (shmem_huge_global_enabled(inode, 0, 0, false, 0))
+ if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;
if (request_mask & STATX_BTIME) {
@@ -1294,11 +1406,11 @@ static void shmem_evict_inode(struct inode *inode)
/* Wait while shmem_unuse() is scanning this inode... */
wait_var_event(&info->stop_eviction,
!atomic_read(&info->stop_eviction));
- mutex_lock(&shmem_swaplist_mutex);
+ spin_lock(&shmem_swaplist_lock);
/* ...but beware of the race if we peeked too early */
if (!atomic_read(&info->stop_eviction))
list_del_init(&info->swaplist);
- mutex_unlock(&shmem_swaplist_mutex);
+ spin_unlock(&shmem_swaplist_lock);
}
}
@@ -1312,9 +1424,9 @@ static void shmem_evict_inode(struct inode *inode)
#endif
}
-static int shmem_find_swap_entries(struct address_space *mapping,
- pgoff_t start, struct folio_batch *fbatch,
- pgoff_t *indices, unsigned int type)
+static unsigned int shmem_find_swap_entries(struct address_space *mapping,
+ pgoff_t start, struct folio_batch *fbatch,
+ pgoff_t *indices, unsigned int type)
{
XA_STATE(xas, &mapping->i_pages, start);
struct folio *folio;
@@ -1347,7 +1459,7 @@ static int shmem_find_swap_entries(struct address_space *mapping,
}
rcu_read_unlock();
- return xas.xa_index;
+ return folio_batch_count(fbatch);
}
/*
@@ -1365,8 +1477,6 @@ static int shmem_unuse_swap_entries(struct inode *inode,
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
- if (!xa_is_value(folio))
- continue;
error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
mapping_gfp_mask(mapping), NULL, NULL);
if (error == 0) {
@@ -1394,8 +1504,8 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type)
do {
folio_batch_init(&fbatch);
- shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
- if (folio_batch_count(&fbatch) == 0) {
+ if (!shmem_find_swap_entries(mapping, start, &fbatch,
+ indices, type)) {
ret = 0;
break;
}
@@ -1423,7 +1533,8 @@ int shmem_unuse(unsigned int type)
if (list_empty(&shmem_swaplist))
return 0;
- mutex_lock(&shmem_swaplist_mutex);
+ spin_lock(&shmem_swaplist_lock);
+start_over:
list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
if (!info->swapped) {
list_del_init(&info->swaplist);
@@ -1436,51 +1547,47 @@ int shmem_unuse(unsigned int type)
* (igrab() would protect from unlink, but not from unmount).
*/
atomic_inc(&info->stop_eviction);
- mutex_unlock(&shmem_swaplist_mutex);
+ spin_unlock(&shmem_swaplist_lock);
error = shmem_unuse_inode(&info->vfs_inode, type);
cond_resched();
- mutex_lock(&shmem_swaplist_mutex);
- next = list_next_entry(info, swaplist);
- if (!info->swapped)
- list_del_init(&info->swaplist);
+ spin_lock(&shmem_swaplist_lock);
if (atomic_dec_and_test(&info->stop_eviction))
wake_up_var(&info->stop_eviction);
if (error)
break;
+ if (list_empty(&info->swaplist))
+ goto start_over;
+ next = list_next_entry(info, swaplist);
+ if (!info->swapped)
+ list_del_init(&info->swaplist);
}
- mutex_unlock(&shmem_swaplist_mutex);
+ spin_unlock(&shmem_swaplist_lock);
return error;
}
-/*
- * Move the page from the page cache to the swap cache.
+/**
+ * shmem_writeout - Write the folio to swap
+ * @folio: The folio to write
+ * @plug: swap plug
+ * @folio_list: list to put back folios on split
+ *
+ * Move the folio from the page cache to the swap cache.
*/
-static int shmem_writepage(struct page *page, struct writeback_control *wbc)
+int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
+ struct list_head *folio_list)
{
- struct folio *folio = page_folio(page);
struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- swp_entry_t swap;
pgoff_t index;
int nr_pages;
bool split = false;
- /*
- * Our capabilities prevent regular writeback or sync from ever calling
- * shmem_writepage; but a stacking filesystem might use ->writepage of
- * its underlying filesystem, in which case tmpfs should write out to
- * swap only in response to memory pressure, and not for the writeback
- * threads or sync.
- */
- if (WARN_ON_ONCE(!wbc->for_reclaim))
- goto redirty;
-
- if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
+ if ((info->flags & VM_LOCKED) || sbinfo->noswap)
goto redirty;
if (!total_swap_pages)
@@ -1506,9 +1613,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
try_split:
/* Ensure the subpages are still dirty */
folio_test_set_dirty(folio);
- if (split_huge_page_to_list_to_order(page, wbc->list, 0))
+ if (split_folio_to_list(folio, folio_list))
goto redirty;
- folio = page_folio(page);
folio_clear_dirty(folio);
}
@@ -1547,47 +1653,66 @@ try_split:
folio_mark_uptodate(folio);
}
- swap = folio_alloc_swap(folio);
- if (!swap.val) {
- if (nr_pages > 1)
- goto try_split;
+ if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
+ bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages);
+ int error;
- goto redirty;
- }
+ /*
+ * Add inode to shmem_unuse()'s list of swapped-out inodes,
+ * if it's not already there. Do it now before the folio is
+ * removed from page cache, when its pagelock no longer
+ * protects the inode from eviction. And do it now, after
+ * we've incremented swapped, because shmem_unuse() will
+ * prune a !swapped inode from the swaplist.
+ */
+ if (first_swapped) {
+ spin_lock(&shmem_swaplist_lock);
+ if (list_empty(&info->swaplist))
+ list_add(&info->swaplist, &shmem_swaplist);
+ spin_unlock(&shmem_swaplist_lock);
+ }
+
+ swap_shmem_alloc(folio->swap, nr_pages);
+ shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));
- /*
- * Add inode to shmem_unuse()'s list of swapped-out inodes,
- * if it's not already there. Do it now before the folio is
- * moved to swap cache, when its pagelock no longer protects
- * the inode from eviction. But don't unlock the mutex until
- * we've incremented swapped, because shmem_unuse_inode() will
- * prune a !swapped inode from the swaplist under this mutex.
- */
- mutex_lock(&shmem_swaplist_mutex);
- if (list_empty(&info->swaplist))
- list_add(&info->swaplist, &shmem_swaplist);
-
- if (add_to_swap_cache(folio, swap,
- __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
- NULL) == 0) {
- shmem_recalc_inode(inode, 0, nr_pages);
- swap_shmem_alloc(swap, nr_pages);
- shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
-
- mutex_unlock(&shmem_swaplist_mutex);
BUG_ON(folio_mapped(folio));
- return swap_writepage(&folio->page, wbc);
- }
+ error = swap_writeout(folio, plug);
+ if (error != AOP_WRITEPAGE_ACTIVATE) {
+ /* folio has been unlocked */
+ return error;
+ }
- mutex_unlock(&shmem_swaplist_mutex);
- put_swap_folio(folio, swap);
+ /*
+ * The intention here is to avoid holding on to the swap when
+ * zswap was unable to compress and unable to writeback; but
+ * it will be appropriate if other reactivate cases are added.
+ */
+ error = shmem_add_to_page_cache(folio, mapping, index,
+ swp_to_radix_entry(folio->swap),
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ /* Swap entry might be erased by racing shmem_free_swap() */
+ if (!error) {
+ shmem_recalc_inode(inode, 0, -nr_pages);
+ swap_free_nr(folio->swap, nr_pages);
+ }
+
+ /*
+ * The delete_from_swap_cache() below could be left for
+ * shrink_folio_list()'s folio_free_swap() to dispose of;
+ * but I'm a little nervous about letting this folio out of
+ * shmem_writeout() in a hybrid half-tmpfs-half-swap state
+ * e.g. folio_mapping(folio) might give an unexpected answer.
+ */
+ delete_from_swap_cache(folio);
+ goto redirty;
+ }
+ if (nr_pages > 1)
+ goto try_split;
redirty:
folio_mark_dirty(folio);
- if (wbc->for_reclaim)
- return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
- folio_unlock(folio);
- return 0;
+ return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
}
+EXPORT_SYMBOL_GPL(shmem_writeout);
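For illustration, a reclaim-side caller might drive shmem_writeout() roughly as sketched below; the wrapper name and surrounding control flow are assumptions, not the actual vmscan code.

/* Illustrative sketch; writeout_shmem_folio() is hypothetical. */
static int writeout_shmem_folio(struct folio *folio, struct swap_iocb **plug,
				struct list_head *split_list)
{
	int err;

	/* The folio is passed in locked, as the old ->writepage path did. */
	err = shmem_writeout(folio, plug, split_list);
	if (err == AOP_WRITEPAGE_ACTIVATE) {
		/* Redirtied and still locked: keep the folio in memory. */
		folio_unlock(folio);
		return 0;
	}
	/* Otherwise the folio has already been unlocked by the writeout path. */
	return err;
}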
#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
@@ -1688,24 +1813,17 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
{
unsigned long mask = READ_ONCE(huge_shmem_orders_always);
unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
- unsigned long vm_flags = vma ? vma->vm_flags : 0;
- pgoff_t aligned_index;
- bool global_huge;
- loff_t i_size;
- int order;
+ vm_flags_t vm_flags = vma ? vma->vm_flags : 0;
+ unsigned int global_orders;
if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
return 0;
- global_huge = shmem_huge_global_enabled(inode, index, write_end,
- shmem_huge_force, vm_flags);
- if (!vma || !vma_is_anon_shmem(vma)) {
- /*
- * For tmpfs, we now only support PMD sized THP if huge page
- * is enabled, otherwise fallback to order 0.
- */
- return global_huge ? BIT(HPAGE_PMD_ORDER) : 0;
- }
+ global_orders = shmem_huge_global_enabled(inode, index, write_end,
+ shmem_huge_force, vma, vm_flags);
+ /* Tmpfs huge pages allocation */
+ if (!vma || !vma_is_anon_shmem(vma))
+ return global_orders;
/*
* Following the 'deny' semantics of the top level, force the huge
@@ -1722,22 +1840,12 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
return READ_ONCE(huge_shmem_orders_inherit);
/* Allow mTHP that will be fully within i_size. */
- order = highest_order(within_size_orders);
- while (within_size_orders) {
- aligned_index = round_up(index + 1, 1 << order);
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
- if (i_size >> PAGE_SHIFT >= aligned_index) {
- mask |= within_size_orders;
- break;
- }
-
- order = next_order(&within_size_orders, order);
- }
+ mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0);
if (vm_flags & VM_HUGEPAGE)
mask |= READ_ONCE(huge_shmem_orders_madvise);
- if (global_huge)
+ if (global_orders > 0)
mask |= READ_ONCE(huge_shmem_orders_inherit);
return THP_ORDERS_ALL_FILE_DEFAULT & mask;
@@ -1810,6 +1918,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
struct shmem_inode_info *info = SHMEM_I(inode);
unsigned long suitable_orders = 0;
struct folio *folio = NULL;
+ pgoff_t aligned_index;
long pages;
int error, order;
@@ -1823,10 +1932,12 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
order = highest_order(suitable_orders);
while (suitable_orders) {
pages = 1UL << order;
- index = round_down(index, pages);
- folio = shmem_alloc_folio(gfp, order, info, index);
- if (folio)
+ aligned_index = round_down(index, pages);
+ folio = shmem_alloc_folio(gfp, order, info, aligned_index);
+ if (folio) {
+ index = aligned_index;
goto allocated;
+ }
if (pages == HPAGE_PMD_NR)
count_vm_event(THP_FILE_FALLBACK);
@@ -1903,6 +2014,93 @@ unlock:
return ERR_PTR(error);
}
+static struct folio *shmem_swap_alloc_folio(struct inode *inode,
+ struct vm_area_struct *vma, pgoff_t index,
+ swp_entry_t entry, int order, gfp_t gfp)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ int nr_pages = 1 << order;
+ struct folio *new;
+ gfp_t alloc_gfp;
+ void *shadow;
+
+ /*
+ * We have arrived here because our zones are constrained, so don't
+ * limit chance of success with further cpuset and node constraints.
+ */
+ gfp &= ~GFP_CONSTRAINT_MASK;
+ alloc_gfp = gfp;
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ if (WARN_ON_ONCE(order))
+ return ERR_PTR(-EINVAL);
+ } else if (order) {
+ /*
+ * If uffd is active for the vma, we need per-page fault
+ * fidelity to maintain the uffd semantics, then fallback
+ * to swapin order-0 folio, as well as for zswap case.
+ * Any existing sub folio in the swap cache also blocks
+ * mTHP swapin.
+ */
+ if ((vma && unlikely(userfaultfd_armed(vma))) ||
+ !zswap_never_enabled() ||
+ non_swapcache_batch(entry, nr_pages) != nr_pages)
+ goto fallback;
+
+ alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
+ }
+retry:
+ new = shmem_alloc_folio(alloc_gfp, order, info, index);
+ if (!new) {
+ new = ERR_PTR(-ENOMEM);
+ goto fallback;
+ }
+
+ if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
+ alloc_gfp, entry)) {
+ folio_put(new);
+ new = ERR_PTR(-ENOMEM);
+ goto fallback;
+ }
+
+ /*
+ * Prevent parallel swapin from proceeding with the swap cache flag.
+ *
+	 * Another concurrent scenario is possible: the swap cache flag of a
+	 * large folio has already been set by swapcache_prepare(), while
+	 * another thread may have already split the large swap entry stored
+	 * in the shmem mapping. In this case, shmem_add_to_page_cache() will
+	 * identify the concurrent swapin and return -EEXIST.
+ */
+ if (swapcache_prepare(entry, nr_pages)) {
+ folio_put(new);
+ new = ERR_PTR(-EEXIST);
+ /* Try smaller folio to avoid cache conflict */
+ goto fallback;
+ }
+
+ __folio_set_locked(new);
+ __folio_set_swapbacked(new);
+ new->swap = entry;
+
+ memcg1_swapin(entry, nr_pages);
+ shadow = get_shadow_from_swap_cache(entry);
+ if (shadow)
+ workingset_refault(new, shadow);
+ folio_add_lru(new);
+ swap_read_folio(new, NULL);
+ return new;
+fallback:
+ /* Order 0 swapin failed, nothing to fallback to, abort */
+ if (!order)
+ return new;
+ entry.val += index - round_down(index, nr_pages);
+ alloc_gfp = gfp;
+ nr_pages = 1;
+ order = 0;
+ goto retry;
+}
+
/*
* When a page is moved from swapcache to shmem filecache (either by the
* usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
@@ -2006,7 +2204,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
}
static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
- struct folio *folio, swp_entry_t swap)
+ struct folio *folio, swp_entry_t swap,
+ bool skip_swapcache)
{
struct address_space *mapping = inode->i_mapping;
swp_entry_t swapin_error;
@@ -2022,7 +2221,8 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
nr_pages = folio_nr_pages(folio);
folio_wait_writeback(folio);
- delete_from_swap_cache(folio);
+ if (!skip_swapcache)
+ delete_from_swap_cache(folio);
/*
* Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
* won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
@@ -2037,15 +2237,16 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
{
struct address_space *mapping = inode->i_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
- void *alloced_shadow = NULL;
- int alloced_order = 0, i;
+ int split_order = 0, entry_order;
+ int i;
/* Convert user data gfp flags to xarray node gfp flags */
gfp &= GFP_RECLAIM_MASK;
for (;;) {
- int order = -1, split_order = 0;
void *old = NULL;
+ int cur_order;
+ pgoff_t swap_index;
xas_lock_irq(&xas);
old = xas_load(&xas);
@@ -2054,60 +2255,56 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
goto unlock;
}
- order = xas_get_order(&xas);
+ entry_order = xas_get_order(&xas);
- /* Swap entry may have changed before we re-acquire the lock */
- if (alloced_order &&
- (old != alloced_shadow || order != alloced_order)) {
- xas_destroy(&xas);
- alloced_order = 0;
- }
+ if (!entry_order)
+ goto unlock;
/* Try to split large swap entry in pagecache */
- if (order > 0) {
- if (!alloced_order) {
- split_order = order;
+ cur_order = entry_order;
+ swap_index = round_down(index, 1 << entry_order);
+
+ split_order = xas_try_split_min_order(cur_order);
+
+ while (cur_order > 0) {
+ pgoff_t aligned_index =
+ round_down(index, 1 << cur_order);
+ pgoff_t swap_offset = aligned_index - swap_index;
+
+ xas_set_order(&xas, index, split_order);
+ xas_try_split(&xas, old, cur_order);
+ if (xas_error(&xas))
goto unlock;
- }
- xas_split(&xas, old, order);
/*
* Re-set the swap entry after splitting, and the swap
* offset of the original large entry must be continuous.
*/
- for (i = 0; i < 1 << order; i++) {
- pgoff_t aligned_index = round_down(index, 1 << order);
+ for (i = 0; i < 1 << cur_order;
+ i += (1 << split_order)) {
swp_entry_t tmp;
- tmp = swp_entry(swp_type(swap), swp_offset(swap) + i);
+ tmp = swp_entry(swp_type(swap),
+ swp_offset(swap) + swap_offset +
+ i);
__xa_store(&mapping->i_pages, aligned_index + i,
swp_to_radix_entry(tmp), 0);
}
+ cur_order = split_order;
+ split_order = xas_try_split_min_order(split_order);
}
unlock:
xas_unlock_irq(&xas);
- /* split needed, alloc here and retry. */
- if (split_order) {
- xas_split_alloc(&xas, old, split_order, gfp);
- if (xas_error(&xas))
- goto error;
- alloced_shadow = old;
- alloced_order = split_order;
- xas_reset(&xas);
- continue;
- }
-
if (!xas_nomem(&xas, gfp))
break;
}
-error:
if (xas_error(&xas))
return xas_error(&xas);
- return alloced_order;
+ return 0;
}
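A worked example of the offset bookkeeping in the split loop above, with made-up numbers and without modelling xas_try_split_min_order(): for index 700 inside an order-9 entry, an order-6 pass re-stores sub-entries whose swap offsets stay contiguous with the original entry.

#include <assert.h>

int main(void)
{
	unsigned long index = 700, entry_order = 9, cur_order = 6;
	unsigned long swap_index = index & ~((1UL << entry_order) - 1);   /* 512 */
	unsigned long aligned_index = index & ~((1UL << cur_order) - 1);  /* 640 */
	unsigned long swap_offset = aligned_index - swap_index;           /* 128 */

	/*
	 * The sub-entry re-stored at aligned_index + i carries swap offset
	 * swp_offset(swap) + swap_offset + i, keeping the data contiguous.
	 */
	assert(swap_index == 512 && aligned_index == 640 && swap_offset == 128);
	return 0;
}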
/*
@@ -2124,73 +2321,109 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
struct shmem_inode_info *info = SHMEM_I(inode);
+ swp_entry_t swap, index_entry;
struct swap_info_struct *si;
struct folio *folio = NULL;
- swp_entry_t swap;
- int error, nr_pages;
+ bool skip_swapcache = false;
+ int error, nr_pages, order;
+ pgoff_t offset;
VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
- swap = radix_to_swp_entry(*foliop);
+ index_entry = radix_to_swp_entry(*foliop);
+ swap = index_entry;
*foliop = NULL;
- if (is_poisoned_swp_entry(swap))
+ if (is_poisoned_swp_entry(index_entry))
return -EIO;
- si = get_swap_device(swap);
- if (!si) {
- if (!shmem_confirm_swap(mapping, index, swap))
+ si = get_swap_device(index_entry);
+ order = shmem_confirm_swap(mapping, index, index_entry);
+ if (unlikely(!si)) {
+ if (order < 0)
return -EEXIST;
else
return -EINVAL;
}
+ if (unlikely(order < 0)) {
+ put_swap_device(si);
+ return -EEXIST;
+ }
+
+ /* index may point to the middle of a large entry, get the sub entry */
+ if (order) {
+ offset = index - round_down(index, 1 << order);
+ swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
+ }
/* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0);
if (!folio) {
- int split_order;
-
- /* Or update major stats only when swapin succeeds?? */
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
+ /* Direct swapin skipping swap cache & readahead */
+ folio = shmem_swap_alloc_folio(inode, vma, index,
+ index_entry, order, gfp);
+ if (IS_ERR(folio)) {
+ error = PTR_ERR(folio);
+ folio = NULL;
+ goto failed;
+ }
+ skip_swapcache = true;
+ } else {
+ /* Cached swapin only supports order 0 folio */
+ folio = shmem_swapin_cluster(swap, gfp, info, index);
+ if (!folio) {
+ error = -ENOMEM;
+ goto failed;
+ }
+ }
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
+ }
+ if (order > folio_order(folio)) {
/*
- * Now swap device can only swap in order 0 folio, then we
- * should split the large swap entry stored in the pagecache
- * if necessary.
- */
- split_order = shmem_split_large_entry(inode, index, swap, gfp);
- if (split_order < 0) {
- error = split_order;
- goto failed;
- }
-
- /*
- * If the large swap entry has already been split, it is
- * necessary to recalculate the new swap entry based on
- * the old order alignment.
+ * Swapin may get smaller folios due to various reasons:
+ * It may fallback to order 0 due to memory pressure or race,
+ * swap readahead may swap in order 0 folios into swapcache
+ * asynchronously, while the shmem mapping can still stores
+ * large swap entries. In such cases, we should split the
+ * large swap entry to prevent possible data corruption.
*/
- if (split_order > 0) {
- pgoff_t offset = index - round_down(index, 1 << split_order);
-
- swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
- }
+ error = shmem_split_large_entry(inode, index, index_entry, gfp);
+ if (error)
+ goto failed_nolock;
+ }
- /* Here we actually start the io */
- folio = shmem_swapin_cluster(swap, gfp, info, index);
- if (!folio) {
- error = -ENOMEM;
- goto failed;
- }
+ /*
+ * If the folio is large, round down swap and index by folio size.
+ * No matter what race occurs, the swap layer ensures we either get
+	 * a valid folio whose swap entry is aligned by its size, or a
+	 * temporarily invalid one, in which case we abort very soon and retry.
+ *
+ * shmem_add_to_page_cache ensures the whole range contains expected
+	 * entries and prevents any corruption, so any racing split is fine
+	 * too; it will succeed as long as the entries are still there.
+ */
+ nr_pages = folio_nr_pages(folio);
+ if (nr_pages > 1) {
+ swap.val = round_down(swap.val, nr_pages);
+ index = round_down(index, nr_pages);
}
- /* We have to do this with folio locked to prevent races */
+ /*
+ * We have to do this with the folio locked to prevent races.
+	 * The shmem_confirm_swap below only checks whether the first swap
+	 * entry matches the folio; that's enough to ensure the folio
+ * is not used outside of shmem, as shmem swap entries
+ * and swap cache folios are never partially freed.
+ */
folio_lock(folio);
- if (!folio_test_swapcache(folio) ||
- folio->swap.val != swap.val ||
- !shmem_confirm_swap(mapping, index, swap)) {
+ if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
+ shmem_confirm_swap(mapping, index, swap) < 0 ||
+ folio->swap.val != swap.val) {
error = -EEXIST;
goto unlock;
}
@@ -2213,8 +2446,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
goto failed;
}
- error = shmem_add_to_page_cache(folio, mapping,
- round_down(index, nr_pages),
+ error = shmem_add_to_page_cache(folio, mapping, index,
swp_to_radix_entry(swap), gfp);
if (error)
goto failed;
@@ -2224,7 +2456,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
- delete_from_swap_cache(folio);
+ if (skip_swapcache) {
+ folio->swap.val = 0;
+ swapcache_clear(si, swap, nr_pages);
+ } else {
+ delete_from_swap_cache(folio);
+ }
folio_mark_dirty(folio);
swap_free_nr(swap, nr_pages);
put_swap_device(si);
@@ -2232,15 +2469,19 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
*foliop = folio;
return 0;
failed:
- if (!shmem_confirm_swap(mapping, index, swap))
+ if (shmem_confirm_swap(mapping, index, swap) < 0)
error = -EEXIST;
if (error == -EIO)
- shmem_set_folio_swapin_error(inode, index, folio, swap);
+ shmem_set_folio_swapin_error(inode, index, folio, swap,
+ skip_swapcache);
unlock:
- if (folio) {
+ if (folio)
folio_unlock(folio);
+failed_nolock:
+ if (skip_swapcache)
+ swapcache_clear(si, folio->swap, folio_nr_pages(folio));
+ if (folio)
folio_put(folio);
- }
put_swap_device(si);
return error;
@@ -2752,12 +2993,6 @@ out_nomem:
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
- struct shmem_inode_info *info = SHMEM_I(inode);
- int ret;
-
- ret = seal_check_write(info->seals, vma);
- if (ret)
- return ret;
file_accessed(file);
/* This is anonymous shared memory if it is unlinked at the time of mmap */
@@ -3098,9 +3333,9 @@ static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;
static int
-shmem_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -3121,8 +3356,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
if (ret)
return ret;
- if (folio_test_hwpoison(folio) ||
- (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
+ if (folio_contain_hwpoisoned_page(folio)) {
folio_unlock(folio);
folio_put(folio);
return -EIO;
@@ -3133,9 +3367,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
}
static int
-shmem_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+shmem_write_end(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
@@ -3329,7 +3563,7 @@ static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
size = min_t(size_t, size, PAGE_SIZE - offset);
- if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ if (!pipe_is_full(pipe)) {
struct pipe_buffer *buf = pipe_head_buf(pipe);
*buf = (struct pipe_buffer) {
@@ -3356,7 +3590,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
int error = 0;
/* Work out how much data we can actually add into the pipe */
- used = pipe_occupancy(pipe->head, pipe->tail);
+ used = pipe_buf_usage(pipe);
npages = max_t(ssize_t, pipe->max_usage - used, 0);
len = min_t(size_t, len, npages * PAGE_SIZE);
@@ -3443,7 +3677,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
total_spliced += n;
*ppos += n;
in->f_ra.prev_pos = *ppos;
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (pipe_is_full(pipe))
break;
cond_resched();
@@ -3600,7 +3834,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
index--;
/*
- * Inform shmem_writepage() how far we have reached.
+ * Inform shmem_writeout() how far we have reached.
* No need for lock or barrier: we have the page lock.
*/
if (!folio_test_uptodate(folio))
@@ -3731,16 +3965,16 @@ out_iput:
return error;
}
-static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int error;
error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
if (error)
- return error;
+ return ERR_PTR(error);
inc_nlink(dir);
- return 0;
+ return NULL;
}
static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
@@ -3917,6 +4151,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
int len;
struct inode *inode;
struct folio *folio;
+ char *link;
len = strlen(symname) + 1;
if (len > PAGE_SIZE)
@@ -3938,12 +4173,13 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode->i_size = len-1;
if (len <= SHORT_SYMLINK_LEN) {
- inode->i_link = kmemdup(symname, len, GFP_KERNEL);
- if (!inode->i_link) {
+ link = kmemdup(symname, len, GFP_KERNEL);
+ if (!link) {
error = -ENOMEM;
goto out_remove_offset;
}
inode->i_op = &shmem_short_symlink_operations;
+ inode_set_cached_link(inode, link, len - 1);
} else {
inode_nohighmem(inode);
inode->i_mapping->a_ops = &shmem_aops;
@@ -4014,7 +4250,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
#ifdef CONFIG_TMPFS_XATTR
-static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
@@ -4024,7 +4260,7 @@ static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
static int shmem_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -4507,7 +4743,6 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
"Turning off swap in unprivileged tmpfs mounts unsupported");
}
ctx->noswap = true;
- ctx->seen |= SHMEM_SEEN_NOSWAP;
break;
case Opt_quota:
if (fc->user_ns != &init_user_ns)
@@ -4583,48 +4818,37 @@ bad_value:
return invalfc(fc, "Bad value for '%s'", param->key);
}
-static int shmem_parse_options(struct fs_context *fc, void *data)
+static char *shmem_next_opt(char **s)
{
- char *options = data;
+ char *sbegin = *s;
+ char *p;
- if (options) {
- int err = security_sb_eat_lsm_opts(options, &fc->security);
- if (err)
- return err;
- }
+ if (sbegin == NULL)
+ return NULL;
- while (options != NULL) {
- char *this_char = options;
- for (;;) {
- /*
- * NUL-terminate this option: unfortunately,
- * mount options form a comma-separated list,
- * but mpol's nodelist may also contain commas.
- */
- options = strchr(options, ',');
- if (options == NULL)
- break;
- options++;
- if (!isdigit(*options)) {
- options[-1] = '\0';
- break;
- }
- }
- if (*this_char) {
- char *value = strchr(this_char, '=');
- size_t len = 0;
- int err;
-
- if (value) {
- *value++ = '\0';
- len = strlen(value);
- }
- err = vfs_parse_fs_string(fc, this_char, value, len);
- if (err < 0)
- return err;
+ /*
+ * NUL-terminate this option: unfortunately,
+ * mount options form a comma-separated list,
+ * but mpol's nodelist may also contain commas.
+ */
+ for (;;) {
+ p = strchr(*s, ',');
+ if (p == NULL)
+ break;
+ *s = p + 1;
+ if (!isdigit(*(p+1))) {
+ *p = '\0';
+ return sbegin;
}
}
- return 0;
+
+ *s = NULL;
+ return sbegin;
+}
+
+static int shmem_parse_monolithic(struct fs_context *fc, void *data)
+{
+ return vfs_parse_monolithic_sep(fc, data, shmem_next_opt);
}
/*
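As a quick illustration of the splitting rule implemented by shmem_next_opt() above, here is a standalone userspace sketch (hypothetical helper name, not part of the patch): a comma followed by a digit is treated as part of an mpol nodelist, while any other comma terminates the current option.

/* Minimal userspace model of the shmem mount-option tokenizer. */
#include <ctype.h>
#include <stdio.h>
#include <string.h>

static char *next_opt(char **s)
{
        char *sbegin = *s, *p;

        if (!sbegin)
                return NULL;
        for (;;) {
                p = strchr(*s, ',');
                if (!p)
                        break;
                *s = p + 1;
                if (!isdigit((unsigned char)p[1])) {
                        *p = '\0';
                        return sbegin;
                }
        }
        *s = NULL;
        return sbegin;
}

int main(void)
{
        char buf[] = "size=10M,mpol=bind:0,2,huge=always";
        char *s = buf, *opt;

        while ((opt = next_opt(&s)))
                printf("option: %s\n", opt);
        /* Prints "size=10M", "mpol=bind:0,2" and "huge=always". */
        return 0;
}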
@@ -4668,14 +4892,15 @@ static int shmem_reconfigure(struct fs_context *fc)
err = "Current inum too high to switch to 32-bit inums";
goto out;
}
- if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
+
+ /*
+ * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap"
+ * counterpart for (re-)enabling swap.
+ */
+ if (ctx->noswap && !sbinfo->noswap) {
err = "Cannot disable swap on remount";
goto out;
}
- if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
- err = "Cannot enable swap on remount if it was disabled on first mount";
- goto out;
- }
if (ctx->seen & SHMEM_SEEN_QUOTA &&
!sb_any_quota_loaded(fc->root->d_sb)) {
@@ -4822,7 +5047,6 @@ static void shmem_put_super(struct super_block *sb)
static const struct dentry_operations shmem_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
.d_compare = generic_ci_d_compare,
- .d_delete = always_delete_dentry,
};
#endif
@@ -4870,7 +5094,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
if (ctx->encoding) {
sb->s_encoding = ctx->encoding;
- sb->s_d_op = &shmem_ci_dentry_ops;
+ set_default_d_op(sb, &shmem_ci_dentry_ops);
if (ctx->strict_encoding)
sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL;
}
@@ -4879,6 +5103,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#else
sb->s_flags |= SB_NOUSER;
#endif /* CONFIG_TMPFS */
+ sb->s_d_flags |= DCACHE_DONTCACHE;
sbinfo->max_blocks = ctx->blocks;
sbinfo->max_inodes = ctx->inodes;
sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
@@ -4891,7 +5116,12 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sbinfo->gid = ctx->gid;
sbinfo->full_inums = ctx->full_inums;
sbinfo->mode = ctx->mode;
- sbinfo->huge = ctx->huge;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (ctx->seen & SHMEM_SEEN_HUGE)
+ sbinfo->huge = ctx->huge;
+ else
+ sbinfo->huge = tmpfs_huge;
+#endif
sbinfo->mpol = ctx->mpol;
ctx->mpol = NULL;
@@ -4969,7 +5199,7 @@ static const struct fs_context_operations shmem_fs_context_ops = {
.free = shmem_free_fc,
.get_tree = shmem_get_tree,
#ifdef CONFIG_TMPFS
- .parse_monolithic = shmem_parse_options,
+ .parse_monolithic = shmem_parse_monolithic,
.parse_param = shmem_parse_one,
.reconfigure = shmem_reconfigure,
#endif
@@ -5027,7 +5257,6 @@ static int shmem_error_remove_folio(struct address_space *mapping,
}
static const struct address_space_operations shmem_aops = {
- .writepage = shmem_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_TMPFS
.write_begin = shmem_write_begin,
@@ -5442,6 +5671,21 @@ static int __init setup_transparent_hugepage_shmem(char *str)
}
__setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem);
+static int __init setup_transparent_hugepage_tmpfs(char *str)
+{
+ int huge;
+
+ huge = shmem_parse_huge(str);
+ if (huge < 0) {
+ pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n");
+ return huge;
+ }
+
+ tmpfs_huge = huge;
+ return 1;
+}
+__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs);
+
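For reference, the new parameter is given on the kernel command line much like the existing shmem knob, e.g. (illustrative value; any string accepted by shmem_parse_huge() works):

        transparent_hugepage_tmpfs=within_size

It only supplies the default for sbinfo->huge: a tmpfs mount that passes huge= explicitly still takes precedence, as the SHMEM_SEEN_HUGE check added to shmem_fill_super() earlier in this patch shows.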
static char str_dup[PAGE_SIZE] __initdata;
static int __init setup_thp_shmem(char *str)
{
@@ -5482,19 +5726,19 @@ static int __init setup_thp_shmem(char *str)
THP_ORDERS_ALL_FILE_DEFAULT);
}
- if (start == -EINVAL) {
+ if (start < 0) {
pr_err("invalid size %s in thp_shmem boot parameter\n",
start_size);
goto err;
}
- if (end == -EINVAL) {
+ if (end < 0) {
pr_err("invalid size %s in thp_shmem boot parameter\n",
end_size);
goto err;
}
- if (start < 0 || end < 0 || start > end)
+ if (start > end)
goto err;
nr = end - start + 1;
@@ -5631,12 +5875,12 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
if (size < 0 || size > MAX_LFS_FILESIZE)
return ERR_PTR(-EINVAL);
- if (shmem_acct_size(flags, size))
- return ERR_PTR(-ENOMEM);
-
if (is_idmapped_mnt(mnt))
return ERR_PTR(-EINVAL);
+ if (shmem_acct_size(flags, size))
+ return ERR_PTR(-ENOMEM);
+
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
S_IFREG | S_IRWXUGO, 0, flags);
if (IS_ERR(inode)) {
@@ -5661,7 +5905,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
* underlying inode. So users of this interface must do LSM checks at a
* higher layer. The users are the big_key and shm implementations. LSM
* checks are provided at the key or shm level rather than the inode.
- * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
*/
@@ -5673,7 +5917,7 @@ EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
/**
* shmem_file_setup - get an unlinked file living in tmpfs
- * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
*/
@@ -5686,7 +5930,7 @@ EXPORT_SYMBOL_GPL(shmem_file_setup);
/**
* shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
* @mnt: the tmpfs mount where the file will be created
- * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
*/
@@ -5747,8 +5991,8 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping,
struct folio *folio;
int error;
- error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
- gfp, NULL, NULL);
+ error = shmem_get_folio_gfp(inode, index, i_size_read(inode),
+ &folio, SGP_CACHE, gfp, NULL, NULL);
if (error)
return ERR_PTR(error);
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 43afb56abbd3..41999e94a56d 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -94,26 +94,20 @@ void si_meminfo_node(struct sysinfo *val, int nid)
unsigned long free_highpages = 0;
pg_data_t *pgdat = NODE_DATA(nid);
- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
- managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
- val->totalram = managed_pages;
- val->sharedram = node_page_state(pgdat, NR_SHMEM);
- val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
-#ifdef CONFIG_HIGHMEM
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
-
+ managed_pages += zone_managed_pages(zone);
if (is_highmem(zone)) {
managed_highpages += zone_managed_pages(zone);
free_highpages += zone_page_state(zone, NR_FREE_PAGES);
}
}
+
+ val->totalram = managed_pages;
+ val->sharedram = node_page_state(pgdat, NR_SHMEM);
+ val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
val->totalhigh = managed_highpages;
val->freehigh = free_highpages;
-#else
- val->totalhigh = managed_highpages;
- val->freehigh = free_highpages;
-#endif
val->mem_unit = PAGE_SIZE;
}
#endif
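With the loops unified above, the highmem counters are simply left at zero on configurations without highmem zones. A minimal caller-side sketch (assuming a valid, online node id):

        struct sysinfo si;

        si_meminfo_node(&si, nid);
        /* Counters are in units of si.mem_unit bytes (PAGE_SIZE here). */
        pr_info("node %d: total %lu kB, free %lu kB, shmem %lu kB\n", nid,
                si.totalram << (PAGE_SHIFT - 10),
                si.freeram << (PAGE_SHIFT - 10),
                si.sharedram << (PAGE_SHIFT - 10));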
@@ -223,7 +217,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
global_node_page_state(NR_SHMEM),
global_node_page_state(NR_PAGETABLE),
global_node_page_state(NR_SECONDARY_PAGETABLE),
- global_zone_page_state(NR_BOUNCE),
+ 0UL,
global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
global_zone_page_state(NR_FREE_PAGES),
free_pcp,
@@ -252,7 +246,6 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
" shmem_pmdmapped:%lukB"
" anon_thp:%lukB"
#endif
- " writeback_tmp:%lukB"
" kernel_stack:%lukB"
#ifdef CONFIG_SHADOW_CALL_STACK
" shadow_call_stack:%lukB"
@@ -260,6 +253,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
" pagetables:%lukB"
" sec_pagetables:%lukB"
" all_unreclaimable? %s"
+ " Balloon:%lukB"
"\n",
pgdat->node_id,
K(node_page_state(pgdat, NR_ACTIVE_ANON)),
@@ -278,14 +272,14 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
K(node_page_state(pgdat, NR_ANON_THPS)),
#endif
- K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
node_page_state(pgdat, NR_KERNEL_STACK_KB),
#ifdef CONFIG_SHADOW_CALL_STACK
node_page_state(pgdat, NR_KERNEL_SCS_KB),
#endif
K(node_page_state(pgdat, NR_PAGETABLE)),
K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
- str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES));
+ str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES),
+ K(node_page_state(pgdat, NR_BALLOON_PAGES)));
}
for_each_populated_zone(zone) {
@@ -309,6 +303,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
" low:%lukB"
" high:%lukB"
" reserved_highatomic:%luKB"
+ " free_highatomic:%luKB"
" active_anon:%lukB"
" inactive_anon:%lukB"
" active_file:%lukB"
@@ -330,6 +325,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
K(zone->nr_reserved_highatomic),
+ K(zone->nr_free_highatomic),
K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
@@ -339,7 +335,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(zone->present_pages),
K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_BOUNCE)),
+ 0UL,
K(free_pcp),
K(this_cpu_read(zone->per_cpu_pageset->count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index 4a85b94d12ce..20eaee3e97f7 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -195,8 +195,6 @@ int shrinker_debugfs_add(struct shrinker *shrinker)
int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
{
- struct dentry *entry;
- char buf[128];
const char *new, *old;
va_list ap;
int ret = 0;
@@ -213,23 +211,17 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
old = shrinker->name;
shrinker->name = new;
- if (shrinker->debugfs_entry) {
- snprintf(buf, sizeof(buf), "%s-%d", shrinker->name,
- shrinker->debugfs_id);
-
- entry = debugfs_rename(shrinker_debugfs_root,
- shrinker->debugfs_entry,
- shrinker_debugfs_root, buf);
- if (IS_ERR(entry))
- ret = PTR_ERR(entry);
- else
- shrinker->debugfs_entry = entry;
- }
+ ret = debugfs_change_name(shrinker->debugfs_entry, "%s-%d",
+ shrinker->name, shrinker->debugfs_id);
+ if (ret) {
+ shrinker->name = old;
+ kfree_const(new);
+ } else {
+ kfree_const(old);
+ }
mutex_unlock(&shrinker_mutex);
- kfree_const(old);
-
return ret;
}
EXPORT_SYMBOL(shrinker_debugfs_rename);
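With debugfs_change_name() the rename either succeeds or leaves the entry untouched, which is why the error path above can simply restore the old name. A caller-side sketch (hypothetical shrinker and format string, assuming it is already registered with debugfs):

        int err;

        err = shrinker_debugfs_rename(my_shrinker, "mycache-node%d", nid);
        if (err)
                pr_warn("shrinker debugfs rename failed (%d), keeping old name\n", err);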
diff --git a/mm/slab.h b/mm/slab.h
index 632fedd71fea..0dd3f33c0963 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -50,7 +50,7 @@ typedef union {
/* Reuses the bits in struct page */
struct slab {
- unsigned long __page_flags;
+ unsigned long flags;
struct kmem_cache *slab_cache;
union {
@@ -99,7 +99,7 @@ struct slab {
#define SLAB_MATCH(pg, sl) \
static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
-SLAB_MATCH(flags, __page_flags);
+SLAB_MATCH(flags, flags);
SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */
SLAB_MATCH(_refcount, __page_refcount);
#ifdef CONFIG_MEMCG
@@ -128,7 +128,7 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)
/**
* slab_folio - The folio allocated for a slab
- * @slab: The slab.
+ * @s: The slab.
*
* Slabs are allocated as folios that contain the individual objects and are
* using some fields in the first struct page of the folio - those fields are
@@ -159,7 +159,7 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)
/**
* slab_page - The first struct page allocated for a slab
- * @slab: The slab.
+ * @s: The slab.
*
* A convenience wrapper for converting slab to the first struct page of the
* underlying folio, to communicate with code not yet converted to folio or
@@ -167,30 +167,6 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)
*/
#define slab_page(s) folio_page(slab_folio(s), 0)
-/*
- * If network-based swap is enabled, sl*b must keep track of whether pages
- * were allocated from pfmemalloc reserves.
- */
-static inline bool slab_test_pfmemalloc(const struct slab *slab)
-{
- return folio_test_active(slab_folio(slab));
-}
-
-static inline void slab_set_pfmemalloc(struct slab *slab)
-{
- folio_set_active(slab_folio(slab));
-}
-
-static inline void slab_clear_pfmemalloc(struct slab *slab)
-{
- folio_clear_active(slab_folio(slab));
-}
-
-static inline void __slab_clear_pfmemalloc(struct slab *slab)
-{
- __folio_clear_active(slab_folio(slab));
-}
-
static inline void *slab_address(const struct slab *slab)
{
return folio_address(slab_folio(slab));
@@ -457,39 +433,17 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
}
-/* Legal flag mask for kmem_cache_create(), for various configurations */
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_PANIC | \
- SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )
+ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
+ SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
+ SLAB_TEMPORARY | SLAB_ACCOUNT | \
+ SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
-#ifdef CONFIG_SLUB_DEBUG
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
-#else
-#define SLAB_DEBUG_FLAGS (0)
-#endif
-#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | SLAB_ACCOUNT | \
- SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
-
-/* Common flags available with current configuration */
-#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
-
-/* Common flags permitted for kmem_cache_create */
-#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \
- SLAB_RED_ZONE | \
- SLAB_POISON | \
- SLAB_STORE_USER | \
- SLAB_TRACE | \
- SLAB_CONSISTENCY_CHECKS | \
- SLAB_NOLEAKTRACE | \
- SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | \
- SLAB_ACCOUNT | \
- SLAB_KMALLOC | \
- SLAB_NO_MERGE | \
- SLAB_NO_USER_FLAGS)
+#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS)
bool __kmem_cache_empty(struct kmem_cache *);
int __kmem_cache_shutdown(struct kmem_cache *);
@@ -572,8 +526,12 @@ static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
unsigned long obj_exts = READ_ONCE(slab->obj_exts);
#ifdef CONFIG_MEMCG
- VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS),
- slab_page(slab));
+ /*
+ * obj_exts should be NULL, a valid pointer with the
+ * MEMCG_DATA_OBJEXTS bit set, or equal to OBJEXTS_ALLOC_FAIL.
+ */
+ VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS) &&
+ obj_exts != OBJEXTS_ALLOC_FAIL, slab_page(slab));
VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab));
#endif
return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK);
@@ -604,6 +562,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects, struct slabobj_ext *obj_exts);
#endif
+void kvfree_rcu_cb(struct rcu_head *head);
+
size_t __ksize(const void *objp);
static inline size_t slab_ksize(const struct kmem_cache *s)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a29457bef626..bfe7c40eeee1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -2,7 +2,7 @@
/*
* Slab allocator functions that are independent of the allocator strategy
*
- * (C) 2012 Christoph Lameter <cl@linux.com>
+ * (C) 2012 Christoph Lameter <cl@gentwo.org>
*/
#include <linux/slab.h>
@@ -28,7 +28,9 @@
#include <asm/page.h>
#include <linux/memcontrol.h>
#include <linux/stackdepot.h>
+#include <trace/events/rcu.h>
+#include "../kernel/rcu/rcu.h"
#include "internal.h"
#include "slab.h"
@@ -296,6 +298,8 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
static_branch_enable(&slub_debug_enabled);
if (flags & SLAB_STORE_USER)
stack_depot_init();
+#else
+ flags &= ~SLAB_DEBUG_FLAGS;
#endif
mutex_lock(&slab_mutex);
@@ -305,20 +309,11 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
goto out_unlock;
}
- /* Refuse requests with allocator specific flags */
if (flags & ~SLAB_FLAGS_PERMITTED) {
err = -EINVAL;
goto out_unlock;
}
- /*
- * Some allocators will constraint the set of valid flags to a subset
- * of all flags. We expect them to define CACHE_CREATE_MASK in this
- * case, and we'll just provide them with a sanitized version of the
- * passed flags.
- */
- flags &= CACHE_CREATE_MASK;
-
/* Fail closed on bad usersize of useroffset values. */
if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
WARN_ON(!args->usersize && args->useroffset) ||
@@ -1282,3 +1277,908 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
EXPORT_TRACEPOINT_SYMBOL(kfree);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
+#ifndef CONFIG_KVFREE_RCU_BATCHED
+
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+{
+ if (head) {
+ kasan_record_aux_stack(ptr);
+ call_rcu(head, kvfree_rcu_cb);
+ return;
+ }
+
+ // kvfree_rcu(one_arg) call.
+ might_sleep();
+ synchronize_rcu();
+ kvfree(ptr);
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+
+void __init kvfree_rcu_init(void)
+{
+}
+
+#else /* CONFIG_KVFREE_RCU_BATCHED */
+
+/*
+ * This rcu parameter is runtime-read-only. It sets the minimum
+ * number of objects that can be cached per CPU; each cached
+ * object is one page in size. The value can only be changed
+ * at boot time.
+ */
+static int rcu_min_cached_objs = 5;
+module_param(rcu_min_cached_objs, int, 0444);
+
+// A page shrinker can ask for pages to be freed to make them
+// available for other parts of the system. This usually happens
+// under low memory conditions, and in that case we should also
+// defer page-cache filling for a short time period.
+//
+// The default value is 5 seconds, which is long enough to reduce
+// interference with the shrinker while it asks other systems to
+// drain their caches.
+static int rcu_delay_page_cache_fill_msec = 5000;
+module_param(rcu_delay_page_cache_fill_msec, int, 0444);
+
+static struct workqueue_struct *rcu_reclaim_wq;
+
+/* Maximum number of jiffies to wait before draining a batch. */
+#define KFREE_DRAIN_JIFFIES (5 * HZ)
+#define KFREE_N_BATCHES 2
+#define FREE_N_CHANNELS 2
+
+/**
+ * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
+ * @list: List node. All blocks are linked to one another
+ * @gp_snap: Snapshot of RCU state for objects placed in this block
+ * @nr_records: Number of active pointers in the array
+ * @records: Array of the kvfree_rcu() pointers
+ */
+struct kvfree_rcu_bulk_data {
+ struct list_head list;
+ struct rcu_gp_oldstate gp_snap;
+ unsigned long nr_records;
+ void *records[] __counted_by(nr_records);
+};
+
+/*
+ * This macro defines how many entries the "records" array
+ * will contain. It is sized so that a struct kvfree_rcu_bulk_data,
+ * header plus records array, occupies exactly one page.
+ */
+#define KVFREE_BULK_MAX_ENTR \
+ ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
+
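As a worked example (assuming a common 64-bit configuration: 4 KiB pages, 8-byte pointers, 16-byte struct list_head and struct rcu_gp_oldstate, 8-byte unsigned long), the header takes 16 + 16 + 8 = 40 bytes, so KVFREE_BULK_MAX_ENTR evaluates to (4096 - 40) / 8 = 507 record slots per page.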
+/**
+ * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
+ * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
+ * @head_free: List of kfree_rcu() objects waiting for a grace period
+ * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
+ * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
+ * @krcp: Pointer to @kfree_rcu_cpu structure
+ */
+
+struct kfree_rcu_cpu_work {
+ struct rcu_work rcu_work;
+ struct rcu_head *head_free;
+ struct rcu_gp_oldstate head_free_gp_snap;
+ struct list_head bulk_head_free[FREE_N_CHANNELS];
+ struct kfree_rcu_cpu *krcp;
+};
+
+/**
+ * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
+ * @head: List of kfree_rcu() objects not yet waiting for a grace period
+ * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
+ * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
+ * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
+ * @lock: Synchronize access to this structure
+ * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
+ * @initialized: The @rcu_work fields have been initialized
+ * @head_count: Number of objects in rcu_head singular list
+ * @bulk_count: Number of objects in bulk-list
+ * @bkvcache:
+ * A simple cache list that holds objects for reuse.
+ * To save some per-cpu space the list is singly linked.
+ * Even though the llist itself is lockless, access has to be
+ * protected by the per-cpu lock.
+ * @page_cache_work: A work to refill the cache when it is empty
+ * @backoff_page_cache_fill: Delay cache refills
+ * @work_in_progress: Indicates that page_cache_work is running
+ * @hrtimer: A hrtimer for scheduling a page_cache_work
+ * @nr_bkv_objs: number of allocated objects at @bkvcache.
+ *
+ * This is a per-CPU structure. The reason that it is not included in
+ * the rcu_data structure is to permit this code to be extracted from
+ * the RCU files. Such extraction could allow further optimization of
+ * the interactions with the slab allocators.
+ */
+struct kfree_rcu_cpu {
+ // Objects queued on a linked list
+ // through their rcu_head structures.
+ struct rcu_head *head;
+ unsigned long head_gp_snap;
+ atomic_t head_count;
+
+ // Objects queued on a bulk-list.
+ struct list_head bulk_head[FREE_N_CHANNELS];
+ atomic_t bulk_count[FREE_N_CHANNELS];
+
+ struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
+ raw_spinlock_t lock;
+ struct delayed_work monitor_work;
+ bool initialized;
+
+ struct delayed_work page_cache_work;
+ atomic_t backoff_page_cache_fill;
+ atomic_t work_in_progress;
+ struct hrtimer hrtimer;
+
+ struct llist_head bkvcache;
+ int nr_bkv_objs;
+};
+
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
+};
+
+static __always_inline void
+debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
+{
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+ int i;
+
+ for (i = 0; i < bhead->nr_records; i++)
+ debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
+#endif
+}
+
+static inline struct kfree_rcu_cpu *
+krc_this_cpu_lock(unsigned long *flags)
+{
+ struct kfree_rcu_cpu *krcp;
+
+ local_irq_save(*flags); // For safely calling this_cpu_ptr().
+ krcp = this_cpu_ptr(&krc);
+ raw_spin_lock(&krcp->lock);
+
+ return krcp;
+}
+
+static inline void
+krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
+{
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+static inline struct kvfree_rcu_bulk_data *
+get_cached_bnode(struct kfree_rcu_cpu *krcp)
+{
+ if (!krcp->nr_bkv_objs)
+ return NULL;
+
+ WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
+ return (struct kvfree_rcu_bulk_data *)
+ llist_del_first(&krcp->bkvcache);
+}
+
+static inline bool
+put_cached_bnode(struct kfree_rcu_cpu *krcp,
+ struct kvfree_rcu_bulk_data *bnode)
+{
+ // Check the limit.
+ if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
+ return false;
+
+ llist_add((struct llist_node *) bnode, &krcp->bkvcache);
+ WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
+ return true;
+}
+
+static int
+drain_page_cache(struct kfree_rcu_cpu *krcp)
+{
+ unsigned long flags;
+ struct llist_node *page_list, *pos, *n;
+ int freed = 0;
+
+ if (!rcu_min_cached_objs)
+ return 0;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ page_list = llist_del_all(&krcp->bkvcache);
+ WRITE_ONCE(krcp->nr_bkv_objs, 0);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ llist_for_each_safe(pos, n, page_list) {
+ free_page((unsigned long)pos);
+ freed++;
+ }
+
+ return freed;
+}
+
+static void
+kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
+ struct kvfree_rcu_bulk_data *bnode, int idx)
+{
+ unsigned long flags;
+ int i;
+
+ if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
+ debug_rcu_bhead_unqueue(bnode);
+ rcu_lock_acquire(&rcu_callback_map);
+ if (idx == 0) { // kmalloc() / kfree().
+ trace_rcu_invoke_kfree_bulk_callback(
+ "slab", bnode->nr_records,
+ bnode->records);
+
+ kfree_bulk(bnode->nr_records, bnode->records);
+ } else { // vmalloc() / vfree().
+ for (i = 0; i < bnode->nr_records; i++) {
+ trace_rcu_invoke_kvfree_callback(
+ "slab", bnode->records[i], 0);
+
+ vfree(bnode->records[i]);
+ }
+ }
+ rcu_lock_release(&rcu_callback_map);
+ }
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ if (put_cached_bnode(krcp, bnode))
+ bnode = NULL;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (bnode)
+ free_page((unsigned long) bnode);
+
+ cond_resched_tasks_rcu_qs();
+}
+
+static void
+kvfree_rcu_list(struct rcu_head *head)
+{
+ struct rcu_head *next;
+
+ for (; head; head = next) {
+ void *ptr = (void *) head->func;
+ unsigned long offset = (void *) head - ptr;
+
+ next = head->next;
+ debug_rcu_head_unqueue((struct rcu_head *)ptr);
+ rcu_lock_acquire(&rcu_callback_map);
+ trace_rcu_invoke_kvfree_callback("slab", head, offset);
+
+ kvfree(ptr);
+
+ rcu_lock_release(&rcu_callback_map);
+ cond_resched_tasks_rcu_qs();
+ }
+}
+
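Note how the linked-list ("Channel 3") path recovers the object: kvfree_call_rcu() below stores the object base in head->func (head->func = ptr), so the loop above reads ptr back from head->func and derives the rcu_head's offset within the object as (void *)head - ptr, which is only used for tracing. For instance, if the rcu_head sits 24 bytes into the object, offset is 24 and ptr is what gets handed to kvfree().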
+/*
+ * This function is invoked in workqueue context after a grace period.
+ * It frees all the objects queued on ->bulk_head_free or ->head_free.
+ */
+static void kfree_rcu_work(struct work_struct *work)
+{
+ unsigned long flags;
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct list_head bulk_head[FREE_N_CHANNELS];
+ struct rcu_head *head;
+ struct kfree_rcu_cpu *krcp;
+ struct kfree_rcu_cpu_work *krwp;
+ struct rcu_gp_oldstate head_gp_snap;
+ int i;
+
+ krwp = container_of(to_rcu_work(work),
+ struct kfree_rcu_cpu_work, rcu_work);
+ krcp = krwp->krcp;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ // Channels 1 and 2.
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
+
+ // Channel 3.
+ head = krwp->head_free;
+ krwp->head_free = NULL;
+ head_gp_snap = krwp->head_free_gp_snap;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ // Handle the first two channels.
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ // Start from the tail page, so a GP is likely passed for it.
+ list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
+ }
+
+ /*
+ * This is used when the "bulk" path cannot be used for the
+ * double-argument form of kvfree_rcu(). This happens when the
+ * page-cache is empty, which means that objects are instead
+ * queued on a linked list through their rcu_head structures.
+ * This list is named "Channel 3".
+ */
+ if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
+ kvfree_rcu_list(head);
+}
+
+static bool
+need_offload_krc(struct kfree_rcu_cpu *krcp)
+{
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (!list_empty(&krcp->bulk_head[i]))
+ return true;
+
+ return !!READ_ONCE(krcp->head);
+}
+
+static bool
+need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
+{
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (!list_empty(&krwp->bulk_head_free[i]))
+ return true;
+
+ return !!krwp->head_free;
+}
+
+static int krc_count(struct kfree_rcu_cpu *krcp)
+{
+ int sum = atomic_read(&krcp->head_count);
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ sum += atomic_read(&krcp->bulk_count[i]);
+
+ return sum;
+}
+
+static void
+__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+{
+ long delay, delay_left;
+
+ delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
+ if (delayed_work_pending(&krcp->monitor_work)) {
+ delay_left = krcp->monitor_work.timer.expires - jiffies;
+ if (delay < delay_left)
+ mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
+ return;
+ }
+ queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
+}
+
+static void
+schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ __schedule_delayed_monitor_work(krcp);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+static void
+kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
+{
+ struct list_head bulk_ready[FREE_N_CHANNELS];
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct rcu_head *head_ready = NULL;
+ unsigned long flags;
+ int i;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ INIT_LIST_HEAD(&bulk_ready[i]);
+
+ list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
+ if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
+ break;
+
+ atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
+ list_move(&bnode->list, &bulk_ready[i]);
+ }
+ }
+
+ if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
+ head_ready = krcp->head;
+ atomic_set(&krcp->head_count, 0);
+ WRITE_ONCE(krcp->head, NULL);
+ }
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
+ }
+
+ if (head_ready)
+ kvfree_rcu_list(head_ready);
+}
+
+/*
+ * Return: %true if a work is queued, %false otherwise.
+ */
+static bool
+kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
+{
+ unsigned long flags;
+ bool queued = false;
+ int i, j;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+
+ // Attempt to start a new batch.
+ for (i = 0; i < KFREE_N_BATCHES; i++) {
+ struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
+
+ // Try to detach bulk_head or head and attach it to krwp, but
+ // only when all channels are free. If any channel is busy,
+ // there is still on-going RCU work at this krwp handling a
+ // previous batch of frees.
+ if (need_wait_for_krwp_work(krwp))
+ continue;
+
+ // kvfree_rcu_drain_ready() might handle this krcp, if so give up.
+ if (need_offload_krc(krcp)) {
+ // Channel 1 corresponds to the SLAB-pointer bulk path.
+ // Channel 2 corresponds to vmalloc-pointer bulk path.
+ for (j = 0; j < FREE_N_CHANNELS; j++) {
+ if (list_empty(&krwp->bulk_head_free[j])) {
+ atomic_set(&krcp->bulk_count[j], 0);
+ list_replace_init(&krcp->bulk_head[j],
+ &krwp->bulk_head_free[j]);
+ }
+ }
+
+ // Channel 3 corresponds to both SLAB and vmalloc
+ // objects queued on the linked list.
+ if (!krwp->head_free) {
+ krwp->head_free = krcp->head;
+ get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
+ atomic_set(&krcp->head_count, 0);
+ WRITE_ONCE(krcp->head, NULL);
+ }
+
+ // One work item covers one batch, and a batch spans all
+ // three "free channels". Break the loop: we are done with
+ // this CPU, and queuing the RCU work here _always_ succeeds.
+ queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work);
+ WARN_ON_ONCE(!queued);
+ break;
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ return queued;
+}
+
+/*
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ */
+static void kfree_rcu_monitor(struct work_struct *work)
+{
+ struct kfree_rcu_cpu *krcp = container_of(work,
+ struct kfree_rcu_cpu, monitor_work.work);
+
+ // Drain ready for reclaim.
+ kvfree_rcu_drain_ready(krcp);
+
+ // Queue a batch for a rest.
+ kvfree_rcu_queue_batch(krcp);
+
+ // If there is nothing to detach, our job here is done. If at
+ // least one channel is still busy, rearm the work to try
+ // again later, because previous batches are still in progress.
+ if (need_offload_krc(krcp))
+ schedule_delayed_monitor_work(krcp);
+}
+
+static void fill_page_cache_func(struct work_struct *work)
+{
+ struct kvfree_rcu_bulk_data *bnode;
+ struct kfree_rcu_cpu *krcp =
+ container_of(work, struct kfree_rcu_cpu,
+ page_cache_work.work);
+ unsigned long flags;
+ int nr_pages;
+ bool pushed;
+ int i;
+
+ nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
+ 1 : rcu_min_cached_objs;
+
+ for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
+ bnode = (struct kvfree_rcu_bulk_data *)
+ __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+
+ if (!bnode)
+ break;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ pushed = put_cached_bnode(krcp, bnode);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (!pushed) {
+ free_page((unsigned long) bnode);
+ break;
+ }
+ }
+
+ atomic_set(&krcp->work_in_progress, 0);
+ atomic_set(&krcp->backoff_page_cache_fill, 0);
+}
+
+// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
+// state specified by flags. If can_alloc is true, the caller must
+// be schedulable and not be holding any locks or mutexes that might be
+// acquired by the memory allocator or anything that it might invoke.
+// Returns true if ptr was successfully recorded, else the caller must
+// use a fallback.
+static inline bool
+add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
+ unsigned long *flags, void *ptr, bool can_alloc)
+{
+ struct kvfree_rcu_bulk_data *bnode;
+ int idx;
+
+ *krcp = krc_this_cpu_lock(flags);
+ if (unlikely(!(*krcp)->initialized))
+ return false;
+
+ idx = !!is_vmalloc_addr(ptr);
+ bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
+ struct kvfree_rcu_bulk_data, list);
+
+ /* Check if a new block is required. */
+ if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
+ bnode = get_cached_bnode(*krcp);
+ if (!bnode && can_alloc) {
+ krc_this_cpu_unlock(*krcp, *flags);
+
+ // __GFP_NORETRY - allows light-weight direct reclaim,
+ // which is fine because it minimizes how often the
+ // fallback path is hit. It also forbids invoking the
+ // OOM killer, which is beneficial since we are about
+ // to release memory soon anyway.
+ //
+ // __GFP_NOMEMALLOC - prevents dipping into the memory
+ // reserves. Note that we have a fallback path.
+ //
+ // __GFP_NOWARN - the allocation is expected to fail under
+ // low-memory or high memory-pressure conditions, so do
+ // not warn about it.
+ bnode = (struct kvfree_rcu_bulk_data *)
+ __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
+ }
+
+ if (!bnode)
+ return false;
+
+ // Initialize the new block and attach it.
+ bnode->nr_records = 0;
+ list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
+ }
+
+ // Finally insert and update the GP for this page.
+ bnode->nr_records++;
+ bnode->records[bnode->nr_records - 1] = ptr;
+ get_state_synchronize_rcu_full(&bnode->gp_snap);
+ atomic_inc(&(*krcp)->bulk_count[idx]);
+
+ return true;
+}
+
+static enum hrtimer_restart
+schedule_page_work_fn(struct hrtimer *t)
+{
+ struct kfree_rcu_cpu *krcp =
+ container_of(t, struct kfree_rcu_cpu, hrtimer);
+
+ queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
+ return HRTIMER_NORESTART;
+}
+
+static void
+run_page_cache_worker(struct kfree_rcu_cpu *krcp)
+{
+ // If cache disabled, bail out.
+ if (!rcu_min_cached_objs)
+ return;
+
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+ !atomic_xchg(&krcp->work_in_progress, 1)) {
+ if (atomic_read(&krcp->backoff_page_cache_fill)) {
+ queue_delayed_work(rcu_reclaim_wq,
+ &krcp->page_cache_work,
+ msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
+ } else {
+ hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
+ hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+ }
+ }
+}
+
+void __init kfree_rcu_scheduler_running(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ if (need_offload_krc(krcp))
+ schedule_delayed_monitor_work(krcp);
+ }
+}
+
+/*
+ * Queue a request for lazy invocation of the appropriate free routine
+ * after a grace period. Please note that three paths are maintained,
+ * two for the common case using arrays of pointers and a third one that
+ * is used only when the main paths cannot be used, for example, due to
+ * memory pressure.
+ *
+ * Each kvfree_call_rcu() request is added to a batch. The batch is drained
+ * every KFREE_DRAIN_JIFFIES jiffies, and all the objects in it are freed in
+ * workqueue context. Batching requests together reduces the number of grace
+ * periods needed during heavy kfree_rcu()/kvfree_rcu() load.
+ */
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+{
+ unsigned long flags;
+ struct kfree_rcu_cpu *krcp;
+ bool success;
+
+ /*
+ * Note the limitation of the head-less variant: it may only
+ * be used from a context where sleeping is allowed (hence the
+ * might_sleep() below). Everywhere else, please embed an
+ * rcu_head into your data instead.
+ */
+ if (!head)
+ might_sleep();
+
+ // Queue the object but don't yet schedule the batch.
+ if (debug_rcu_head_queue(ptr)) {
+ // Probable double kfree_rcu(), just leak.
+ WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
+ __func__, head);
+
+ // Mark as success and leave.
+ return;
+ }
+
+ kasan_record_aux_stack(ptr);
+ success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
+ if (!success) {
+ run_page_cache_worker(krcp);
+
+ if (head == NULL)
+ // Inline if kvfree_rcu(one_arg) call.
+ goto unlock_return;
+
+ head->func = ptr;
+ head->next = krcp->head;
+ WRITE_ONCE(krcp->head, head);
+ atomic_inc(&krcp->head_count);
+
+ // Take a snapshot for this krcp.
+ krcp->head_gp_snap = get_state_synchronize_rcu();
+ success = true;
+ }
+
+ /*
+ * The kvfree_rcu() caller considers the pointer freed at this point
+ * and likely removes any references to it. Since the actual slab
+ * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
+ * this object (no scanning or false positives reporting).
+ */
+ kmemleak_ignore(ptr);
+
+ // Set timer to drain after KFREE_DRAIN_JIFFIES.
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
+ __schedule_delayed_monitor_work(krcp);
+
+unlock_return:
+ krc_this_cpu_unlock(krcp, flags);
+
+ /*
+ * Inline kvfree() after synchronize_rcu(). We can do
+ * it from might_sleep() context only, so the current
+ * CPU can pass the QS state.
+ */
+ if (!success) {
+ debug_rcu_head_unqueue((struct rcu_head *) ptr);
+ synchronize_rcu();
+ kvfree(ptr);
+ }
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+
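Callers normally reach kvfree_call_rcu() through the kvfree_rcu()/kfree_rcu() wrapper macros rather than directly. A minimal caller-side sketch (hypothetical structure and function names):

        struct foo {
                int payload;
                struct rcu_head rcu;    /* required for the two-argument form */
        };

        static void drop_foo(struct foo *f)
        {
                /* Queued on the per-CPU batch above; freed after a grace period. */
                kvfree_rcu(f, rcu);
        }

        /*
         * The head-less form may only be used where sleeping is allowed,
         * since the fallback path can call synchronize_rcu():
         *
         *      kvfree_rcu_mightsleep(ptr);
         */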
+/**
+ * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
+ *
+ * Note that a single-argument kvfree_rcu() call has a slow path that
+ * triggers synchronize_rcu() followed by freeing the pointer, and this
+ * happens before the function returns. Therefore, for any single-argument
+ * call that will result in a kfree() to a cache that is to be destroyed
+ * during module exit, it is the developer's responsibility to ensure that
+ * all such calls have returned before the call to kmem_cache_destroy().
+ */
+void kvfree_rcu_barrier(void)
+{
+ struct kfree_rcu_cpu_work *krwp;
+ struct kfree_rcu_cpu *krcp;
+ bool queued;
+ int i, cpu;
+
+ /*
+ * First, detach objects on each CPU and queue them as an RCU
+ * batch. Afterwards the queued works are flushed for each CPU.
+ *
+ * Note that if there are outstanding batches for a particular
+ * CPU, those have to finish first before a new one can be queued.
+ */
+ for_each_possible_cpu(cpu) {
+ krcp = per_cpu_ptr(&krc, cpu);
+
+ /*
+ * Check if this CPU has any objects which have been queued for a
+ * new GP completion. If not (nothing to detach), we are done with
+ * it. If any batch is pending/running for this "krcp", the per-cpu
+ * flush_rcu_work() below waits for its completion (see the last step).
+ */
+ if (!need_offload_krc(krcp))
+ continue;
+
+ while (1) {
+ /*
+ * If we are not able to queue a new RCU work it means:
+ * - batches for this CPU are still in flight which should
+ * be flushed first and then repeat;
+ * - no objects to detach, because of concurrency.
+ */
+ queued = kvfree_rcu_queue_batch(krcp);
+
+ /*
+ * Bail out, if there is no need to offload this "krcp"
+ * anymore. As noted earlier it can run concurrently.
+ */
+ if (queued || !need_offload_krc(krcp))
+ break;
+
+ /* There are ongoing batches. */
+ for (i = 0; i < KFREE_N_BATCHES; i++) {
+ krwp = &(krcp->krw_arr[i]);
+ flush_rcu_work(&krwp->rcu_work);
+ }
+ }
+ }
+
+ /*
+ * Now we guarantee that all objects are flushed.
+ */
+ for_each_possible_cpu(cpu) {
+ krcp = per_cpu_ptr(&krc, cpu);
+
+ /*
+ * A monitor work can drain ready-to-reclaim objects directly.
+ * Wait for its completion if it is running or pending.
+ */
+ cancel_delayed_work_sync(&krcp->monitor_work);
+
+ for (i = 0; i < KFREE_N_BATCHES; i++) {
+ krwp = &(krcp->krw_arr[i]);
+ flush_rcu_work(&krwp->rcu_work);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
+
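A typical use is in module teardown, to make sure every queued kvfree_rcu(obj, field) against a cache has been processed before the cache is destroyed (a sketch with a hypothetical cache and exit handler; as the comment above notes, head-less calls must have returned on their own before this point):

        static void __exit my_module_exit(void)
        {
                /* Flush all in-flight kvfree_rcu() requests first. */
                kvfree_rcu_barrier();
                kmem_cache_destroy(my_cache);
        }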
+static unsigned long
+kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu;
+ unsigned long count = 0;
+
+ /* Snapshot count of all CPUs */
+ for_each_possible_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ count += krc_count(krcp);
+ count += READ_ONCE(krcp->nr_bkv_objs);
+ atomic_set(&krcp->backoff_page_cache_fill, 1);
+ }
+
+ return count == 0 ? SHRINK_EMPTY : count;
+}
+
+static unsigned long
+kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu, freed = 0;
+
+ for_each_possible_cpu(cpu) {
+ int count;
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ count = krc_count(krcp);
+ count += drain_page_cache(krcp);
+ kfree_rcu_monitor(&krcp->monitor_work.work);
+
+ sc->nr_to_scan -= count;
+ freed += count;
+
+ if (sc->nr_to_scan <= 0)
+ break;
+ }
+
+ return freed == 0 ? SHRINK_STOP : freed;
+}
+
+void __init kvfree_rcu_init(void)
+{
+ int cpu;
+ int i, j;
+ struct shrinker *kfree_rcu_shrinker;
+
+ rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
+ WARN_ON(!rcu_reclaim_wq);
+
+ /* Clamp it to [0:100] seconds interval. */
+ if (rcu_delay_page_cache_fill_msec < 0 ||
+ rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
+
+ rcu_delay_page_cache_fill_msec =
+ clamp(rcu_delay_page_cache_fill_msec, 0,
+ (int) (100 * MSEC_PER_SEC));
+
+ pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
+ rcu_delay_page_cache_fill_msec);
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ for (i = 0; i < KFREE_N_BATCHES; i++) {
+ INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
+ krcp->krw_arr[i].krcp = krcp;
+
+ for (j = 0; j < FREE_N_CHANNELS; j++)
+ INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
+ }
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ INIT_LIST_HEAD(&krcp->bulk_head[i]);
+
+ INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+ INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
+ krcp->initialized = true;
+ }
+
+ kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu");
+ if (!kfree_rcu_shrinker) {
+ pr_err("Failed to allocate kfree_rcu() shrinker!\n");
+ return;
+ }
+
+ kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
+ kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
+
+ shrinker_register(kfree_rcu_shrinker);
+}
+
+#endif /* CONFIG_KVFREE_RCU_BATCHED */
+
diff --git a/mm/slub.c b/mm/slub.c
index c2151c9fee22..c76ac34a1f51 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -19,9 +19,11 @@
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
+#include <linux/vmalloc.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
+#include <linux/node.h>
#include <linux/kmsan.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
@@ -90,14 +92,14 @@
* The partially empty slabs cached on the CPU partial list are used
* for performance reasons, which speeds up the allocation process.
* These slabs are not frozen, but are also exempt from list management,
- * by clearing the PG_workingset flag when moving out of the node
+ * by clearing the SL_partial flag when moving out of the node
* partial list. Please see __slab_free() for more details.
*
* To sum up, the current scheme is:
- * - node partial slab: PG_Workingset && !frozen
- * - cpu partial slab: !PG_Workingset && !frozen
- * - cpu slab: !PG_Workingset && frozen
- * - full slab: !PG_Workingset && !frozen
+ * - node partial slab: SL_partial && !frozen
+ * - cpu partial slab: !SL_partial && !frozen
+ * - cpu slab: !SL_partial && frozen
+ * - full slab: !SL_partial && !frozen
*
* list_lock
*
@@ -182,6 +184,22 @@
* the fast path and disables lockless freelists.
*/
+/**
+ * enum slab_flags - How the slab flags bits are used.
+ * @SL_locked: Is locked with slab_lock()
+ * @SL_partial: On the per-node partial list
+ * @SL_pfmemalloc: Was allocated from PF_MEMALLOC reserves
+ *
+ * The slab flags share space with the page flags but some bits have
+ * different interpretations. The high bits are used for information
+ * like zone/node/section.
+ */
+enum slab_flags {
+ SL_locked = PG_locked,
+ SL_partial = PG_workingset, /* Historical reasons for this bit */
+ SL_pfmemalloc = PG_active, /* Historical reasons for this bit */
+};
+
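Since struct slab now has its own flags word (see the mm/slab.h hunk earlier in this patch), these bits are handled with plain bitops rather than the page-flag accessors, for example (mirroring the pfmemalloc helpers further down):

        bool on_node_partial = test_bit(SL_partial, &slab->flags);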
/*
* We could simply use migrate_disable()/enable() but as long as it's a
* function call even on !PREEMPT_RT, use inline preempt_disable() there.
@@ -446,7 +464,7 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
/*
* Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
- * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
+ * Corresponds to node_state[N_MEMORY], but can temporarily
* differ during memory hotplug/hotremove operations.
* Protected by slab_mutex.
*/
@@ -634,16 +652,35 @@ static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
#endif /* CONFIG_SLUB_CPU_PARTIAL */
/*
+ * If network-based swap is enabled, slub must keep track of whether memory
+ * was allocated from pfmemalloc reserves.
+ */
+static inline bool slab_test_pfmemalloc(const struct slab *slab)
+{
+ return test_bit(SL_pfmemalloc, &slab->flags);
+}
+
+static inline void slab_set_pfmemalloc(struct slab *slab)
+{
+ set_bit(SL_pfmemalloc, &slab->flags);
+}
+
+static inline void __slab_clear_pfmemalloc(struct slab *slab)
+{
+ __clear_bit(SL_pfmemalloc, &slab->flags);
+}
+
+/*
* Per slab locking using the pagelock
*/
static __always_inline void slab_lock(struct slab *slab)
{
- bit_spin_lock(PG_locked, &slab->__page_flags);
+ bit_spin_lock(SL_locked, &slab->flags);
}
static __always_inline void slab_unlock(struct slab *slab)
{
- bit_spin_unlock(PG_locked, &slab->__page_flags);
+ bit_spin_unlock(SL_locked, &slab->flags);
}
static inline bool
@@ -925,19 +962,19 @@ static struct track *get_track(struct kmem_cache *s, void *object,
}
#ifdef CONFIG_STACKDEPOT
-static noinline depot_stack_handle_t set_track_prepare(void)
+static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
{
depot_stack_handle_t handle;
unsigned long entries[TRACK_ADDRS_COUNT];
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
- handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
+ handle = stack_depot_save(entries, nr_entries, gfp_flags);
return handle;
}
#else
-static inline depot_stack_handle_t set_track_prepare(void)
+static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
{
return 0;
}
@@ -959,9 +996,9 @@ static void set_track_update(struct kmem_cache *s, void *object,
}
static __always_inline void set_track(struct kmem_cache *s, void *object,
- enum track_item alloc, unsigned long addr)
+ enum track_item alloc, unsigned long addr, gfp_t gfp_flags)
{
- depot_stack_handle_t handle = set_track_prepare();
+ depot_stack_handle_t handle = set_track_prepare(gfp_flags);
set_track_update(s, object, alloc, addr, handle);
}
@@ -1009,7 +1046,7 @@ static void print_slab_info(const struct slab *slab)
{
pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
slab, slab->objects, slab->inuse, slab->freelist,
- &slab->__page_flags);
+ &slab->flags);
}
void skip_orig_size_check(struct kmem_cache *s, const void *object)
@@ -1017,22 +1054,31 @@ void skip_orig_size_check(struct kmem_cache *s, const void *object)
set_orig_size(s, (void *)object, s->object_size);
}
-static void slab_bug(struct kmem_cache *s, char *fmt, ...)
+static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp)
{
struct va_format vaf;
va_list args;
- va_start(args, fmt);
+ va_copy(args, argsp);
vaf.fmt = fmt;
vaf.va = &args;
pr_err("=============================================================================\n");
- pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
+ pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf);
pr_err("-----------------------------------------------------------------------------\n\n");
va_end(args);
}
+static void slab_bug(struct kmem_cache *s, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ __slab_bug(s, fmt, args);
+ va_end(args);
+}
+
__printf(2, 3)
-static void slab_fix(struct kmem_cache *s, char *fmt, ...)
+static void slab_fix(struct kmem_cache *s, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -1085,19 +1131,24 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
/* Beginning of the filler is the free pointer */
print_section(KERN_ERR, "Padding ", p + off,
size_from_object(s) - off);
-
- dump_stack();
}
static void object_err(struct kmem_cache *s, struct slab *slab,
- u8 *object, char *reason)
+ u8 *object, const char *reason)
{
if (slab_add_kunit_errors())
return;
- slab_bug(s, "%s", reason);
- print_trailer(s, slab, object);
+ slab_bug(s, reason);
+ if (!object || !check_valid_pointer(s, slab, object)) {
+ print_slab_info(slab);
+ pr_err("Invalid pointer 0x%p\n", object);
+ } else {
+ print_trailer(s, slab, object);
+ }
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ WARN_ON(1);
}
static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
@@ -1114,22 +1165,30 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
return false;
}
+static void __slab_err(struct slab *slab)
+{
+ if (slab_in_kunit_test())
+ return;
+
+ print_slab_info(slab);
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ WARN_ON(1);
+}
+
static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
const char *fmt, ...)
{
va_list args;
- char buf[100];
if (slab_add_kunit_errors())
return;
va_start(args, fmt);
- vsnprintf(buf, sizeof(buf), fmt, args);
+ __slab_bug(s, fmt, args);
va_end(args);
- slab_bug(s, "%s", buf);
- print_slab_info(slab);
- dump_stack();
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ __slab_err(slab);
}
static void init_object(struct kmem_cache *s, void *object, u8 val)
@@ -1166,7 +1225,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
s->inuse - poison_size);
}
-static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
+static void restore_bytes(struct kmem_cache *s, const char *message, u8 data,
void *from, void *to)
{
slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
@@ -1181,8 +1240,8 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
static pad_check_attributes int
check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
- u8 *object, char *what,
- u8 *start, unsigned int value, unsigned int bytes)
+ u8 *object, const char *what, u8 *start, unsigned int value,
+ unsigned int bytes, bool slab_obj_print)
{
u8 *fault;
u8 *end;
@@ -1201,10 +1260,11 @@ check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
if (slab_add_kunit_errors())
goto skip_bug_print;
- slab_bug(s, "%s overwritten", what);
- pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
- fault, end - 1, fault - addr,
- fault[0], value);
+ pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
+ what, fault, end - 1, fault - addr, fault[0], value);
+
+ if (slab_obj_print)
+ object_err(s, slab, object, "Object corrupt");
skip_bug_print:
restore_bytes(s, what, value, fault, end);
@@ -1268,7 +1328,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
return 1;
return check_bytes_and_report(s, slab, p, "Object padding",
- p + off, POISON_INUSE, size_from_object(s) - off);
+ p + off, POISON_INUSE, size_from_object(s) - off, true);
}
/* Check the pad bytes at the end of a slab page */
@@ -1301,9 +1361,10 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab)
while (end > fault && end[-1] == POISON_INUSE)
end--;
- slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu",
- fault, end - 1, fault - start);
+ slab_bug(s, "Padding overwritten. 0x%p-0x%p @offset=%tu",
+ fault, end - 1, fault - start);
print_section(KERN_ERR, "Padding ", pad, remainder);
+ __slab_err(slab);
restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
}
@@ -1318,11 +1379,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (s->flags & SLAB_RED_ZONE) {
if (!check_bytes_and_report(s, slab, object, "Left Redzone",
- object - s->red_left_pad, val, s->red_left_pad))
+ object - s->red_left_pad, val, s->red_left_pad, ret))
ret = 0;
if (!check_bytes_and_report(s, slab, object, "Right Redzone",
- endobject, val, s->inuse - s->object_size))
+ endobject, val, s->inuse - s->object_size, ret))
ret = 0;
if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
@@ -1331,7 +1392,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (s->object_size > orig_size &&
!check_bytes_and_report(s, slab, object,
"kmalloc Redzone", p + orig_size,
- val, s->object_size - orig_size)) {
+ val, s->object_size - orig_size, ret)) {
ret = 0;
}
}
@@ -1339,7 +1400,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
if (!check_bytes_and_report(s, slab, p, "Alignment padding",
endobject, POISON_INUSE,
- s->inuse - s->object_size))
+ s->inuse - s->object_size, ret))
ret = 0;
}
}
@@ -1355,11 +1416,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (kasan_meta_size < s->object_size - 1 &&
!check_bytes_and_report(s, slab, p, "Poison",
p + kasan_meta_size, POISON_FREE,
- s->object_size - kasan_meta_size - 1))
+ s->object_size - kasan_meta_size - 1, ret))
ret = 0;
if (kasan_meta_size < s->object_size &&
!check_bytes_and_report(s, slab, p, "End Poison",
- p + s->object_size - 1, POISON_END, 1))
+ p + s->object_size - 1, POISON_END, 1, ret))
ret = 0;
}
/*
@@ -1385,11 +1446,6 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
ret = 0;
}
- if (!ret && !slab_in_kunit_test()) {
- print_trailer(s, slab, object);
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
- }
-
return ret;
}
@@ -1427,7 +1483,7 @@ static int check_slab(struct kmem_cache *s, struct slab *slab)
* Determine if a certain object in a slab is on the freelist. Must hold the
* slab lock to guarantee that the chains are in a consistent state.
*/
-static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
+static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
{
int nr = 0;
void *fp;
@@ -1437,26 +1493,34 @@ static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
fp = slab->freelist;
while (fp && nr <= slab->objects) {
if (fp == search)
- return 1;
+ return true;
if (!check_valid_pointer(s, slab, fp)) {
if (object) {
object_err(s, slab, object,
"Freechain corrupt");
set_freepointer(s, object, NULL);
+ break;
} else {
slab_err(s, slab, "Freepointer corrupt");
slab->freelist = NULL;
slab->inuse = slab->objects;
slab_fix(s, "Freelist cleared");
- return 0;
+ return false;
}
- break;
}
object = fp;
fp = get_freepointer(s, object);
nr++;
}
+ if (nr > slab->objects) {
+ slab_err(s, slab, "Freelist cycle detected");
+ slab->freelist = NULL;
+ slab->inuse = slab->objects;
+ slab_fix(s, "Freelist cleared");
+ return false;
+ }
+
max_objects = order_objects(slab_order(slab), s->size);
if (max_objects > MAX_OBJS_PER_PAGE)
max_objects = MAX_OBJS_PER_PAGE;
@@ -1624,12 +1688,12 @@ static inline int free_consistency_checks(struct kmem_cache *s,
slab_err(s, slab, "Attempt to free object(0x%p) outside of slab",
object);
} else if (!slab->slab_cache) {
- pr_err("SLUB <none>: no slab for object 0x%p.\n",
- object);
- dump_stack();
- } else
+ slab_err(NULL, slab, "No slab cache for object 0x%p",
+ object);
+ } else {
object_err(s, slab, object,
- "page slab pointer corrupt.");
+ "page slab pointer corrupt.");
+ }
return 0;
}
return 1;
@@ -1862,9 +1926,9 @@ static inline bool free_debug_processing(struct kmem_cache *s,
static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
static inline int check_object(struct kmem_cache *s, struct slab *slab,
void *object, u8 val) { return 1; }
-static inline depot_stack_handle_t set_track_prepare(void) { return 0; }
+static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; }
static inline void set_track(struct kmem_cache *s, void *object,
- enum track_item alloc, unsigned long addr) {}
+ enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {}
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct slab *slab) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
@@ -1906,15 +1970,19 @@ static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
if (slab_exts) {
unsigned int offs = obj_to_index(obj_exts_slab->slab_cache,
obj_exts_slab, obj_exts);
- /* codetag should be NULL */
+
+ if (unlikely(is_codetag_empty(&slab_exts[offs].ref)))
+ return;
+
+ /* codetag should be NULL here */
WARN_ON(slab_exts[offs].ref.ct);
set_codetag_empty(&slab_exts[offs].ref);
}
}
-static inline void mark_failed_objexts_alloc(struct slab *slab)
+static inline bool mark_failed_objexts_alloc(struct slab *slab)
{
- slab->obj_exts = OBJEXTS_ALLOC_FAIL;
+ return cmpxchg(&slab->obj_exts, 0, OBJEXTS_ALLOC_FAIL) == 0;
}
static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
@@ -1936,7 +2004,7 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {}
-static inline void mark_failed_objexts_alloc(struct slab *slab) {}
+static inline bool mark_failed_objexts_alloc(struct slab *slab) { return false; }
static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
struct slabobj_ext *vec, unsigned int objects) {}
@@ -1950,6 +2018,11 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \
__GFP_ACCOUNT | __GFP_NOFAIL)
+static inline void init_slab_obj_exts(struct slab *slab)
+{
+ slab->obj_exts = 0;
+}
+
int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
@@ -1964,9 +2037,14 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
slab_nid(slab));
if (!vec) {
- /* Mark vectors which failed to allocate */
- if (new_slab)
- mark_failed_objexts_alloc(slab);
+ /*
+	 * Try to mark the slab as having failed to allocate its vector.
+	 * If the marking fails, a racing process may already have
+	 * completed the allocation.
+ */
+ if (!mark_failed_objexts_alloc(slab) &&
+ slab_obj_exts(slab))
+ return 0;
return -ENOMEM;
}
@@ -1975,6 +2053,7 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
#ifdef CONFIG_MEMCG
new_exts |= MEMCG_DATA_OBJEXTS;
#endif
+retry:
old_exts = READ_ONCE(slab->obj_exts);
handle_failed_objexts_alloc(old_exts, vec, objects);
if (new_slab) {
@@ -1984,8 +2063,7 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
* be simply assigned.
*/
slab->obj_exts = new_exts;
- } else if ((old_exts & ~OBJEXTS_FLAGS_MASK) ||
- cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) {
+ } else if (old_exts & ~OBJEXTS_FLAGS_MASK) {
/*
* If the slab is already in use, somebody can allocate and
* assign slabobj_exts in parallel. In this case the existing
@@ -1994,6 +2072,9 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
mark_objexts_empty(vec);
kfree(vec);
return 0;
+ } else if (cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) {
+ /* Retry if a racing thread changed slab->obj_exts from under us. */
+ goto retry;
}
kmemleak_not_leak(vec);
@@ -2005,8 +2086,15 @@ static inline void free_slab_obj_exts(struct slab *slab)
struct slabobj_ext *obj_exts;
obj_exts = slab_obj_exts(slab);
- if (!obj_exts)
+ if (!obj_exts) {
+ /*
+ * If obj_exts allocation failed, slab->obj_exts is set to
+ * OBJEXTS_ALLOC_FAIL. In this case, we end up here and should
+ * clear the flag.
+ */
+ slab->obj_exts = 0;
return;
+ }
/*
* obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its
@@ -2020,20 +2108,12 @@ static inline void free_slab_obj_exts(struct slab *slab)
slab->obj_exts = 0;
}
-static inline bool need_slab_obj_ext(void)
-{
- if (mem_alloc_profiling_enabled())
- return true;
+#else /* CONFIG_SLAB_OBJ_EXT */
- /*
- * CONFIG_MEMCG creates vector of obj_cgroup objects conditionally
- * inside memcg_slab_post_alloc_hook. No other users for now.
- */
- return false;
+static inline void init_slab_obj_exts(struct slab *slab)
+{
}
-#else /* CONFIG_SLAB_OBJ_EXT */
-
static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
@@ -2044,11 +2124,6 @@ static inline void free_slab_obj_exts(struct slab *slab)
{
}
-static inline bool need_slab_obj_ext(void)
-{
- return false;
-}
-
#endif /* CONFIG_SLAB_OBJ_EXT */
#ifdef CONFIG_MEM_ALLOC_PROFILING
@@ -2069,41 +2144,46 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
slab = virt_to_slab(p);
if (!slab_obj_exts(slab) &&
- WARN(alloc_slab_obj_exts(slab, s, flags, false),
- "%s, %s: Failed to create slab extension vector!\n",
- __func__, s->name))
+ alloc_slab_obj_exts(slab, s, flags, false)) {
+ pr_warn_once("%s, %s: Failed to create slab extension vector!\n",
+ __func__, s->name);
return NULL;
+ }
return slab_obj_exts(slab) + obj_to_index(s, slab, p);
}
-static inline void
-alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline void
+__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
{
- if (need_slab_obj_ext()) {
- struct slabobj_ext *obj_exts;
+ struct slabobj_ext *obj_exts;
- obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
- /*
- * Currently obj_exts is used only for allocation profiling.
- * If other users appear then mem_alloc_profiling_enabled()
- * check should be added before alloc_tag_add().
- */
- if (likely(obj_exts))
- alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
- }
+ obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
+ /*
+ * Currently obj_exts is used only for allocation profiling.
+ * If other users appear then mem_alloc_profiling_enabled()
+ * check should be added before alloc_tag_add().
+ */
+ if (likely(obj_exts))
+ alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
}
static inline void
-alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
- int objects)
+alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
+{
+ if (mem_alloc_profiling_enabled())
+ __alloc_tagging_slab_alloc_hook(s, object, flags);
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline void
+__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
+ int objects)
{
struct slabobj_ext *obj_exts;
int i;
- if (!mem_alloc_profiling_enabled())
- return;
-
/* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
return;
@@ -2119,6 +2199,14 @@ alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
}
}
+static inline void
+alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
+ int objects)
+{
+ if (mem_alloc_profiling_enabled())
+ __alloc_tagging_slab_free_hook(s, slab, p, objects);
+}
+
#else /* CONFIG_MEM_ALLOC_PROFILING */
static inline void
@@ -2311,7 +2399,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init,
* We have to do this manually because the rcu_head is
* not located inside the object.
*/
- kasan_record_aux_stack_noalloc(x);
+ kasan_record_aux_stack(x);
delayed_free->object = x;
call_rcu(&delayed_free->head, slab_free_after_rcu_debug);
@@ -2420,17 +2508,15 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node,
unsigned int order = oo_order(oo);
if (node == NUMA_NO_NODE)
- folio = (struct folio *)alloc_pages(flags, order);
+ folio = (struct folio *)alloc_frozen_pages(flags, order);
else
- folio = (struct folio *)__alloc_pages_node(node, flags, order);
+ folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL);
if (!folio)
return NULL;
slab = folio_slab(folio);
__folio_set_slab(folio);
- /* Make the flag visible before any changes to folio->mapping */
- smp_wmb();
if (folio_is_pfmemalloc(folio))
slab_set_pfmemalloc(slab);
@@ -2558,8 +2644,12 @@ static __always_inline void account_slab(struct slab *slab, int order,
static __always_inline void unaccount_slab(struct slab *slab, int order,
struct kmem_cache *s)
{
- if (memcg_kmem_online() || need_slab_obj_ext())
- free_slab_obj_exts(slab);
+ /*
+	 * The slab object extensions must now be freed regardless of whether
+	 * mem_alloc_profiling_enabled() is true, because profiling might have
+	 * been disabled after slab->obj_exts was allocated.
+ */
+ free_slab_obj_exts(slab);
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
-(PAGE_SIZE << order));
@@ -2603,6 +2693,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
slab->objects = oo_objects(oo);
slab->inuse = 0;
slab->frozen = 0;
+ init_slab_obj_exts(slab);
account_slab(slab, oo_order(oo), s, flags);
@@ -2651,12 +2742,10 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab)
__slab_clear_pfmemalloc(slab);
folio->mapping = NULL;
- /* Make the mapping reset visible before clearing the flag */
- smp_wmb();
__folio_clear_slab(folio);
mm_account_reclaimed_pages(pages);
unaccount_slab(slab, order, s);
- __free_pages(&folio->page, order);
+ free_frozen_pages(&folio->page, order);
}
static void rcu_free_slab(struct rcu_head *h)
@@ -2688,23 +2777,19 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab)
free_slab(s, slab);
}
-/*
- * SLUB reuses PG_workingset bit to keep track of whether it's on
- * the per-node partial list.
- */
static inline bool slab_test_node_partial(const struct slab *slab)
{
- return folio_test_workingset(slab_folio(slab));
+ return test_bit(SL_partial, &slab->flags);
}
static inline void slab_set_node_partial(struct slab *slab)
{
- set_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
+ set_bit(SL_partial, &slab->flags);
}
static inline void slab_clear_node_partial(struct slab *slab)
{
- clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
+ clear_bit(SL_partial, &slab->flags);
}
/*
@@ -3815,9 +3900,14 @@ new_objects:
* For debug caches here we had to go through
* alloc_single_from_partial() so just store the
* tracking info and return the object.
+ *
+ * Due to disabled preemption we need to disallow
+ * blocking. The flags are further adjusted by
+ * gfp_nested_mask() in stack_depot itself.
*/
if (s->flags & SLAB_STORE_USER)
- set_track(s, freelist, TRACK_ALLOC, addr);
+ set_track(s, freelist, TRACK_ALLOC, addr,
+ gfpflags & ~(__GFP_DIRECT_RECLAIM));
return freelist;
}
@@ -3849,7 +3939,8 @@ new_objects:
goto new_objects;
if (s->flags & SLAB_STORE_USER)
- set_track(s, freelist, TRACK_ALLOC, addr);
+ set_track(s, freelist, TRACK_ALLOC, addr,
+ gfpflags & ~(__GFP_DIRECT_RECLAIM));
return freelist;
}
@@ -4240,11 +4331,17 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
flags = kmalloc_fix_flags(flags);
flags |= __GFP_COMP;
- folio = (struct folio *)alloc_pages_node_noprof(node, flags, order);
+
+ if (node == NUMA_NO_NODE)
+ folio = (struct folio *)alloc_frozen_pages_noprof(flags, order);
+ else
+ folio = (struct folio *)__alloc_frozen_pages_noprof(flags, order, node, NULL);
+
if (folio) {
ptr = folio_address(folio);
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
PAGE_SIZE << order);
+ __folio_set_large_kmalloc(folio);
}
ptr = kasan_kmalloc_large(ptr, size, flags);
@@ -4354,8 +4451,12 @@ static noinline void free_to_partial_list(
unsigned long flags;
depot_stack_handle_t handle = 0;
+ /*
+	 * We cannot use GFP_NOWAIT, as there are callsites where waking up
+	 * kswapd could deadlock.
+ */
if (s->flags & SLAB_STORE_USER)
- handle = set_track_prepare();
+ handle = set_track_prepare(__GFP_NOWARN);
spin_lock_irqsave(&n->list_lock, flags);
@@ -4720,6 +4821,11 @@ static void free_large_kmalloc(struct folio *folio, void *object)
{
unsigned int order = folio_order(folio);
+ if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) {
+ dump_page(&folio->page, "Not a kmalloc allocation");
+ return;
+ }
+
if (WARN_ON_ONCE(order == 0))
pr_warn_once("object pointer: 0x%p\n", object);
@@ -4729,7 +4835,53 @@ static void free_large_kmalloc(struct folio *folio, void *object)
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
-(PAGE_SIZE << order));
- folio_put(folio);
+ __folio_clear_large_kmalloc(folio);
+ free_frozen_pages(&folio->page, order);
+}
+
+/*
+ * Given an rcu_head embedded within an object obtained from kvmalloc at an
+ * offset < 4k, free the object in question.
+ */
+void kvfree_rcu_cb(struct rcu_head *head)
+{
+ void *obj = head;
+ struct folio *folio;
+ struct slab *slab;
+ struct kmem_cache *s;
+ void *slab_addr;
+
+ if (is_vmalloc_addr(obj)) {
+ obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
+ vfree(obj);
+ return;
+ }
+
+ folio = virt_to_folio(obj);
+ if (!folio_test_slab(folio)) {
+ /*
+		 * The rcu_head offset is always less than the page size, so
+		 * there is no need to consider the folio order.
+ */
+ obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
+ free_large_kmalloc(folio, obj);
+ return;
+ }
+
+ slab = folio_slab(folio);
+ s = slab->slab_cache;
+ slab_addr = folio_address(folio);
+
+ if (is_kfence_address(obj)) {
+ obj = kfence_object_start(obj);
+ } else {
+ unsigned int idx = __obj_to_index(s, slab_addr, obj);
+
+ obj = slab_addr + s->size * idx;
+ obj = fixup_red_left(s, obj);
+ }
+
+ slab_free(s, slab, obj, _RET_IP_);
}
/**
@@ -4849,12 +5001,12 @@ alloc_new:
* When slub_debug_orig_size() is off, krealloc() only knows about the bucket
* size of an allocation (but not the exact size it was allocated with) and
* hence implements the following semantics for shrinking and growing buffers
- * with __GFP_ZERO.
+ * with __GFP_ZERO::
*
- * new bucket
- * 0 size size
- * |--------|----------------|
- * | keep | zero |
+ * new bucket
+ * 0 size size
+ * |--------|----------------|
+ * | keep | zero |
*
* Otherwise, the original allocation size 'orig_size' could be used to
* precisely clear the requested size, and the new size will also be stored
@@ -4882,6 +5034,170 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
}
EXPORT_SYMBOL(krealloc_noprof);
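/*
 * Worked example of the diagram above (sizes are illustrative, not from the
 * patch): without slub_debug_orig_size(), only the bucket size is known. An
 * object living in the 64-byte kmalloc bucket that is krealloc'ed with
 * __GFP_ZERO to a new size of 56 keeps bytes 0..55 and gets bytes 56..63
 * cleared; the original allocation size plays no role.
 */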
+static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
+{
+ /*
+ * We want to attempt a large physically contiguous block first because
+ * it is less likely to fragment multiple larger blocks and therefore
+	 * contributes less to long-term fragmentation than the vmalloc fallback.
+	 * However, make sure that larger requests are not too disruptive - i.e.
+	 * do not direct reclaim unless physically contiguous memory is preferred
+	 * (__GFP_RETRY_MAYFAIL mode). We still wake kswapd/kcompactd to start
+	 * working in the background.
+ */
+ if (size > PAGE_SIZE) {
+ flags |= __GFP_NOWARN;
+
+ if (!(flags & __GFP_RETRY_MAYFAIL))
+ flags &= ~__GFP_DIRECT_RECLAIM;
+
+ /* nofail semantic is implemented by the vmalloc fallback */
+ flags &= ~__GFP_NOFAIL;
+ }
+
+ return flags;
+}
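/*
 * Sketch of the effect above (flags and size are illustrative): a plain
 * GFP_KERNEL request larger than a page loses direct reclaim and gains
 * __GFP_NOWARN before the kmalloc attempt, leaving heavier reclaim work to
 * the vmalloc fallback.
 */
static gfp_t example_adjust(void)
{
	/* == (GFP_KERNEL | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM */
	return kmalloc_gfp_adjust(GFP_KERNEL, 4 * PAGE_SIZE);
}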
+
+/**
+ * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
+ * failure, fall back to non-contiguous (vmalloc) allocation.
+ * @size: size of the request.
+ * @b: which set of kmalloc buckets to allocate from.
+ * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
+ * @node: numa node to allocate from
+ *
+ * Uses kmalloc to get the memory but if the allocation fails then falls back
+ * to the vmalloc allocator. Use kvfree for freeing the memory.
+ *
+ * GFP_NOWAIT and GFP_ATOMIC are not supported, and neither is the __GFP_NORETRY modifier.
+ * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
+ * preferable to the vmalloc fallback, due to visible performance drawbacks.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of failure
+ */
+void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
+{
+ void *ret;
+
+ /*
+	 * It doesn't really make sense to fall back to vmalloc for sub-page
+	 * requests.
+ */
+ ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b),
+ kmalloc_gfp_adjust(flags, size),
+ node, _RET_IP_);
+ if (ret || size <= PAGE_SIZE)
+ return ret;
+
+ /* non-sleeping allocations are not supported by vmalloc */
+ if (!gfpflags_allow_blocking(flags))
+ return NULL;
+
+ /* Don't even allow crazy sizes */
+ if (unlikely(size > INT_MAX)) {
+ WARN_ON_ONCE(!(flags & __GFP_NOWARN));
+ return NULL;
+ }
+
+ /*
+ * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
+ * since the callers already cannot assume anything
+ * about the resulting pointer, and cannot play
+ * protection games.
+ */
+ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
+ flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ node, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(__kvmalloc_node_noprof);
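/*
 * Usage sketch (not from the patch; names are illustrative): callers normally
 * go through the kvmalloc()/kvmalloc_array() wrappers and must free with
 * kvfree(), since they cannot tell whether kmalloc or the vmalloc fallback
 * satisfied the request.
 */
static int example_alloc_table(size_t nr_entries)
{
	u64 *table = kvmalloc_array(nr_entries, sizeof(*table),
				    GFP_KERNEL | __GFP_ZERO);

	if (!table)
		return -ENOMEM;
	/* ... use table ... */
	kvfree(table);
	return 0;
}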
+
+/**
+ * kvfree() - Free memory.
+ * @addr: Pointer to allocated memory.
+ *
+ * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
+ * It is slightly more efficient to use kfree() or vfree() if you are certain
+ * that you know which one to use.
+ *
+ * Context: Either preemptible task context or not-NMI interrupt.
+ */
+void kvfree(const void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ vfree(addr);
+ else
+ kfree(addr);
+}
+EXPORT_SYMBOL(kvfree);
+
+/**
+ * kvfree_sensitive - Free a data object containing sensitive information.
+ * @addr: address of the data object to be freed.
+ * @len: length of the data object.
+ *
+ * Use the special memzero_explicit() function to clear the content of a
+ * kvmalloc'ed object containing sensitive data to make sure that the
+ * compiler won't optimize out the data clearing.
+ */
+void kvfree_sensitive(const void *addr, size_t len)
+{
+ if (likely(!ZERO_OR_NULL_PTR(addr))) {
+ memzero_explicit((void *)addr, len);
+ kvfree(addr);
+ }
+}
+EXPORT_SYMBOL(kvfree_sensitive);
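/*
 * Sketch (buffer and length are illustrative): scrubbing a temporary copy of
 * key material before freeing it.
 */
static void example_drop_key(u8 *keybuf, size_t keylen)
{
	/* memzero_explicit() the buffer, then kvfree() it */
	kvfree_sensitive(keybuf, keylen);
}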
+
+/**
+ * kvrealloc - reallocate memory; contents remain unchanged
+ * @p: object to reallocate memory for
+ * @size: the size to reallocate
+ * @flags: the flags for the page level allocator
+ *
+ * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
+ * and @p is not a %NULL pointer, the object pointed to is freed.
+ *
+ * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
+ * initial memory allocation, every subsequent call to this API for the same
+ * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
+ * __GFP_ZERO is not fully honored by this API.
+ *
+ * In any case, the contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.
+ *
+ * This function must not be called concurrently with itself or kvfree() for the
+ * same memory allocation.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
+ */
+void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
+{
+ void *n;
+
+ if (is_vmalloc_addr(p))
+ return vrealloc_noprof(p, size, flags);
+
+ n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size));
+ if (!n) {
+ /* We failed to krealloc(), fall back to kvmalloc(). */
+ n = kvmalloc_noprof(size, flags);
+ if (!n)
+ return NULL;
+
+ if (p) {
+ /* We already know that `p` is not a vmalloc address. */
+ kasan_disable_current();
+ memcpy(n, kasan_reset_tag(p), ksize(p));
+ kasan_enable_current();
+
+ kfree(p);
+ }
+ }
+
+ return n;
+}
+EXPORT_SYMBOL(kvrealloc_noprof);
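/*
 * Sketch of growing a kvmalloc'ed buffer (names assumed). As noted in the
 * kerneldoc above, __GFP_ZERO must be passed consistently across calls for
 * its semantics to hold.
 */
static void *example_grow(void *buf, size_t new_size)
{
	return kvrealloc(buf, new_size, GFP_KERNEL | __GFP_ZERO);
}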
+
struct detached_freelist {
struct slab *slab;
void *tail;
@@ -5574,14 +5890,14 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
return !!oo_objects(s->oo);
}
-static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
- const char *text)
+static void list_slab_objects(struct kmem_cache *s, struct slab *slab)
{
#ifdef CONFIG_SLUB_DEBUG
void *addr = slab_address(slab);
void *p;
- slab_err(s, slab, text, s->name);
+ if (!slab_add_kunit_errors())
+ slab_bug(s, "Objects remaining on __kmem_cache_shutdown()");
spin_lock(&object_map_lock);
__fill_map(object_map, s, slab);
@@ -5596,6 +5912,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
}
}
spin_unlock(&object_map_lock);
+
+ __slab_err(slab);
#endif
}
@@ -5616,8 +5934,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
remove_partial(n, slab);
list_add(&slab->slab_list, &discard);
} else {
- list_slab_objects(s, slab,
- "Objects remaining in %s on __kmem_cache_shutdown()");
+ list_slab_objects(s, slab);
}
}
spin_unlock_irq(&n->list_lock);
@@ -5903,7 +6220,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
return __kmem_cache_do_shrink(s);
}
-static int slab_mem_going_offline_callback(void *arg)
+static int slab_mem_going_offline_callback(void)
{
struct kmem_cache *s;
@@ -5917,46 +6234,13 @@ static int slab_mem_going_offline_callback(void *arg)
return 0;
}
-static void slab_mem_offline_callback(void *arg)
-{
- struct memory_notify *marg = arg;
- int offline_node;
-
- offline_node = marg->status_change_nid_normal;
-
- /*
- * If the node still has available memory. we need kmem_cache_node
- * for it yet.
- */
- if (offline_node < 0)
- return;
-
- mutex_lock(&slab_mutex);
- node_clear(offline_node, slab_nodes);
- /*
- * We no longer free kmem_cache_node structures here, as it would be
- * racy with all get_node() users, and infeasible to protect them with
- * slab_mutex.
- */
- mutex_unlock(&slab_mutex);
-}
-
-static int slab_mem_going_online_callback(void *arg)
+static int slab_mem_going_online_callback(int nid)
{
struct kmem_cache_node *n;
struct kmem_cache *s;
- struct memory_notify *marg = arg;
- int nid = marg->status_change_nid_normal;
int ret = 0;
/*
- * If the node's memory is already available, then kmem_cache_node is
- * already created. Nothing to do.
- */
- if (nid < 0)
- return 0;
-
- /*
* We are bringing a node online. No memory is available yet. We must
* allocate a kmem_cache_node structure in order to bring the node
* online.
@@ -5995,21 +6279,16 @@ out:
static int slab_memory_callback(struct notifier_block *self,
unsigned long action, void *arg)
{
+ struct node_notify *nn = arg;
+ int nid = nn->nid;
int ret = 0;
switch (action) {
- case MEM_GOING_ONLINE:
- ret = slab_mem_going_online_callback(arg);
- break;
- case MEM_GOING_OFFLINE:
- ret = slab_mem_going_offline_callback(arg);
+ case NODE_ADDING_FIRST_MEMORY:
+ ret = slab_mem_going_online_callback(nid);
break;
- case MEM_OFFLINE:
- case MEM_CANCEL_ONLINE:
- slab_mem_offline_callback(arg);
- break;
- case MEM_ONLINE:
- case MEM_CANCEL_OFFLINE:
+ case NODE_REMOVING_LAST_MEMORY:
+ ret = slab_mem_going_offline_callback();
break;
}
if (ret)
@@ -6067,9 +6346,8 @@ void __init kmem_cache_init(void)
if (debug_guardpage_minorder())
slub_max_order = 0;
- /* Print slub debugging pointers without hashing */
- if (__slub_debug_enabled())
- no_hash_pointers_enable(NULL);
+ /* Inform pointer hashing choice about slub debugging state. */
+ hash_pointers_finalize(__slub_debug_enabled());
kmem_cache_node = &boot_kmem_cache_node;
kmem_cache = &boot_kmem_cache;
@@ -6078,14 +6356,14 @@ void __init kmem_cache_init(void)
* Initialize the nodemask for which we will allocate per node
* structures. Here we don't need taking slab_mutex yet.
*/
- for_each_node_state(node, N_NORMAL_MEMORY)
+ for_each_node_state(node, N_MEMORY)
node_set(node, slab_nodes);
create_boot_cache(kmem_cache_node, "kmem_cache_node",
sizeof(struct kmem_cache_node),
SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
- hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+ hotplug_node_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
/* Able to allocate the per node structures */
slab_state = PARTIAL;
@@ -7472,10 +7750,7 @@ static int cmp_loc_by_count(const void *a, const void *b, const void *data)
struct location *loc1 = (struct location *)a;
struct location *loc2 = (struct location *)b;
- if (loc1->count > loc2->count)
- return -1;
- else
- return 1;
+ return cmp_int(loc2->count, loc1->count);
}
static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
@@ -7513,10 +7788,7 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep)
return -ENOMEM;
}
- if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
- alloc = TRACK_ALLOC;
- else
- alloc = TRACK_FREE;
+ alloc = debugfs_get_aux_num(filep);
if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
bitmap_free(obj_map);
@@ -7572,11 +7844,11 @@ static void debugfs_slab_add(struct kmem_cache *s)
slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
- debugfs_create_file("alloc_traces", 0400,
- slab_cache_dir, s, &slab_debugfs_fops);
+ debugfs_create_file_aux_num("alloc_traces", 0400, slab_cache_dir, s,
+ TRACK_ALLOC, &slab_debugfs_fops);
- debugfs_create_file("free_traces", 0400,
- slab_cache_dir, s, &slab_debugfs_fops);
+ debugfs_create_file_aux_num("free_traces", 0400, slab_cache_dir, s,
+ TRACK_FREE, &slab_debugfs_fops);
}
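/*
 * Note in code form: debugfs_create_file_aux_num() stashes a small integer
 * (here TRACK_ALLOC or TRACK_FREE) with the file, which the fops can read
 * back via debugfs_get_aux_num() instead of parsing the file name, as
 * slab_debug_trace_open() now does above.
 */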
void debugfs_slab_release(struct kmem_cache *s)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index cec67c5f37d8..dbd8daccade2 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -27,9 +27,20 @@
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
+#include <linux/pgalloc.h>
#include <asm/dma.h>
-#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+#include "hugetlb_vmemmap.h"
+
+/*
+ * Flags for vmemmap_populate_range and friends.
+ */
+/* Get a ref on the head page struct page, for ZONE_DEVICE compound pages */
+#define VMEMMAP_POPULATE_PAGEREF 0x0001
+
+#include "internal.h"
/*
* Allocate a block of memory to be used to back the virtual memory map
@@ -42,8 +53,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node,
unsigned long align,
unsigned long goal)
{
- return memblock_alloc_try_nid_raw(size, align, goal,
- MEMBLOCK_ALLOC_ACCESSIBLE, node);
+ return memmap_alloc(size, align, goal, node, false);
}
void * __meminit vmemmap_alloc_block(unsigned long size, int node)
@@ -143,17 +153,18 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
struct vmem_altmap *altmap,
- struct page *reuse)
+ unsigned long ptpfn, unsigned long flags)
{
pte_t *pte = pte_offset_kernel(pmd, addr);
if (pte_none(ptep_get(pte))) {
pte_t entry;
void *p;
- if (!reuse) {
+ if (ptpfn == (unsigned long)-1) {
p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
if (!p)
return NULL;
+ ptpfn = PHYS_PFN(__pa(p));
} else {
/*
* When a PTE/PMD entry is freed from the init_mm
@@ -164,10 +175,10 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
* and through vmemmap_populate_compound_pages() when
* slab is available.
*/
- get_page(reuse);
- p = page_to_virt(reuse);
+ if (flags & VMEMMAP_POPULATE_PAGEREF)
+ get_page(pfn_to_page(ptpfn));
}
- entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ entry = pfn_pte(ptpfn, PAGE_KERNEL);
set_pte_at(&init_mm, addr, pte, entry);
}
return pte;
@@ -218,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
if (!p)
return NULL;
pud_init(p);
- p4d_populate(&init_mm, p4d, p);
+ p4d_populate_kernel(addr, p4d, p);
}
return p4d;
}
@@ -230,14 +241,15 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
- pgd_populate(&init_mm, pgd, p);
+ pgd_populate_kernel(addr, pgd, p);
}
return pgd;
}
static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
struct vmem_altmap *altmap,
- struct page *reuse)
+ unsigned long ptpfn,
+ unsigned long flags)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -257,7 +269,7 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
pmd = vmemmap_pmd_populate(pud, addr, node);
if (!pmd)
return NULL;
- pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
+ pte = vmemmap_pte_populate(pmd, addr, node, altmap, ptpfn, flags);
if (!pte)
return NULL;
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
@@ -268,13 +280,15 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
static int __meminit vmemmap_populate_range(unsigned long start,
unsigned long end, int node,
struct vmem_altmap *altmap,
- struct page *reuse)
+ unsigned long ptpfn,
+ unsigned long flags)
{
unsigned long addr = start;
pte_t *pte;
for (; addr < end; addr += PAGE_SIZE) {
- pte = vmemmap_populate_address(addr, node, altmap, reuse);
+ pte = vmemmap_populate_address(addr, node, altmap,
+ ptpfn, flags);
if (!pte)
return -ENOMEM;
}
@@ -285,7 +299,107 @@ static int __meminit vmemmap_populate_range(unsigned long start,
int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
int node, struct vmem_altmap *altmap)
{
- return vmemmap_populate_range(start, end, node, altmap, NULL);
+ return vmemmap_populate_range(start, end, node, altmap, -1, 0);
+}
+
+/*
+ * Undo vmemmap_populate_hvo(), and replace it with a normal base page mapping.
+ * Used in memory init in case an HVO mapping needs to be undone.
+ *
+ * This can happen when it is discovered that a memblock-allocated
+ * hugetlb page spans multiple zones, which can only be verified
+ * after zones have been initialized.
+ *
+ * We know that:
+ * 1) The first @headsize / PAGE_SIZE vmemmap pages were individually
+ * allocated through memblock, and mapped.
+ *
+ * 2) The rest of the vmemmap pages are mirrors of the last head page.
+ */
+int __meminit vmemmap_undo_hvo(unsigned long addr, unsigned long end,
+ int node, unsigned long headsize)
+{
+ unsigned long maddr, pfn;
+ pte_t *pte;
+ int headpages;
+
+ /*
+ * Should only be called early in boot, so nothing will
+ * be accessing these page structures.
+ */
+ WARN_ON(!early_boot_irqs_disabled);
+
+ headpages = headsize >> PAGE_SHIFT;
+
+ /*
+ * Clear mirrored mappings for tail page structs.
+ */
+ for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) {
+ pte = virt_to_kpte(maddr);
+ pte_clear(&init_mm, maddr, pte);
+ }
+
+ /*
+ * Clear and free mappings for head page and first tail page
+ * structs.
+ */
+ for (maddr = addr; headpages-- > 0; maddr += PAGE_SIZE) {
+ pte = virt_to_kpte(maddr);
+ pfn = pte_pfn(ptep_get(pte));
+ pte_clear(&init_mm, maddr, pte);
+ memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE);
+ }
+
+ flush_tlb_kernel_range(addr, end);
+
+ return vmemmap_populate(addr, end, node, NULL);
+}
+
+/*
+ * Write protect the mirrored tail page structs for HVO. This will be
+ * called from the hugetlb code when gathering and initializing the
+ * memblock allocated gigantic pages. The write protect can't be
+ * done earlier, since it can't be guaranteed that the reserved
+ * page structures will not be written to during initialization,
+ * even if CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled.
+ *
+ * The PTEs are known to exist, and nothing else should be touching
+ * these pages. The caller is responsible for any TLB flushing.
+ */
+void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end,
+ int node, unsigned long headsize)
+{
+ unsigned long maddr;
+ pte_t *pte;
+
+ for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) {
+ pte = virt_to_kpte(maddr);
+ ptep_set_wrprotect(&init_mm, maddr, pte);
+ }
+}
+
+/*
+ * Populate vmemmap pages HVO-style. The first page contains the head
+ * page and the needed tail pages; the other ones are mirrors of the
+ * first page.
+ */
+int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
+ int node, unsigned long headsize)
+{
+ pte_t *pte;
+ unsigned long maddr;
+
+ for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) {
+ pte = vmemmap_populate_address(maddr, node, NULL, -1, 0);
+ if (!pte)
+ return -ENOMEM;
+ }
+
+ /*
+	 * Reuse the last vmemmap page mapped above for the rest.
+ */
+ return vmemmap_populate_range(maddr, end, node, NULL,
+ pte_pfn(ptep_get(pte)), 0);
}
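/*
 * Worked example (assuming x86-64: 4 KiB pages, 64-byte struct page): a 2 MiB
 * hugetlb page has 512 struct pages = 32 KiB = 8 vmemmap pages. With
 * headsize == PAGE_SIZE, vmemmap_populate_hvo() backs the first vmemmap page
 * with real memory and maps the remaining 7 as mirrors of it; a later
 * vmemmap_wrprotect_hvo() over the same range makes those mirrors read-only.
 */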
void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
@@ -408,7 +522,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
* with just tail struct pages.
*/
return vmemmap_populate_range(start, end, node, NULL,
- pte_page(ptep_get(pte)));
+ pte_pfn(ptep_get(pte)),
+ VMEMMAP_POPULATE_PAGEREF);
}
size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
@@ -416,13 +531,13 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
unsigned long next, last = addr + size;
/* Populate the head page vmemmap page */
- pte = vmemmap_populate_address(addr, node, NULL, NULL);
+ pte = vmemmap_populate_address(addr, node, NULL, -1, 0);
if (!pte)
return -ENOMEM;
/* Populate the tail pages vmemmap page */
next = addr + PAGE_SIZE;
- pte = vmemmap_populate_address(next, node, NULL, NULL);
+ pte = vmemmap_populate_address(next, node, NULL, -1, 0);
if (!pte)
return -ENOMEM;
@@ -432,7 +547,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
*/
next += PAGE_SIZE;
rc = vmemmap_populate_range(next, last, node, NULL,
- pte_page(ptep_get(pte)));
+ pte_pfn(ptep_get(pte)),
+ VMEMMAP_POPULATE_PAGEREF);
if (rc)
return -ENOMEM;
}
@@ -462,10 +578,30 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
if (r < 0)
return NULL;
- if (system_state == SYSTEM_BOOTING)
- memmap_boot_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE));
- else
- memmap_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE));
-
return pfn_to_page(pfn);
}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
+/*
+ * This is called just before initializing sections for a NUMA node.
+ * Any special initialization that needs to be done before the
+ * generic initialization can be done from here. Sections that
+ * are initialized in hooks called from here will be skipped by
+ * the generic initialization.
+ */
+void __init sparse_vmemmap_init_nid_early(int nid)
+{
+ hugetlb_vmemmap_init_early(nid);
+}
+
+/*
+ * This is called just before the initialization of page structures
+ * through memmap_init. Zones are now initialized, so any work that
+ * needs to be done that needs zone information can be done from
+ * here.
+ */
+void __init sparse_vmemmap_init_nid_late(int nid)
+{
+ hugetlb_vmemmap_init_late(nid);
+}
+#endif
diff --git a/mm/sparse.c b/mm/sparse.c
index 13b6624d3562..e6075b622407 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -170,11 +170,6 @@ static void __section_mark_present(struct mem_section *ms,
ms->section_mem_map |= SECTION_MARKED_PRESENT;
}
-#define for_each_present_section_nr(start, section_nr) \
- for (section_nr = next_present_section_nr(start-1); \
- section_nr != -1; \
- section_nr = next_present_section_nr(section_nr))
-
static inline unsigned long first_present_section_nr(void)
{
return next_present_section_nr(-1);
@@ -257,10 +252,7 @@ static void __init memblocks_present(void)
size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
align = 1 << (INTERNODE_CACHE_SHIFT);
- mem_section = memblock_alloc(size, align);
- if (!mem_section)
- panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
- __func__, size, align);
+ mem_section = memblock_alloc_or_panic(size, align);
}
#endif
@@ -411,13 +403,13 @@ static void __init check_usemap_section_nr(int nid,
#endif /* CONFIG_MEMORY_HOTREMOVE */
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static unsigned long __init section_map_size(void)
+unsigned long __init section_map_size(void)
{
return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
}
#else
-static unsigned long __init section_map_size(void)
+unsigned long __init section_map_size(void)
{
return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}
@@ -462,9 +454,6 @@ static void __init sparse_buffer_init(unsigned long size, int nid)
*/
sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
sparsemap_buf_end = sparsemap_buf + size;
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
- memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE));
-#endif
}
static void __init sparse_buffer_fini(void)
@@ -498,6 +487,44 @@ void __weak __meminit vmemmap_populate_print_last(void)
{
}
+static void *sparse_usagebuf __meminitdata;
+static void *sparse_usagebuf_end __meminitdata;
+
+/*
+ * Helper function that is used for generic section initialization, and
+ * can also be used by any hooks added above.
+ */
+void __init sparse_init_early_section(int nid, struct page *map,
+ unsigned long pnum, unsigned long flags)
+{
+ BUG_ON(!sparse_usagebuf || sparse_usagebuf >= sparse_usagebuf_end);
+ check_usemap_section_nr(nid, sparse_usagebuf);
+ sparse_init_one_section(__nr_to_section(pnum), pnum, map,
+ sparse_usagebuf, SECTION_IS_EARLY | flags);
+ sparse_usagebuf = (void *)sparse_usagebuf + mem_section_usage_size();
+}
+
+static int __init sparse_usage_init(int nid, unsigned long map_count)
+{
+ unsigned long size;
+
+ size = mem_section_usage_size() * map_count;
+ sparse_usagebuf = sparse_early_usemaps_alloc_pgdat_section(
+ NODE_DATA(nid), size);
+ if (!sparse_usagebuf) {
+ sparse_usagebuf_end = NULL;
+ return -ENOMEM;
+ }
+
+ sparse_usagebuf_end = sparse_usagebuf + size;
+ return 0;
+}
+
+static void __init sparse_usage_fini(void)
+{
+ sparse_usagebuf = sparse_usagebuf_end = NULL;
+}
+
/*
* Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
* And number of present sections in this node is map_count.
@@ -506,47 +533,56 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
unsigned long pnum_end,
unsigned long map_count)
{
- struct mem_section_usage *usage;
unsigned long pnum;
struct page *map;
+ struct mem_section *ms;
- usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
- mem_section_usage_size() * map_count);
- if (!usage) {
+ if (sparse_usage_init(nid, map_count)) {
pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
goto failed;
}
+
sparse_buffer_init(map_count * section_map_size(), nid);
+
+ sparse_vmemmap_init_nid_early(nid);
+
for_each_present_section_nr(pnum_begin, pnum) {
unsigned long pfn = section_nr_to_pfn(pnum);
if (pnum >= pnum_end)
break;
- map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
- nid, NULL, NULL);
- if (!map) {
- pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
- __func__, nid);
- pnum_begin = pnum;
- sparse_buffer_fini();
- goto failed;
+ ms = __nr_to_section(pnum);
+ if (!preinited_vmemmap_section(ms)) {
+ map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
+ nid, NULL, NULL);
+ if (!map) {
+ pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
+ __func__, nid);
+ pnum_begin = pnum;
+ sparse_usage_fini();
+ sparse_buffer_fini();
+ goto failed;
+ }
+ memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page),
+ PAGE_SIZE));
+ sparse_init_early_section(nid, map, pnum, 0);
}
- check_usemap_section_nr(nid, usage);
- sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
- SECTION_IS_EARLY);
- usage = (void *) usage + mem_section_usage_size();
}
+ sparse_usage_fini();
sparse_buffer_fini();
return;
failed:
- /* We failed to allocate, mark all the following pnums as not present */
+ /*
+	 * We failed to allocate; mark all the following pnums as not present,
+	 * except the ones already initialized earlier.
+ */
for_each_present_section_nr(pnum_begin, pnum) {
- struct mem_section *ms;
-
if (pnum >= pnum_end)
break;
ms = __nr_to_section(pnum);
+ if (!preinited_vmemmap_section(ms))
+ ms->section_mem_map = 0;
ms->section_mem_map = 0;
}
}
@@ -643,7 +679,6 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
unsigned long start = (unsigned long) pfn_to_page(pfn);
unsigned long end = start + nr_pages * sizeof(struct page);
- memmap_pages_add(-1L * (DIV_ROUND_UP(end - start, PAGE_SIZE)));
vmemmap_free(start, end, altmap);
}
static void free_map_bootmem(struct page *memmap)
@@ -819,10 +854,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
* The memmap of early sections is always fully populated. See
* section_activate() and pfn_valid() .
*/
- if (!section_is_early)
+ if (!section_is_early) {
+ memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)));
depopulate_section_memmap(pfn, nr_pages, altmap);
- else if (memmap)
+ } else if (memmap) {
+ memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page),
+ PAGE_SIZE)));
free_map_bootmem(memmap);
+ }
if (empty)
ms->section_mem_map = (unsigned long)NULL;
@@ -867,6 +906,7 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
section_deactivate(pfn, nr_pages, altmap);
return ERR_PTR(-ENOMEM);
}
+ memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE));
return memmap;
}
diff --git a/mm/swap.c b/mm/swap.c
index 10decd9dffa1..b74ebe865dd9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -45,7 +45,7 @@
/* How many pages do we try to swap or page in/out together? As a power of 2 */
int page_cluster;
-const int page_cluster_max = 31;
+static const int page_cluster_max = 31;
struct cpu_fbatches {
/*
@@ -109,7 +109,7 @@ void __folio_put(struct folio *folio)
page_cache_release(folio);
folio_unqueue_deferred_split(folio);
mem_cgroup_uncharge(folio);
- free_unref_page(&folio->page, folio_order(folio));
+ free_frozen_pages(&folio->page, folio_order(folio));
}
EXPORT_SYMBOL(__folio_put);
@@ -164,6 +164,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
+ /* block memcg migration while the folio moves between lru */
+ if (move_fn != lru_add && !folio_test_clear_lru(folio))
+ continue;
+
folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
move_fn(lruvec, folio);
@@ -176,14 +180,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
}
static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
- struct folio *folio, move_fn_t move_fn,
- bool on_lru, bool disable_irq)
+ struct folio *folio, move_fn_t move_fn, bool disable_irq)
{
unsigned long flags;
- if (on_lru && !folio_test_clear_lru(folio))
- return;
-
folio_get(folio);
if (disable_irq)
@@ -191,8 +191,8 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
else
local_lock(&cpu_fbatches.lock);
- if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) ||
- lru_cache_disabled())
+ if (!folio_batch_add(this_cpu_ptr(fbatch), folio) ||
+ !folio_may_be_lru_cached(folio) || lru_cache_disabled())
folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn);
if (disable_irq)
@@ -201,13 +201,13 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
local_unlock(&cpu_fbatches.lock);
}
-#define folio_batch_add_and_move(folio, op, on_lru) \
- __folio_batch_add_and_move( \
- &cpu_fbatches.op, \
- folio, \
- op, \
- on_lru, \
- offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq) \
+#define folio_batch_add_and_move(folio, op) \
+ __folio_batch_add_and_move( \
+ &cpu_fbatches.op, \
+ folio, \
+ op, \
+ offsetof(struct cpu_fbatches, op) >= \
+ offsetof(struct cpu_fbatches, lock_irq) \
)
static void lru_move_tail(struct lruvec *lruvec, struct folio *folio)
@@ -231,14 +231,15 @@ static void lru_move_tail(struct lruvec *lruvec, struct folio *folio)
void folio_rotate_reclaimable(struct folio *folio)
{
if (folio_test_locked(folio) || folio_test_dirty(folio) ||
- folio_test_unevictable(folio))
+ folio_test_unevictable(folio) || !folio_test_lru(folio))
return;
- folio_batch_add_and_move(folio, lru_move_tail, true);
+ folio_batch_add_and_move(folio, lru_move_tail);
}
-void lru_note_cost(struct lruvec *lruvec, bool file,
- unsigned int nr_io, unsigned int nr_rotated)
+void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
+ unsigned int nr_io, unsigned int nr_rotated)
+ __releases(lruvec->lru_lock)
{
unsigned long cost;
@@ -250,18 +251,14 @@ void lru_note_cost(struct lruvec *lruvec, bool file,
* different between them, adjust scan balance for CPU work.
*/
cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;
+ if (!cost) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ return;
+ }
- do {
+ for (;;) {
unsigned long lrusize;
- /*
- * Hold lruvec->lru_lock is safe here, since
- * 1) The pinned lruvec in reclaim, or
- * 2) From a pre-LRU page during refault (which also holds the
- * rcu lock, so would be safe even if the page was on the LRU
- * and could move simultaneously to a new lruvec).
- */
- spin_lock_irq(&lruvec->lru_lock);
/* Record cost event */
if (file)
lruvec->file_cost += cost;
@@ -285,14 +282,22 @@ void lru_note_cost(struct lruvec *lruvec, bool file,
lruvec->file_cost /= 2;
lruvec->anon_cost /= 2;
}
+
spin_unlock_irq(&lruvec->lru_lock);
- } while ((lruvec = parent_lruvec(lruvec)));
+ lruvec = parent_lruvec(lruvec);
+ if (!lruvec)
+ break;
+ spin_lock_irq(&lruvec->lru_lock);
+ }
}
void lru_note_cost_refault(struct folio *folio)
{
- lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
- folio_nr_pages(folio), 0);
+ struct lruvec *lruvec;
+
+ lruvec = folio_lruvec_lock_irq(folio);
+ lru_note_cost_unlock_irq(lruvec, folio_is_file_lru(folio),
+ folio_nr_pages(folio), 0);
}
static void lru_activate(struct lruvec *lruvec, struct folio *folio)
@@ -309,7 +314,7 @@ static void lru_activate(struct lruvec *lruvec, struct folio *folio)
trace_mm_lru_activate(folio);
__count_vm_events(PGACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages);
+ count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages);
}
#ifdef CONFIG_SMP
@@ -323,10 +328,11 @@ static void folio_activate_drain(int cpu)
void folio_activate(struct folio *folio)
{
- if (folio_test_active(folio) || folio_test_unevictable(folio))
+ if (folio_test_active(folio) || folio_test_unevictable(folio) ||
+ !folio_test_lru(folio))
return;
- folio_batch_add_and_move(folio, lru_activate, true);
+ folio_batch_add_and_move(folio, lru_activate);
}
#else
@@ -379,37 +385,58 @@ static void __lru_cache_activate_folio(struct folio *folio)
}
#ifdef CONFIG_LRU_GEN
-static void folio_inc_refs(struct folio *folio)
+
+static void lru_gen_inc_refs(struct folio *folio)
{
unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
if (folio_test_unevictable(folio))
return;
+ /* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio)) {
- folio_set_referenced(folio);
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
return;
}
- if (!folio_test_workingset(folio)) {
- folio_set_workingset(folio);
- return;
- }
-
- /* see the comment on MAX_NR_TIERS */
do {
- new_flags = old_flags & LRU_REFS_MASK;
- if (new_flags == LRU_REFS_MASK)
- break;
+ if ((old_flags & LRU_REFS_MASK) == LRU_REFS_MASK) {
+ if (!folio_test_workingset(folio))
+ folio_set_workingset(folio);
+ return;
+ }
- new_flags += BIT(LRU_REFS_PGOFF);
- new_flags |= old_flags & ~LRU_REFS_MASK;
+ new_flags = old_flags + BIT(LRU_REFS_PGOFF);
} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
}
-#else
-static void folio_inc_refs(struct folio *folio)
+
+static bool lru_gen_clear_refs(struct folio *folio)
{
+ struct lru_gen_folio *lrugen;
+ int gen = folio_lru_gen(folio);
+ int type = folio_is_file_lru(folio);
+
+ if (gen < 0)
+ return true;
+
+ set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0);
+
+ lrugen = &folio_lruvec(folio)->lrugen;
+	/* whether this can be done without shuffling under the LRU lock */
+ return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type]));
}
+
+#else /* !CONFIG_LRU_GEN */
+
+static void lru_gen_inc_refs(struct folio *folio)
+{
+}
+
+static bool lru_gen_clear_refs(struct folio *folio)
+{
+ return false;
+}
+
#endif /* CONFIG_LRU_GEN */
/**
@@ -427,8 +454,10 @@ static void folio_inc_refs(struct folio *folio)
*/
void folio_mark_accessed(struct folio *folio)
{
+ if (folio_test_dropbehind(folio))
+ return;
if (lru_gen_enabled()) {
- folio_inc_refs(folio);
+ lru_gen_inc_refs(folio);
return;
}
@@ -474,12 +503,12 @@ void folio_add_lru(struct folio *folio)
folio_test_unevictable(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- /* see the comment in lru_gen_add_folio() */
+ /* see the comment in lru_gen_folio_seq() */
if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
folio_set_active(folio);
- folio_batch_add_and_move(folio, lru_add, false);
+ folio_batch_add_and_move(folio, lru_add);
}
EXPORT_SYMBOL(folio_add_lru);
@@ -524,7 +553,7 @@ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma)
*/
static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio)
{
- bool active = folio_test_active(folio);
+ bool active = folio_test_active(folio) || lru_gen_enabled();
long nr_pages = folio_nr_pages(folio);
if (folio_test_unevictable(folio))
@@ -558,7 +587,7 @@ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio)
if (active) {
__count_vm_events(PGDEACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
nr_pages);
}
}
@@ -576,7 +605,7 @@ static void lru_deactivate(struct lruvec *lruvec, struct folio *folio)
lruvec_add_folio(lruvec, folio);
__count_vm_events(PGDEACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
}
static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
@@ -589,7 +618,10 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
lruvec_del_folio(lruvec, folio);
folio_clear_active(folio);
- folio_clear_referenced(folio);
+ if (lru_gen_enabled())
+ lru_gen_clear_refs(folio);
+ else
+ folio_clear_referenced(folio);
/*
* Lazyfree folios are clean anonymous folios. They have
* the swapbacked flag cleared, to distinguish them from normal
@@ -599,7 +631,7 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
lruvec_add_folio(lruvec, folio);
__count_vm_events(PGLAZYFREE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages);
+ count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages);
}
/*
@@ -654,10 +686,13 @@ void lru_add_drain_cpu(int cpu)
void deactivate_file_folio(struct folio *folio)
{
/* Deactivating an unevictable folio will not accelerate reclaim */
- if (folio_test_unevictable(folio))
+ if (folio_test_unevictable(folio) || !folio_test_lru(folio))
return;
- folio_batch_add_and_move(folio, lru_deactivate_file, true);
+ if (lru_gen_enabled() && lru_gen_clear_refs(folio))
+ return;
+
+ folio_batch_add_and_move(folio, lru_deactivate_file);
}
/*
@@ -670,10 +705,13 @@ void deactivate_file_folio(struct folio *folio)
*/
void folio_deactivate(struct folio *folio)
{
- if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled()))
+ if (folio_test_unevictable(folio) || !folio_test_lru(folio))
return;
- folio_batch_add_and_move(folio, lru_deactivate, true);
+ if (lru_gen_enabled() ? lru_gen_clear_refs(folio) : !folio_test_active(folio))
+ return;
+
+ folio_batch_add_and_move(folio, lru_deactivate);
}
/**
@@ -686,10 +724,11 @@ void folio_deactivate(struct folio *folio)
void folio_mark_lazyfree(struct folio *folio)
{
if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) ||
+ !folio_test_lru(folio) ||
folio_test_swapcache(folio) || folio_test_unevictable(folio))
return;
- folio_batch_add_and_move(folio, lru_lazyfree, true);
+ folio_batch_add_and_move(folio, lru_lazyfree);
}
void lru_add_drain(void)
@@ -924,8 +963,6 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
- if (put_devmap_managed_folio_refs(folio, nr_refs))
- continue;
if (folio_ref_sub_and_test(folio, nr_refs))
free_zone_device_folio(folio);
continue;
@@ -1044,6 +1081,18 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
fbatch->nr = j;
}
+static const struct ctl_table swap_sysctl_table[] = {
+ {
+ .procname = "page-cluster",
+ .data = &page_cluster,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&page_cluster_max,
+ }
+};
+
/*
* Perform any setup for the swap system
*/
@@ -1060,4 +1109,6 @@ void __init swap_setup(void)
* Right now other parts of the system means that we
* _really_ don't want to cluster much more
*/
+
+ register_sysctl_init("vm", swap_sysctl_table);
}
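The mm/swap.c changes above finish by moving the vm.page-cluster knob into swap.c itself and registering it from swap_setup(). As a rough sketch of that registration pattern only (not part of the patch; the example_* names are hypothetical), a subsystem-local sysctl registered at init time looks like this:

#include <linux/init.h>
#include <linux/sysctl.h>

/* Hypothetical tunable, standing in for page_cluster. */
static int example_knob;

static const struct ctl_table example_sysctl_table[] = {
	{
		.procname	= "example-knob",
		.data		= &example_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_INT_MAX,
	}
};

static int __init example_sysctl_init(void)
{
	/* Creates /proc/sys/vm/example-knob, like register_sysctl_init("vm", ...) above. */
	register_sysctl_init("vm", example_sysctl_table);
	return 0;
}
subsys_initcall(example_sysctl_init);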
diff --git a/mm/swap.h b/mm/swap.h
index ad2f121de970..911ad5ff0f89 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -3,6 +3,9 @@
#define _MM_SWAP_H
struct mempolicy;
+struct swap_iocb;
+
+extern int page_cluster;
#ifdef CONFIG_SWAP
#include <linux/swapops.h> /* for swp_offset */
@@ -19,8 +22,8 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
__swap_read_unplug(plug);
}
void swap_write_unplug(struct swap_iocb *sio);
-int swap_writepage(struct page *page, struct writeback_control *wbc);
-void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
+int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
+void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
/* linux/mm/swap_state.c */
/* One swap address space for each 64M swap space */
@@ -50,7 +53,6 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry)
}
void show_swap_cache_info(void);
-bool add_to_swap(struct folio *folio);
void *get_shadow_from_swap_cache(swp_entry_t entry);
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
gfp_t gfp, void **shadowp);
@@ -106,6 +108,25 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
return find_next_bit(sis->zeromap, end, start) - start;
}
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+ struct swap_info_struct *si = swp_swap_info(entry);
+ pgoff_t offset = swp_offset(entry);
+ int i;
+
+ /*
+ * While allocating a large folio and doing mTHP swapin, we need to
+ * ensure none of the entries is cached; otherwise the mTHP folio
+ * would conflict with the folio already in the swap cache.
+ */
+ for (i = 0; i < max_nr; i++) {
+ if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
+ return i;
+ }
+
+ return i;
+}
+
#else /* CONFIG_SWAP */
struct swap_iocb;
static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
@@ -141,7 +162,8 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
return NULL;
}
-static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
+static inline int swap_writeout(struct folio *folio,
+ struct swap_iocb **swap_plug)
{
return 0;
}
@@ -163,11 +185,6 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
return filemap_get_folio(mapping, index);
}
-static inline bool add_to_swap(struct folio *folio)
-{
- return false;
-}
-
static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
{
return NULL;
@@ -204,6 +221,28 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
return 0;
}
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+ return 0;
+}
#endif /* CONFIG_SWAP */
+/**
+ * folio_index - File index of a folio.
+ * @folio: The folio.
+ *
+ * For a folio which is either in the page cache or the swap cache,
+ * return its index within the address_space it belongs to. If you know
+ * the folio is definitely in the page cache, you can look at the folio's
+ * index directly.
+ *
+ * Return: The index (offset in units of pages) of a folio in its file.
+ */
+static inline pgoff_t folio_index(struct folio *folio)
+{
+ if (unlikely(folio_test_swapcache(folio)))
+ return swap_cache_index(folio->swap);
+ return folio->index;
+}
+
#endif /* _MM_SWAP_H */
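folio_index(), which the mm/swap.h hunk above turns into a swap-internal helper, hides whether a folio sits in a file mapping or in the swap cache. As a tiny illustrative sketch (example_folio_offset() is hypothetical, not part of the patch):

/*
 * Sketch: map a page of a (possibly large) folio back to its index in
 * the owning address_space, file-backed or swap-cache alike.
 */
static pgoff_t example_folio_offset(struct folio *folio, struct page *page)
{
	/* folio_index() picks folio->index or swap_cache_index(folio->swap). */
	return folio_index(folio) + folio_page_idx(folio, page);
}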
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index da1278f0563b..de779fed8c21 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -6,149 +6,106 @@
#include <linux/swapops.h> /* depends on mm.h include */
static DEFINE_MUTEX(swap_cgroup_mutex);
-struct swap_cgroup_ctrl {
- struct page **map;
- unsigned long length;
- spinlock_t lock;
-};
-
-static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
+/* Pack the cgroup ids (unsigned short) of two entries into one swap_cgroup (atomic_t) */
+#define ID_PER_SC (sizeof(struct swap_cgroup) / sizeof(unsigned short))
+#define ID_SHIFT (BITS_PER_TYPE(unsigned short))
+#define ID_MASK (BIT(ID_SHIFT) - 1)
struct swap_cgroup {
- unsigned short id;
+ atomic_t ids;
};
-#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
-
-/*
- * SwapCgroup implements "lookup" and "exchange" operations.
- * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
- * against SwapCache. At swap_free(), this is accessed directly from swap.
- *
- * This means,
- * - we have no race in "exchange" when we're accessed via SwapCache because
- * SwapCache(and its swp_entry) is under lock.
- * - When called via swap_free(), there is no user of this entry and no race.
- * Then, we don't need lock around "exchange".
- *
- * TODO: we can push these buffers out to HIGHMEM.
- */
-
-/*
- * allocate buffer for swap_cgroup.
- */
-static int swap_cgroup_prepare(int type)
-{
- struct page *page;
- struct swap_cgroup_ctrl *ctrl;
- unsigned long idx, max;
-
- ctrl = &swap_cgroup_ctrl[type];
-
- for (idx = 0; idx < ctrl->length; idx++) {
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page)
- goto not_enough_page;
- ctrl->map[idx] = page;
- if (!(idx % SWAP_CLUSTER_MAX))
- cond_resched();
- }
- return 0;
-not_enough_page:
- max = idx;
- for (idx = 0; idx < max; idx++)
- __free_page(ctrl->map[idx]);
+struct swap_cgroup_ctrl {
+ struct swap_cgroup *map;
+};
- return -ENOMEM;
-}
+static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
-static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl,
- pgoff_t offset)
+static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map,
+ pgoff_t offset)
{
- struct page *mappage;
- struct swap_cgroup *sc;
+ unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT;
+ unsigned int old_ids = atomic_read(&map[offset / ID_PER_SC].ids);
+
+ BUILD_BUG_ON(!is_power_of_2(ID_PER_SC));
+ BUILD_BUG_ON(sizeof(struct swap_cgroup) != sizeof(atomic_t));
- mappage = ctrl->map[offset / SC_PER_PAGE];
- sc = page_address(mappage);
- return sc + offset % SC_PER_PAGE;
+ return (old_ids >> shift) & ID_MASK;
}
-static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
- struct swap_cgroup_ctrl **ctrlp)
+static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map,
+ pgoff_t offset,
+ unsigned short new_id)
{
- pgoff_t offset = swp_offset(ent);
- struct swap_cgroup_ctrl *ctrl;
-
- ctrl = &swap_cgroup_ctrl[swp_type(ent)];
- if (ctrlp)
- *ctrlp = ctrl;
- return __lookup_swap_cgroup(ctrl, offset);
+ unsigned short old_id;
+ struct swap_cgroup *sc = &map[offset / ID_PER_SC];
+ unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT;
+ unsigned int new_ids, old_ids = atomic_read(&sc->ids);
+
+ do {
+ old_id = (old_ids >> shift) & ID_MASK;
+ new_ids = (old_ids & ~(ID_MASK << shift));
+ new_ids |= ((unsigned int)new_id) << shift;
+ } while (!atomic_try_cmpxchg(&sc->ids, &old_ids, new_ids));
+
+ return old_id;
}
/**
- * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
- * @ent: swap entry to be cmpxchged
- * @old: old id
- * @new: new id
+ * swap_cgroup_record - record mem_cgroup for a set of swap entries.
+ * These entries must belong to one single folio, and that folio
+ * must be being charged for swap space (swap out), and these
+ * entries must not have been charged.
*
- * Returns old id at success, 0 at failure.
- * (There is no mem_cgroup using 0 as its id)
+ * @folio: the folio that the swap entry belongs to
+ * @id: mem_cgroup ID to be recorded
+ * @ent: the first swap entry to be recorded
*/
-unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
- unsigned short old, unsigned short new)
+void swap_cgroup_record(struct folio *folio, unsigned short id,
+ swp_entry_t ent)
{
- struct swap_cgroup_ctrl *ctrl;
- struct swap_cgroup *sc;
- unsigned long flags;
- unsigned short retval;
-
- sc = lookup_swap_cgroup(ent, &ctrl);
-
- spin_lock_irqsave(&ctrl->lock, flags);
- retval = sc->id;
- if (retval == old)
- sc->id = new;
- else
- retval = 0;
- spin_unlock_irqrestore(&ctrl->lock, flags);
- return retval;
+ unsigned int nr_ents = folio_nr_pages(folio);
+ struct swap_cgroup *map;
+ pgoff_t offset, end;
+ unsigned short old;
+
+ offset = swp_offset(ent);
+ end = offset + nr_ents;
+ map = swap_cgroup_ctrl[swp_type(ent)].map;
+
+ do {
+ old = __swap_cgroup_id_xchg(map, offset, id);
+ VM_BUG_ON(old);
+ } while (++offset != end);
}
/**
- * swap_cgroup_record - record mem_cgroup for a set of swap entries
+ * swap_cgroup_clear - clear mem_cgroup for a set of swap entries.
+ * These entries must be being uncharged from swap. They either
+ * belong to one single folio in the swap cache (swap in for
+ * cgroup v1), or no longer have any users (slot freeing).
+ *
* @ent: the first swap entry to be recorded into
- * @id: mem_cgroup to be recorded
* @nr_ents: number of swap entries to be recorded
*
- * Returns old value at success, 0 at failure.
- * (Of course, old value can be 0.)
+ * Returns the existing old value.
*/
-unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
- unsigned int nr_ents)
+unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents)
{
- struct swap_cgroup_ctrl *ctrl;
- struct swap_cgroup *sc;
- unsigned short old;
- unsigned long flags;
- pgoff_t offset = swp_offset(ent);
- pgoff_t end = offset + nr_ents;
-
- sc = lookup_swap_cgroup(ent, &ctrl);
-
- spin_lock_irqsave(&ctrl->lock, flags);
- old = sc->id;
- for (;;) {
- VM_BUG_ON(sc->id != old);
- sc->id = id;
- offset++;
- if (offset == end)
- break;
- if (offset % SC_PER_PAGE)
- sc++;
- else
- sc = __lookup_swap_cgroup(ctrl, offset);
- }
- spin_unlock_irqrestore(&ctrl->lock, flags);
+ pgoff_t offset, end;
+ struct swap_cgroup *map;
+ unsigned short old, iter = 0;
+
+ offset = swp_offset(ent);
+ end = offset + nr_ents;
+ map = swap_cgroup_ctrl[swp_type(ent)].map;
+
+ do {
+ old = __swap_cgroup_id_xchg(map, offset, 0);
+ if (!iter)
+ iter = old;
+ VM_BUG_ON(iter != old);
+ } while (++offset != end);
return old;
}
@@ -161,39 +118,33 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
*/
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
+ struct swap_cgroup_ctrl *ctrl;
+
if (mem_cgroup_disabled())
return 0;
- return lookup_swap_cgroup(ent, NULL)->id;
+
+ ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+ return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent));
}
int swap_cgroup_swapon(int type, unsigned long max_pages)
{
- void *array;
- unsigned long length;
+ struct swap_cgroup *map;
struct swap_cgroup_ctrl *ctrl;
if (mem_cgroup_disabled())
return 0;
- length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
-
- array = vcalloc(length, sizeof(void *));
- if (!array)
+ BUILD_BUG_ON(sizeof(unsigned short) * ID_PER_SC !=
+ sizeof(struct swap_cgroup));
+ map = vzalloc(DIV_ROUND_UP(max_pages, ID_PER_SC) *
+ sizeof(struct swap_cgroup));
+ if (!map)
goto nomem;
ctrl = &swap_cgroup_ctrl[type];
mutex_lock(&swap_cgroup_mutex);
- ctrl->length = length;
- ctrl->map = array;
- spin_lock_init(&ctrl->lock);
- if (swap_cgroup_prepare(type)) {
- /* memory shortage */
- ctrl->map = NULL;
- ctrl->length = 0;
- mutex_unlock(&swap_cgroup_mutex);
- vfree(array);
- goto nomem;
- }
+ ctrl->map = map;
mutex_unlock(&swap_cgroup_mutex);
return 0;
@@ -205,8 +156,7 @@ nomem:
void swap_cgroup_swapoff(int type)
{
- struct page **map;
- unsigned long i, length;
+ struct swap_cgroup *map;
struct swap_cgroup_ctrl *ctrl;
if (mem_cgroup_disabled())
@@ -215,19 +165,8 @@ void swap_cgroup_swapoff(int type)
mutex_lock(&swap_cgroup_mutex);
ctrl = &swap_cgroup_ctrl[type];
map = ctrl->map;
- length = ctrl->length;
ctrl->map = NULL;
- ctrl->length = 0;
mutex_unlock(&swap_cgroup_mutex);
- if (map) {
- for (i = 0; i < length; i++) {
- struct page *page = map[i];
- if (page)
- __free_page(page);
- if (!(i % SWAP_CLUSTER_MAX))
- cond_resched();
- }
- vfree(map);
- }
+ vfree(map);
}
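The mm/swap_cgroup.c rewrite above drops the spinlock-protected array of pages in favour of one flat vzalloc()'d array that packs two 16-bit cgroup IDs into each atomic_t and updates them with atomic_try_cmpxchg(). A minimal sketch of that pack-and-exchange idea, written with C11 atomics purely for illustration (the example_* names are made up):

#include <stdatomic.h>
#include <stdint.h>

#define EX_ID_SHIFT	16
#define EX_ID_MASK	((1u << EX_ID_SHIFT) - 1u)

/* Exchange the 16-bit id held in slot 0 or 1 of *pair, returning the old id. */
static uint16_t example_pair_xchg(atomic_uint *pair, unsigned int slot,
				  uint16_t new_id)
{
	unsigned int shift = (slot & 1) * EX_ID_SHIFT;
	unsigned int old = atomic_load(pair);
	unsigned int next;

	do {
		next = (old & ~(EX_ID_MASK << shift)) |
		       ((unsigned int)new_id << shift);
		/* Retry only if the sibling slot changed under us. */
	} while (!atomic_compare_exchange_weak(pair, &old, next));

	return (old >> shift) & EX_ID_MASK;
}

This mirrors __swap_cgroup_id_xchg() above: entries of a single folio are never updated concurrently, so in practice the only thing that can force a retry is the neighbouring entry sharing the same word.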
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
deleted file mode 100644
index 13ab3b771409..000000000000
--- a/mm/swap_slots.c
+++ /dev/null
@@ -1,353 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Manage cache of swap slots to be used for and returned from
- * swap.
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Author: Tim Chen <tim.c.chen@linux.intel.com>
- *
- * We allocate the swap slots from the global pool and put
- * it into local per cpu caches. This has the advantage
- * of no needing to acquire the swap_info lock every time
- * we need a new slot.
- *
- * There is also opportunity to simply return the slot
- * to local caches without needing to acquire swap_info
- * lock. We do not reuse the returned slots directly but
- * move them back to the global pool in a batch. This
- * allows the slots to coalesce and reduce fragmentation.
- *
- * The swap entry allocated is marked with SWAP_HAS_CACHE
- * flag in map_count that prevents it from being allocated
- * again from the global pool.
- *
- * The swap slots cache is protected by a mutex instead of
- * a spin lock as when we search for slots with scan_swap_map,
- * we can possibly sleep.
- */
-
-#include <linux/swap_slots.h>
-#include <linux/cpu.h>
-#include <linux/cpumask.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/mutex.h>
-#include <linux/mm.h>
-
-static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
-static bool swap_slot_cache_active;
-bool swap_slot_cache_enabled;
-static bool swap_slot_cache_initialized;
-static DEFINE_MUTEX(swap_slots_cache_mutex);
-/* Serialize swap slots cache enable/disable operations */
-static DEFINE_MUTEX(swap_slots_cache_enable_mutex);
-
-static void __drain_swap_slots_cache(unsigned int type);
-
-#define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled)
-#define SLOTS_CACHE 0x1
-#define SLOTS_CACHE_RET 0x2
-
-static void deactivate_swap_slots_cache(void)
-{
- mutex_lock(&swap_slots_cache_mutex);
- swap_slot_cache_active = false;
- __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
- mutex_unlock(&swap_slots_cache_mutex);
-}
-
-static void reactivate_swap_slots_cache(void)
-{
- mutex_lock(&swap_slots_cache_mutex);
- swap_slot_cache_active = true;
- mutex_unlock(&swap_slots_cache_mutex);
-}
-
-/* Must not be called with cpu hot plug lock */
-void disable_swap_slots_cache_lock(void)
-{
- mutex_lock(&swap_slots_cache_enable_mutex);
- swap_slot_cache_enabled = false;
- if (swap_slot_cache_initialized) {
- /* serialize with cpu hotplug operations */
- cpus_read_lock();
- __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
- cpus_read_unlock();
- }
-}
-
-static void __reenable_swap_slots_cache(void)
-{
- swap_slot_cache_enabled = has_usable_swap();
-}
-
-void reenable_swap_slots_cache_unlock(void)
-{
- __reenable_swap_slots_cache();
- mutex_unlock(&swap_slots_cache_enable_mutex);
-}
-
-static bool check_cache_active(void)
-{
- long pages;
-
- if (!swap_slot_cache_enabled)
- return false;
-
- pages = get_nr_swap_pages();
- if (!swap_slot_cache_active) {
- if (pages > num_online_cpus() *
- THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE)
- reactivate_swap_slots_cache();
- goto out;
- }
-
- /* if global pool of slot caches too low, deactivate cache */
- if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE)
- deactivate_swap_slots_cache();
-out:
- return swap_slot_cache_active;
-}
-
-static int alloc_swap_slot_cache(unsigned int cpu)
-{
- struct swap_slots_cache *cache;
- swp_entry_t *slots, *slots_ret;
-
- /*
- * Do allocation outside swap_slots_cache_mutex
- * as kvzalloc could trigger reclaim and folio_alloc_swap,
- * which can lock swap_slots_cache_mutex.
- */
- slots = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
- GFP_KERNEL);
- if (!slots)
- return -ENOMEM;
-
- slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
- GFP_KERNEL);
- if (!slots_ret) {
- kvfree(slots);
- return -ENOMEM;
- }
-
- mutex_lock(&swap_slots_cache_mutex);
- cache = &per_cpu(swp_slots, cpu);
- if (cache->slots || cache->slots_ret) {
- /* cache already allocated */
- mutex_unlock(&swap_slots_cache_mutex);
-
- kvfree(slots);
- kvfree(slots_ret);
-
- return 0;
- }
-
- if (!cache->lock_initialized) {
- mutex_init(&cache->alloc_lock);
- spin_lock_init(&cache->free_lock);
- cache->lock_initialized = true;
- }
- cache->nr = 0;
- cache->cur = 0;
- cache->n_ret = 0;
- /*
- * We initialized alloc_lock and free_lock earlier. We use
- * !cache->slots or !cache->slots_ret to know if it is safe to acquire
- * the corresponding lock and use the cache. Memory barrier below
- * ensures the assumption.
- */
- mb();
- cache->slots = slots;
- cache->slots_ret = slots_ret;
- mutex_unlock(&swap_slots_cache_mutex);
- return 0;
-}
-
-static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
- bool free_slots)
-{
- struct swap_slots_cache *cache;
- swp_entry_t *slots = NULL;
-
- cache = &per_cpu(swp_slots, cpu);
- if ((type & SLOTS_CACHE) && cache->slots) {
- mutex_lock(&cache->alloc_lock);
- swapcache_free_entries(cache->slots + cache->cur, cache->nr);
- cache->cur = 0;
- cache->nr = 0;
- if (free_slots && cache->slots) {
- kvfree(cache->slots);
- cache->slots = NULL;
- }
- mutex_unlock(&cache->alloc_lock);
- }
- if ((type & SLOTS_CACHE_RET) && cache->slots_ret) {
- spin_lock_irq(&cache->free_lock);
- swapcache_free_entries(cache->slots_ret, cache->n_ret);
- cache->n_ret = 0;
- if (free_slots && cache->slots_ret) {
- slots = cache->slots_ret;
- cache->slots_ret = NULL;
- }
- spin_unlock_irq(&cache->free_lock);
- kvfree(slots);
- }
-}
-
-static void __drain_swap_slots_cache(unsigned int type)
-{
- unsigned int cpu;
-
- /*
- * This function is called during
- * 1) swapoff, when we have to make sure no
- * left over slots are in cache when we remove
- * a swap device;
- * 2) disabling of swap slot cache, when we run low
- * on swap slots when allocating memory and need
- * to return swap slots to global pool.
- *
- * We cannot acquire cpu hot plug lock here as
- * this function can be invoked in the cpu
- * hot plug path:
- * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback
- * -> memory allocation -> direct reclaim -> folio_alloc_swap
- * -> drain_swap_slots_cache
- *
- * Hence the loop over current online cpu below could miss cpu that
- * is being brought online but not yet marked as online.
- * That is okay as we do not schedule and run anything on a
- * cpu before it has been marked online. Hence, we will not
- * fill any swap slots in slots cache of such cpu.
- * There are no slots on such cpu that need to be drained.
- */
- for_each_online_cpu(cpu)
- drain_slots_cache_cpu(cpu, type, false);
-}
-
-static int free_slot_cache(unsigned int cpu)
-{
- mutex_lock(&swap_slots_cache_mutex);
- drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true);
- mutex_unlock(&swap_slots_cache_mutex);
- return 0;
-}
-
-void enable_swap_slots_cache(void)
-{
- mutex_lock(&swap_slots_cache_enable_mutex);
- if (!swap_slot_cache_initialized) {
- int ret;
-
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
- alloc_swap_slot_cache, free_slot_cache);
- if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
- "without swap slots cache.\n", __func__))
- goto out_unlock;
-
- swap_slot_cache_initialized = true;
- }
-
- __reenable_swap_slots_cache();
-out_unlock:
- mutex_unlock(&swap_slots_cache_enable_mutex);
-}
-
-/* called with swap slot cache's alloc lock held */
-static int refill_swap_slots_cache(struct swap_slots_cache *cache)
-{
- if (!use_swap_slot_cache)
- return 0;
-
- cache->cur = 0;
- if (swap_slot_cache_active)
- cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
- cache->slots, 0);
-
- return cache->nr;
-}
-
-void free_swap_slot(swp_entry_t entry)
-{
- struct swap_slots_cache *cache;
-
- /* Large folio swap slot is not covered. */
- zswap_invalidate(entry);
-
- cache = raw_cpu_ptr(&swp_slots);
- if (likely(use_swap_slot_cache && cache->slots_ret)) {
- spin_lock_irq(&cache->free_lock);
- /* Swap slots cache may be deactivated before acquiring lock */
- if (!use_swap_slot_cache || !cache->slots_ret) {
- spin_unlock_irq(&cache->free_lock);
- goto direct_free;
- }
- if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) {
- /*
- * Return slots to global pool.
- * The current swap_map value is SWAP_HAS_CACHE.
- * Set it to 0 to indicate it is available for
- * allocation in global pool
- */
- swapcache_free_entries(cache->slots_ret, cache->n_ret);
- cache->n_ret = 0;
- }
- cache->slots_ret[cache->n_ret++] = entry;
- spin_unlock_irq(&cache->free_lock);
- } else {
-direct_free:
- swapcache_free_entries(&entry, 1);
- }
-}
-
-swp_entry_t folio_alloc_swap(struct folio *folio)
-{
- swp_entry_t entry;
- struct swap_slots_cache *cache;
-
- entry.val = 0;
-
- if (folio_test_large(folio)) {
- if (IS_ENABLED(CONFIG_THP_SWAP))
- get_swap_pages(1, &entry, folio_order(folio));
- goto out;
- }
-
- /*
- * Preemption is allowed here, because we may sleep
- * in refill_swap_slots_cache(). But it is safe, because
- * accesses to the per-CPU data structure are protected by the
- * mutex cache->alloc_lock.
- *
- * The alloc path here does not touch cache->slots_ret
- * so cache->free_lock is not taken.
- */
- cache = raw_cpu_ptr(&swp_slots);
-
- if (likely(check_cache_active() && cache->slots)) {
- mutex_lock(&cache->alloc_lock);
- if (cache->slots) {
-repeat:
- if (cache->nr) {
- entry = cache->slots[cache->cur];
- cache->slots[cache->cur++].val = 0;
- cache->nr--;
- } else if (refill_swap_slots_cache(cache)) {
- goto repeat;
- }
- }
- mutex_unlock(&cache->alloc_lock);
- if (entry.val)
- goto out;
- }
-
- get_swap_pages(1, &entry, 0);
-out:
- if (mem_cgroup_try_charge_swap(folio, entry)) {
- put_swap_folio(folio, entry);
- entry.val = 0;
- }
- return entry;
-}
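With mm/swap_slots.c removed, its per-CPU caching role is taken over by the much smaller percpu_swap_cluster added in the mm/swapfile.c hunks further down: instead of stashing batches of swp_entry_t behind mutexes, each CPU just remembers a preferred cluster offset per allocation order under a local_lock. A reduced sketch of that shape (everything named example_* is hypothetical; only the locking primitives are real):

#include <linux/local_lock.h>
#include <linux/percpu.h>

struct example_pcp_cache {
	unsigned long	offset;		/* next offset to try, 0 means nothing cached */
	local_lock_t	lock;
};

static DEFINE_PER_CPU(struct example_pcp_cache, example_cache) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static unsigned long example_take_cached_offset(void)
{
	unsigned long offset;

	/* Disables preemption (or takes a per-CPU lock on PREEMPT_RT). */
	local_lock(&example_cache.lock);
	offset = this_cpu_read(example_cache.offset);
	this_cpu_write(example_cache.offset, 0);
	local_unlock(&example_cache.lock);

	return offset;
}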
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e0c0321b8ff7..30a5739b2c51 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -20,7 +20,6 @@
#include <linux/blkdev.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
-#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
@@ -31,7 +30,6 @@
* vmscan's shrink_folio_list.
*/
static const struct address_space_operations swap_aops = {
- .writepage = swap_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_MIGRATION
.migrate_folio = migrate_folio,
@@ -85,7 +83,7 @@ void *get_shadow_from_swap_cache(swp_entry_t entry)
/*
* add_to_swap_cache resembles filemap_add_folio on swapper_space,
- * but sets SwapCache flag and private instead of mapping and index.
+ * but sets SwapCache flag and 'swap' instead of mapping and index.
*/
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
gfp_t gfp, void **shadowp)
@@ -167,67 +165,6 @@ void __delete_from_swap_cache(struct folio *folio,
__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
}
-/**
- * add_to_swap - allocate swap space for a folio
- * @folio: folio we want to move to swap
- *
- * Allocate swap space for the folio and add the folio to the
- * swap cache.
- *
- * Context: Caller needs to hold the folio lock.
- * Return: Whether the folio was added to the swap cache.
- */
-bool add_to_swap(struct folio *folio)
-{
- swp_entry_t entry;
- int err;
-
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
-
- entry = folio_alloc_swap(folio);
- if (!entry.val)
- return false;
-
- /*
- * XArray node allocations from PF_MEMALLOC contexts could
- * completely exhaust the page allocator. __GFP_NOMEMALLOC
- * stops emergency reserves from being allocated.
- *
- * TODO: this could cause a theoretical memory reclaim
- * deadlock in the swap out path.
- */
- /*
- * Add it to the swap cache.
- */
- err = add_to_swap_cache(folio, entry,
- __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
- if (err)
- /*
- * add_to_swap_cache() doesn't return -EEXIST, so we can safely
- * clear SWAP_HAS_CACHE flag.
- */
- goto fail;
- /*
- * Normally the folio will be dirtied in unmap because its
- * pte should be dirty. A special case is MADV_FREE page. The
- * page's pte could have dirty bit cleared but the folio's
- * SwapBacked flag is still set because clearing the dirty bit
- * and SwapBacked flag has no lock protected. For such folio,
- * unmap will not set dirty bit for it, so folio reclaim will
- * not write the folio out. This can cause data corruption when
- * the folio is swapped in later. Always setting the dirty flag
- * for the folio solves the problem.
- */
- folio_mark_dirty(folio);
-
- return true;
-
-fail:
- put_swap_folio(folio, entry);
- return false;
-}
-
/*
* This must be called only on folios that have
* been verified to be in the swap cache and locked.
@@ -270,9 +207,7 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
xa_unlock_irq(&address_space->i_pages);
/* search the next swapcache until we meet end */
- curr >>= SWAP_ADDRESS_SPACE_SHIFT;
- curr++;
- curr <<= SWAP_ADDRESS_SPACE_SHIFT;
+ curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES);
if (curr > end)
break;
}
@@ -296,13 +231,11 @@ void free_swap_cache(struct folio *folio)
}
/*
- * Perform a free_page(), also freeing any swap cache associated with
- * this page if it is the last user of the page.
+ * Freeing a folio and also freeing any swap cache associated with
+ * this folio if it is the last user.
*/
-void free_page_and_swap_cache(struct page *page)
+void free_folio_and_swap_cache(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
free_swap_cache(folio);
if (!is_huge_zero_folio(folio))
folio_put(folio);
@@ -317,7 +250,6 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
struct folio_batch folios;
unsigned int refs[PAGEVEC_SIZE];
- lru_add_drain();
folio_batch_init(&folios);
for (int i = 0; i < nr; i++) {
struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
@@ -433,17 +365,13 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
bool skip_if_exists)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si = swp_swap_info(entry);
struct folio *folio;
struct folio *new_folio = NULL;
struct folio *result = NULL;
void *shadow = NULL;
*new_page_allocated = false;
- si = get_swap_device(entry);
- if (!si)
- return NULL;
-
for (;;) {
int err;
/*
@@ -458,13 +386,8 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/*
* Just skip read ahead for unused swap slot.
- * During swap_off when swap_slot_cache is disabled,
- * we have to handle the race between putting
- * swap entry in swap cache and marking swap slot
- * as SWAP_HAS_CACHE. That's done in later part of code or
- * else swap_off will be aborted if we return NULL.
*/
- if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
+ if (!swap_entry_swapped(si, entry))
goto put_and_return;
/*
@@ -522,7 +445,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
goto fail_unlock;
- mem_cgroup_swapin_uncharge_swap(entry, 1);
+ memcg1_swapin(entry, 1);
if (shadow)
workingset_refault(new_folio, shadow);
@@ -539,7 +462,6 @@ fail_unlock:
put_swap_folio(new_folio, entry);
folio_unlock(new_folio);
put_and_return:
- put_swap_device(si);
if (!(*new_page_allocated) && new_folio)
folio_put(new_folio);
return result;
@@ -559,11 +481,16 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
struct swap_iocb **plug)
{
+ struct swap_info_struct *si;
bool page_allocated;
struct mempolicy *mpol;
pgoff_t ilx;
struct folio *folio;
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
+
mpol = get_vma_policy(vma, addr, 0, &ilx);
folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
&page_allocated, false);
@@ -571,6 +498,8 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
if (page_allocated)
swap_read_folio(folio, plug);
+
+ put_swap_device(si);
return folio;
}
@@ -817,6 +746,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
blk_start_plug(&plug);
for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) {
+ struct swap_info_struct *si = NULL;
+
if (!pte++) {
pte = pte_offset_map(vmf->pmd, addr);
if (!pte)
@@ -830,8 +761,19 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
continue;
pte_unmap(pte);
pte = NULL;
+ /*
+ * Readahead entry may come from a device that we are not
+ * holding a reference to, try to grab a reference, or skip.
+ */
+ if (swp_type(entry) != swp_type(targ_entry)) {
+ si = get_swap_device(entry);
+ if (!si)
+ continue;
+ }
folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
&page_allocated, false);
+ if (si)
+ put_swap_device(si);
if (!folio)
continue;
if (page_allocated) {
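The mm/swap_state.c hunks above move the get_swap_device()/put_swap_device() pinning out of __read_swap_cache_async() and into its callers, with swap_vma_readahead() only taking an extra reference when a readahead entry lives on a different device than the faulting one. As a hedged sketch of that pattern in isolation (example_peek_entry() is hypothetical):

static struct folio *example_peek_entry(swp_entry_t entry, swp_entry_t targ_entry)
{
	struct swap_info_struct *si = NULL;
	struct folio *folio;

	/* Pin a foreign device so it cannot be swapped off under us. */
	if (swp_type(entry) != swp_type(targ_entry)) {
		si = get_swap_device(entry);
		if (!si)
			return NULL;	/* device went away, skip this entry */
	}

	folio = filemap_get_folio(swap_address_space(entry),
				  swap_cache_index(entry));
	if (si)
		put_swap_device(si);

	return IS_ERR(folio) ? NULL : folio;
}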
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b0a9071cfe1d..73065d75d0e1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -37,7 +37,6 @@
#include <linux/oom.h>
#include <linux/swapfile.h>
#include <linux/export.h>
-#include <linux/swap_slots.h>
#include <linux/sort.h>
#include <linux/completion.h>
#include <linux/suspend.h>
@@ -53,15 +52,15 @@
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
-static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
- unsigned int nr_pages);
-static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
+static void swap_entries_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr_pages);
+static void swap_range_alloc(struct swap_info_struct *si,
unsigned int nr_entries);
static bool folio_swapcache_freeable(struct folio *folio);
-static struct swap_cluster_info *lock_cluster_or_swap_info(
- struct swap_info_struct *si, unsigned long offset);
-static void unlock_cluster_or_swap_info(struct swap_info_struct *si,
- struct swap_cluster_info *ci);
+static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
+ unsigned long offset);
+static inline void unlock_cluster(struct swap_cluster_info *ci);
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
@@ -116,6 +115,18 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
atomic_t nr_rotate_swap = ATOMIC_INIT(0);
+struct percpu_swap_cluster {
+ struct swap_info_struct *si[SWAP_NR_ORDERS];
+ unsigned long offset[SWAP_NR_ORDERS];
+ local_lock_t lock;
+};
+
+static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
+ .si = { NULL },
+ .offset = { SWAP_ENTRY_INVALID },
+ .lock = INIT_LOCAL_LOCK(),
+};
+
static struct swap_info_struct *swap_type_to_swap_info(int type)
{
if (type >= MAX_SWAPFILES)
@@ -129,6 +140,26 @@ static inline unsigned char swap_count(unsigned char ent)
return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
}
+/*
+ * Use the second highest bit of the inuse_pages counter as an indicator
+ * of whether a swap device is on the available plist, so the atomic can
+ * still be updated arithmetically while having special data embedded.
+ *
+ * inuse_pages counter is the only thing indicating if a device should
+ * be on avail_lists or not (except swapon / swapoff). By embedding the
+ * off-list bit in the atomic counter, updates no longer need any lock
+ * to check the list status.
+ *
+ * This bit will be set if the device is not on the plist and not
+ * usable, will be cleared if the device is on the plist.
+ */
+#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
+#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)
+static long swap_usage_in_pages(struct swap_info_struct *si)
+{
+ return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
+}
+
/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY 0x1
/*
@@ -138,10 +169,8 @@ static inline unsigned char swap_count(unsigned char ent)
#define TTRS_UNMAPPED 0x2
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL 0x4
-/* Reclaim directly, bypass the slot cache and don't touch device lock */
-#define TTRS_DIRECT 0x8
-static bool swap_is_has_cache(struct swap_info_struct *si,
+static bool swap_only_has_cache(struct swap_info_struct *si,
unsigned long offset, int nr_pages)
{
unsigned char *map = si->swap_map + offset;
@@ -163,7 +192,7 @@ static bool swap_is_last_map(struct swap_info_struct *si,
unsigned char *map_end = map + nr_pages;
unsigned char count = *map;
- if (swap_count(count) != 1)
+ if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM)
return false;
while (++map < map_end) {
@@ -190,6 +219,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
int ret, nr_pages;
bool need_reclaim;
+again:
folio = filemap_get_folio(address_space, swap_cache_index(entry));
if (IS_ERR(folio))
return 0;
@@ -207,8 +237,16 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
if (!folio_trylock(folio))
goto out;
- /* offset could point to the middle of a large folio */
+ /*
+ * Offset could point to the middle of a large folio, or the folio
+ * may no longer cover the expected offset by the time it is locked.
+ */
entry = folio->swap;
+ if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto again;
+ }
offset = swp_offset(entry);
need_reclaim = ((flags & TTRS_ANYWAY) ||
@@ -222,32 +260,14 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
* swap_map is HAS_CACHE only, which means the slots have no page table
* reference or pending writeback, and can't be allocated to others.
*/
- ci = lock_cluster_or_swap_info(si, offset);
- need_reclaim = swap_is_has_cache(si, offset, nr_pages);
- unlock_cluster_or_swap_info(si, ci);
+ ci = lock_cluster(si, offset);
+ need_reclaim = swap_only_has_cache(si, offset, nr_pages);
+ unlock_cluster(ci);
if (!need_reclaim)
goto out_unlock;
- if (!(flags & TTRS_DIRECT)) {
- /* Free through slot cache */
- delete_from_swap_cache(folio);
- folio_set_dirty(folio);
- ret = nr_pages;
- goto out_unlock;
- }
-
- xa_lock_irq(&address_space->i_pages);
- __delete_from_swap_cache(folio, entry, NULL);
- xa_unlock_irq(&address_space->i_pages);
- folio_ref_sub(folio, nr_pages);
+ delete_from_swap_cache(folio);
folio_set_dirty(folio);
-
- spin_lock(&si->lock);
- /* Only sinple page folio can be backed by zswap */
- if (nr_pages == 1)
- zswap_invalidate(entry);
- swap_entry_range_free(si, entry, nr_pages);
- spin_unlock(&si->lock);
ret = nr_pages;
out_unlock:
folio_unlock(folio);
@@ -382,9 +402,23 @@ static void discard_swap_cluster(struct swap_info_struct *si,
#endif
#define LATENCY_LIMIT 256
-static inline bool cluster_is_free(struct swap_cluster_info *info)
+static inline bool cluster_is_empty(struct swap_cluster_info *info)
+{
+ return info->count == 0;
+}
+
+static inline bool cluster_is_discard(struct swap_cluster_info *info)
+{
+ return info->flags == CLUSTER_FLAG_DISCARD;
+}
+
+static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
{
- return info->flags & CLUSTER_FLAG_FREE;
+ if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
+ return false;
+ if (!order)
+ return true;
+ return cluster_is_empty(ci) || order == ci->order;
}
static inline unsigned int cluster_index(struct swap_info_struct *si,
@@ -393,6 +427,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si,
return ci - si->cluster_info;
}
+static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si,
+ unsigned long offset)
+{
+ return &si->cluster_info[offset / SWAPFILE_CLUSTER];
+}
+
static inline unsigned int cluster_offset(struct swap_info_struct *si,
struct swap_cluster_info *ci)
{
@@ -404,105 +444,134 @@ static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si
{
struct swap_cluster_info *ci;
- ci = si->cluster_info;
- if (ci) {
- ci += offset / SWAPFILE_CLUSTER;
- spin_lock(&ci->lock);
- }
+ ci = offset_to_cluster(si, offset);
+ spin_lock(&ci->lock);
+
return ci;
}
static inline void unlock_cluster(struct swap_cluster_info *ci)
{
- if (ci)
- spin_unlock(&ci->lock);
+ spin_unlock(&ci->lock);
}
-/*
- * Determine the locking method in use for this device. Return
- * swap_cluster_info if SSD-style cluster-based locking is in place.
- */
-static inline struct swap_cluster_info *lock_cluster_or_swap_info(
- struct swap_info_struct *si, unsigned long offset)
+static void move_cluster(struct swap_info_struct *si,
+ struct swap_cluster_info *ci, struct list_head *list,
+ enum swap_cluster_flags new_flags)
{
- struct swap_cluster_info *ci;
+ VM_WARN_ON(ci->flags == new_flags);
- /* Try to use fine-grained SSD-style locking if available: */
- ci = lock_cluster(si, offset);
- /* Otherwise, fall back to traditional, coarse locking: */
- if (!ci)
- spin_lock(&si->lock);
-
- return ci;
-}
+ BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX);
+ lockdep_assert_held(&ci->lock);
-static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
- struct swap_cluster_info *ci)
-{
- if (ci)
- unlock_cluster(ci);
+ spin_lock(&si->lock);
+ if (ci->flags == CLUSTER_FLAG_NONE)
+ list_add_tail(&ci->list, list);
else
- spin_unlock(&si->lock);
+ list_move_tail(&ci->list, list);
+ spin_unlock(&si->lock);
+
+ if (ci->flags == CLUSTER_FLAG_FRAG)
+ atomic_long_dec(&si->frag_cluster_nr[ci->order]);
+ else if (new_flags == CLUSTER_FLAG_FRAG)
+ atomic_long_inc(&si->frag_cluster_nr[ci->order]);
+ ci->flags = new_flags;
}
/* Add a cluster to discard list and schedule it to do discard */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
struct swap_cluster_info *ci)
{
- unsigned int idx = cluster_index(si, ci);
- /*
- * If scan_swap_map_slots() can't find a free cluster, it will check
- * si->swap_map directly. To make sure the discarding cluster isn't
- * taken by scan_swap_map_slots(), mark the swap entries bad (occupied).
- * It will be cleared after discard
- */
- memset(si->swap_map + idx * SWAPFILE_CLUSTER,
- SWAP_MAP_BAD, SWAPFILE_CLUSTER);
-
- VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
- list_move_tail(&ci->list, &si->discard_clusters);
- ci->flags = 0;
+ VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
+ move_cluster(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD);
schedule_work(&si->discard_work);
}
static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
- lockdep_assert_held(&si->lock);
lockdep_assert_held(&ci->lock);
-
- if (ci->flags)
- list_move_tail(&ci->list, &si->free_clusters);
- else
- list_add_tail(&ci->list, &si->free_clusters);
- ci->flags = CLUSTER_FLAG_FREE;
+ move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
ci->order = 0;
}
/*
+ * Isolate and lock the first cluster that is not contended on a list,
+ * clearing its flag before it is taken off-list. The cluster flag must be
+ * in sync with the list status, so cluster updaters can always know the
+ * cluster list status without touching the si lock.
+ *
+ * Note it's possible that all clusters on a list are contended, so
+ * this can return NULL for a non-empty list.
+ */
+static struct swap_cluster_info *isolate_lock_cluster(
+ struct swap_info_struct *si, struct list_head *list)
+{
+ struct swap_cluster_info *ci, *ret = NULL;
+
+ spin_lock(&si->lock);
+
+ if (unlikely(!(si->flags & SWP_WRITEOK)))
+ goto out;
+
+ list_for_each_entry(ci, list, list) {
+ if (!spin_trylock(&ci->lock))
+ continue;
+
+ /* We may only isolate and clear flags of following lists */
+ VM_BUG_ON(!ci->flags);
+ VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE &&
+ ci->flags != CLUSTER_FLAG_FULL);
+
+ list_del(&ci->list);
+ ci->flags = CLUSTER_FLAG_NONE;
+ ret = ci;
+ break;
+ }
+out:
+ spin_unlock(&si->lock);
+
+ return ret;
+}
+
+/*
* Doing discard actually. After a cluster discard is finished, the cluster
- * will be added to free cluster list. caller should hold si->lock.
-*/
-static void swap_do_scheduled_discard(struct swap_info_struct *si)
+ * will be added to free cluster list. Discard clusters are a bit special as
+ * they don't participate in allocation or reclaim, so clusters marked as
+ * CLUSTER_FLAG_DISCARD must remain off-list or on discard list.
+ */
+static bool swap_do_scheduled_discard(struct swap_info_struct *si)
{
struct swap_cluster_info *ci;
+ bool ret = false;
unsigned int idx;
+ spin_lock(&si->lock);
while (!list_empty(&si->discard_clusters)) {
ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
+ /*
+ * Delete the cluster from list to prepare for discard, but keep
+ * the CLUSTER_FLAG_DISCARD flag: percpu_swap_cluster could still be
+ * pointing to it, or relocate_cluster could run into it.
+ */
list_del(&ci->list);
idx = cluster_index(si, ci);
spin_unlock(&si->lock);
-
discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
SWAPFILE_CLUSTER);
- spin_lock(&si->lock);
spin_lock(&ci->lock);
+ /*
+ * Discard is done, clear its flags as it's off-list, then
+ * return the cluster to allocation list.
+ */
+ ci->flags = CLUSTER_FLAG_NONE;
__free_cluster(si, ci);
- memset(si->swap_map + idx * SWAPFILE_CLUSTER,
- 0, SWAPFILE_CLUSTER);
spin_unlock(&ci->lock);
+ ret = true;
+ spin_lock(&si->lock);
}
+ spin_unlock(&si->lock);
+ return ret;
}
static void swap_discard_work(struct work_struct *work)
@@ -511,9 +580,7 @@ static void swap_discard_work(struct work_struct *work)
si = container_of(work, struct swap_info_struct, discard_work);
- spin_lock(&si->lock);
swap_do_scheduled_discard(si);
- spin_unlock(&si->lock);
}
static void swap_users_ref_free(struct percpu_ref *ref)
@@ -524,15 +591,16 @@ static void swap_users_ref_free(struct percpu_ref *ref)
complete(&si->comp);
}
+/*
+ * Must be called after freeing if ci->count == 0, moves the cluster to free
+ * or discard list.
+ */
static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
VM_BUG_ON(ci->count != 0);
- lockdep_assert_held(&si->lock);
+ VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
lockdep_assert_held(&ci->lock);
- if (ci->flags & CLUSTER_FLAG_FRAG)
- si->frag_cluster_nr[ci->order]--;
-
/*
* If the swap is discardable, prepare discard the cluster
* instead of free it immediately. The cluster will be freed
@@ -548,6 +616,49 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *
}
/*
+ * Must be called after freeing if ci->count != 0, moves the cluster to
+ * nonfull list.
+ */
+static void partial_free_cluster(struct swap_info_struct *si,
+ struct swap_cluster_info *ci)
+{
+ VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER);
+ lockdep_assert_held(&ci->lock);
+
+ if (ci->flags != CLUSTER_FLAG_NONFULL)
+ move_cluster(si, ci, &si->nonfull_clusters[ci->order],
+ CLUSTER_FLAG_NONFULL);
+}
+
+/*
+ * Must be called after allocation, moves the cluster to full or frag list.
+ * Note: allocation doesn't acquire si lock, and may drop the ci lock for
+ * reclaim, so the cluster could be anywhere when called.
+ */
+static void relocate_cluster(struct swap_info_struct *si,
+ struct swap_cluster_info *ci)
+{
+ lockdep_assert_held(&ci->lock);
+
+ /* Discard cluster must remain off-list or on discard list */
+ if (cluster_is_discard(ci))
+ return;
+
+ if (!ci->count) {
+ if (ci->flags != CLUSTER_FLAG_FREE)
+ free_cluster(si, ci);
+ } else if (ci->count != SWAPFILE_CLUSTER) {
+ if (ci->flags != CLUSTER_FLAG_FRAG)
+ move_cluster(si, ci, &si->frag_clusters[ci->order],
+ CLUSTER_FLAG_FRAG);
+ } else {
+ if (ci->flags != CLUSTER_FLAG_FULL)
+ move_cluster(si, ci, &si->full_clusters,
+ CLUSTER_FLAG_FULL);
+ }
+}
+
+/*
* The cluster corresponding to page_nr will be used. The cluster will not be
* added to free cluster list and its usage counter will be increased by 1.
* Only used for initialization.
@@ -558,9 +669,6 @@ static void inc_cluster_info_page(struct swap_info_struct *si,
unsigned long idx = page_nr / SWAPFILE_CLUSTER;
struct swap_cluster_info *ci;
- if (!cluster_info)
- return;
-
ci = cluster_info + idx;
ci->count++;
@@ -568,63 +676,33 @@ static void inc_cluster_info_page(struct swap_info_struct *si,
VM_BUG_ON(ci->flags);
}
-/*
- * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0,
- * which means no page in the cluster is in use, we can optionally discard
- * the cluster and add it to free cluster list.
- */
-static void dec_cluster_info_page(struct swap_info_struct *si,
- struct swap_cluster_info *ci, int nr_pages)
-{
- if (!si->cluster_info)
- return;
-
- VM_BUG_ON(ci->count < nr_pages);
- VM_BUG_ON(cluster_is_free(ci));
- lockdep_assert_held(&si->lock);
- lockdep_assert_held(&ci->lock);
- ci->count -= nr_pages;
-
- if (!ci->count) {
- free_cluster(si, ci);
- return;
- }
-
- if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
- VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
- if (ci->flags & CLUSTER_FLAG_FRAG)
- si->frag_cluster_nr[ci->order]--;
- list_move_tail(&ci->list, &si->nonfull_clusters[ci->order]);
- ci->flags = CLUSTER_FLAG_NONFULL;
- }
-}
-
static bool cluster_reclaim_range(struct swap_info_struct *si,
struct swap_cluster_info *ci,
unsigned long start, unsigned long end)
{
unsigned char *map = si->swap_map;
- unsigned long offset;
+ unsigned long offset = start;
+ int nr_reclaim;
spin_unlock(&ci->lock);
- spin_unlock(&si->lock);
-
- for (offset = start; offset < end; offset++) {
+ do {
switch (READ_ONCE(map[offset])) {
case 0:
- continue;
+ offset++;
+ break;
case SWAP_HAS_CACHE:
- if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0)
- continue;
- goto out;
+ nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
+ if (nr_reclaim > 0)
+ offset += nr_reclaim;
+ else
+ goto out;
+ break;
default:
goto out;
}
- }
+ } while (offset < end);
out:
- spin_lock(&si->lock);
spin_lock(&ci->lock);
-
/*
* Recheck the range no matter reclaim succeeded or not, the slot
* could have been freed while we are not holding the lock.
@@ -638,11 +716,14 @@ out:
static bool cluster_scan_range(struct swap_info_struct *si,
struct swap_cluster_info *ci,
- unsigned long start, unsigned int nr_pages)
+ unsigned long start, unsigned int nr_pages,
+ bool *need_reclaim)
{
unsigned long offset, end = start + nr_pages;
unsigned char *map = si->swap_map;
- bool need_reclaim = false;
+
+ if (cluster_is_empty(ci))
+ return true;
for (offset = start; offset < end; offset++) {
switch (READ_ONCE(map[offset])) {
@@ -651,16 +732,13 @@ static bool cluster_scan_range(struct swap_info_struct *si,
case SWAP_HAS_CACHE:
if (!vm_swap_full())
return false;
- need_reclaim = true;
+ *need_reclaim = true;
continue;
default:
return false;
}
}
- if (need_reclaim)
- return cluster_reclaim_range(si, ci, start, end);
-
return true;
}
@@ -670,76 +748,83 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
{
unsigned int nr_pages = 1 << order;
+ lockdep_assert_held(&ci->lock);
+
if (!(si->flags & SWP_WRITEOK))
return false;
- if (cluster_is_free(ci)) {
- if (nr_pages < SWAPFILE_CLUSTER) {
- list_move_tail(&ci->list, &si->nonfull_clusters[order]);
- ci->flags = CLUSTER_FLAG_NONFULL;
- }
+ /*
+ * The first allocation in a cluster makes the
+ * cluster exclusive to this order
+ */
+ if (cluster_is_empty(ci))
ci->order = order;
- }
memset(si->swap_map + start, usage, nr_pages);
- swap_range_alloc(si, start, nr_pages);
+ swap_range_alloc(si, nr_pages);
ci->count += nr_pages;
- if (ci->count == SWAPFILE_CLUSTER) {
- VM_BUG_ON(!(ci->flags &
- (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG)));
- if (ci->flags & CLUSTER_FLAG_FRAG)
- si->frag_cluster_nr[ci->order]--;
- list_move_tail(&ci->list, &si->full_clusters);
- ci->flags = CLUSTER_FLAG_FULL;
- }
-
return true;
}
-static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset,
- unsigned int *foundp, unsigned int order,
+/* Try use a new cluster for current CPU and allocate from it. */
+static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ unsigned long offset,
+ unsigned int order,
unsigned char usage)
{
- unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1);
+ unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
+ unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER);
unsigned long end = min(start + SWAPFILE_CLUSTER, si->max);
unsigned int nr_pages = 1 << order;
- struct swap_cluster_info *ci;
+ bool need_reclaim, ret;
- if (end < nr_pages)
- return SWAP_NEXT_INVALID;
- end -= nr_pages;
+ lockdep_assert_held(&ci->lock);
- ci = lock_cluster(si, offset);
- if (ci->count + nr_pages > SWAPFILE_CLUSTER) {
- offset = SWAP_NEXT_INVALID;
- goto done;
- }
+ if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER)
+ goto out;
- while (offset <= end) {
- if (cluster_scan_range(si, ci, offset, nr_pages)) {
- if (!cluster_alloc_range(si, ci, offset, usage, order)) {
- offset = SWAP_NEXT_INVALID;
- goto done;
- }
- *foundp = offset;
- if (ci->count == SWAPFILE_CLUSTER) {
- offset = SWAP_NEXT_INVALID;
- goto done;
- }
- offset += nr_pages;
- break;
+ for (end -= nr_pages; offset <= end; offset += nr_pages) {
+ need_reclaim = false;
+ if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim))
+ continue;
+ if (need_reclaim) {
+ ret = cluster_reclaim_range(si, ci, offset, offset + nr_pages);
+ /*
+ * Reclaim drops ci->lock and cluster could be used
+ * by another order. Not checking flag as off-list
+ * cluster has no flag set, and change of list
+ * won't cause fragmentation.
+ */
+ if (!cluster_is_usable(ci, order))
+ goto out;
+ if (cluster_is_empty(ci))
+ offset = start;
+ /* Reclaim failed but cluster is usable, try next */
+ if (!ret)
+ continue;
}
+ if (!cluster_alloc_range(si, ci, offset, usage, order))
+ break;
+ found = offset;
offset += nr_pages;
+ if (ci->count < SWAPFILE_CLUSTER && offset <= end)
+ next = offset;
+ break;
}
- if (offset > end)
- offset = SWAP_NEXT_INVALID;
-done:
+out:
+ relocate_cluster(si, ci);
unlock_cluster(ci);
- return offset;
+ if (si->flags & SWP_SOLIDSTATE) {
+ this_cpu_write(percpu_swap_cluster.offset[order], next);
+ this_cpu_write(percpu_swap_cluster.si[order], si);
+ } else {
+ si->global_cluster->next[order] = next;
+ }
+ return found;
}
-/* Return true if reclaimed a whole cluster */
static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
{
long to_scan = 1;
@@ -749,20 +834,19 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
int nr_reclaim;
if (force)
- to_scan = si->inuse_pages / SWAPFILE_CLUSTER;
+ to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
- while (!list_empty(&si->full_clusters)) {
- ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list);
- list_move_tail(&ci->list, &si->full_clusters);
+ while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
offset = cluster_offset(si, ci);
end = min(si->max, offset + SWAPFILE_CLUSTER);
to_scan--;
- spin_unlock(&si->lock);
while (offset < end) {
if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) {
+ spin_unlock(&ci->lock);
nr_reclaim = __try_to_reclaim_swap(si, offset,
- TTRS_ANYWAY | TTRS_DIRECT);
+ TTRS_ANYWAY);
+ spin_lock(&ci->lock);
if (nr_reclaim) {
offset += abs(nr_reclaim);
continue;
@@ -770,8 +854,12 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
}
offset++;
}
- spin_lock(&si->lock);
+ /* in case no swap cache is reclaimed */
+ if (ci->flags == CLUSTER_FLAG_NONE)
+ relocate_cluster(si, ci);
+
+ unlock_cluster(ci);
if (to_scan <= 0)
break;
}
@@ -783,42 +871,54 @@ static void swap_reclaim_work(struct work_struct *work)
si = container_of(work, struct swap_info_struct, reclaim_work);
- spin_lock(&si->lock);
swap_reclaim_full_clusters(si, true);
- spin_unlock(&si->lock);
}
/*
- * Try to get swap entries with specified order from current cpu's swap entry
- * pool (a cluster). This might involve allocating a new cluster for current CPU
- * too.
+ * Try to allocate swap entries with specified order and try set a new
+ * cluster for current CPU too.
*/
static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
unsigned char usage)
{
- struct percpu_cluster *cluster;
struct swap_cluster_info *ci;
- unsigned int offset, found = 0;
+ unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
-new_cluster:
- lockdep_assert_held(&si->lock);
- cluster = this_cpu_ptr(si->percpu_cluster);
- offset = cluster->next[order];
- if (offset) {
- offset = alloc_swap_scan_cluster(si, offset, &found, order, usage);
+ /*
+ * The swapfile is not a block device, so large
+ * entries cannot be allocated.
+ */
+ if (order && !(si->flags & SWP_BLKDEV))
+ return 0;
+
+ if (!(si->flags & SWP_SOLIDSTATE)) {
+ /* Serialize HDD SWAP allocation for each device. */
+ spin_lock(&si->global_cluster_lock);
+ offset = si->global_cluster->next[order];
+ if (offset == SWAP_ENTRY_INVALID)
+ goto new_cluster;
+
+ ci = lock_cluster(si, offset);
+ /* Cluster could have been used by another order */
+ if (cluster_is_usable(ci, order)) {
+ if (cluster_is_empty(ci))
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset,
+ order, usage);
+ } else {
+ unlock_cluster(ci);
+ }
if (found)
goto done;
}
- if (!list_empty(&si->free_clusters)) {
- ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage);
- /*
- * Either we didn't touch the cluster due to swapoff,
- * or the allocation must success.
- */
- VM_BUG_ON((si->flags & SWP_WRITEOK) && !found);
- goto done;
+new_cluster:
+ ci = isolate_lock_cluster(si, &si->free_clusters);
+ if (ci) {
+ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
+ order, usage);
+ if (found)
+ goto done;
}
/* Try reclaim from full clusters if free clusters list is drained */
@@ -826,56 +926,41 @@ new_cluster:
swap_reclaim_full_clusters(si, false);
if (order < PMD_ORDER) {
- unsigned int frags = 0;
-
- while (!list_empty(&si->nonfull_clusters[order])) {
- ci = list_first_entry(&si->nonfull_clusters[order],
- struct swap_cluster_info, list);
- list_move_tail(&ci->list, &si->frag_clusters[order]);
- ci->flags = CLUSTER_FLAG_FRAG;
- si->frag_cluster_nr[order]++;
- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
- &found, order, usage);
- frags++;
+ unsigned int frags = 0, frags_existing;
+
+ while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) {
+ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
+ order, usage);
if (found)
- break;
+ goto done;
+ /* Clusters we failed to allocate from are moved to frag_clusters */
+ frags++;
}
- if (!found) {
+ frags_existing = atomic_long_read(&si->frag_cluster_nr[order]);
+ while (frags < frags_existing &&
+ (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) {
+ atomic_long_dec(&si->frag_cluster_nr[order]);
/*
- * Nonfull clusters are moved to frag tail if we reached
- * here, count them too, don't over scan the frag list.
+ * Rotate the frag list to iterate, they were all
+ * failing high order allocation or moved here due to
+ * per-CPU usage, but they could contain newly released
+ * reclaimable (e.g. lazy-freed swap cache) slots.
*/
- while (frags < si->frag_cluster_nr[order]) {
- ci = list_first_entry(&si->frag_clusters[order],
- struct swap_cluster_info, list);
- /*
- * Rotate the frag list to iterate, they were all failing
- * high order allocation or moved here due to per-CPU usage,
- * this help keeping usable cluster ahead.
- */
- list_move_tail(&ci->list, &si->frag_clusters[order]);
- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
- &found, order, usage);
- frags++;
- if (found)
- break;
- }
+ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
+ order, usage);
+ if (found)
+ goto done;
+ frags++;
}
}
- if (found)
- goto done;
-
- if (!list_empty(&si->discard_clusters)) {
- /*
- * we don't have free cluster but have some clusters in
- * discarding, do discard now and reclaim them, then
- * reread cluster_next_cpu since we dropped si->lock
- */
- swap_do_scheduled_discard(si);
+ /*
+ * We don't have free cluster but have some clusters in discarding,
+ * do discard now and reclaim them.
+ */
+ if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si))
goto new_cluster;
- }
if (order)
goto done;
@@ -886,74 +971,150 @@ new_cluster:
* Clusters here have at least one usable slots and can't fail order 0
* allocation, but reclaim may drop si->lock and race with another user.
*/
- while (!list_empty(&si->frag_clusters[o])) {
- ci = list_first_entry(&si->frag_clusters[o],
- struct swap_cluster_info, list);
- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
- &found, 0, usage);
+ while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) {
+ atomic_long_dec(&si->frag_cluster_nr[o]);
+ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
+ 0, usage);
if (found)
goto done;
}
- while (!list_empty(&si->nonfull_clusters[o])) {
- ci = list_first_entry(&si->nonfull_clusters[o],
- struct swap_cluster_info, list);
- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
- &found, 0, usage);
+ while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) {
+ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
+ 0, usage);
if (found)
goto done;
}
}
-
done:
- cluster->next[order] = offset;
+ if (!(si->flags & SWP_SOLIDSTATE))
+ spin_unlock(&si->global_cluster_lock);
return found;
}
-static void __del_from_avail_list(struct swap_info_struct *si)
+/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
+static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
{
int nid;
+ unsigned long pages;
+
+ spin_lock(&swap_avail_lock);
+
+ if (swapoff) {
+ /*
+ * Forcefully remove it. Clear the SWP_WRITEOK flag for
+ * swapoff here so it's synchronized by both si->lock and
+ * swap_avail_lock, to ensure the result can be seen by
+ * add_to_avail_list.
+ */
+ lockdep_assert_held(&si->lock);
+ si->flags &= ~SWP_WRITEOK;
+ atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
+ } else {
+ /*
+ * If not called by swapoff, take it off-list only if it's
+ * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly
+ * si->inuse_pages == pages). Any concurrent slot freeing, or
+ * the device having already been removed from the plist by
+ * someone else, makes the cmpxchg fail and the removal is skipped.
+ */
+ pages = si->pages;
+ if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
+ pages | SWAP_USAGE_OFFLIST_BIT))
+ goto skip;
+ }
- assert_spin_locked(&si->lock);
for_each_node(nid)
plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
+
+skip:
+ spin_unlock(&swap_avail_lock);
}
-static void del_from_avail_list(struct swap_info_struct *si)
+/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
+static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
{
+ int nid;
+ long val;
+ unsigned long pages;
+
spin_lock(&swap_avail_lock);
- __del_from_avail_list(si);
+
+ /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
+ if (swapon) {
+ lockdep_assert_held(&si->lock);
+ si->flags |= SWP_WRITEOK;
+ } else {
+ if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
+ goto skip;
+ }
+
+ if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
+ goto skip;
+
+ val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
+
+ /*
+ * When the device is full and still on the plist, only one updater
+ * will see (inuse_pages == si->pages) and call del_from_avail_list.
+ * If that updater happens to be here, just skip adding.
+ */
+ pages = si->pages;
+ if (val == pages) {
+ /* Just like the cmpxchg in del_from_avail_list */
+ if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
+ pages | SWAP_USAGE_OFFLIST_BIT))
+ goto skip;
+ }
+
+ for_each_node(nid)
+ plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
+
+skip:
spin_unlock(&swap_avail_lock);
}
-static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
- unsigned int nr_entries)
+/*
+ * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock
+ * within each cluster, so the total contribution to the global counter should
+ * always be positive and cannot exceed the total number of usable slots.
+ */
+static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
{
- unsigned int end = offset + nr_entries - 1;
+ long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);
- if (offset == si->lowest_bit)
- si->lowest_bit += nr_entries;
- if (end == si->highest_bit)
- WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
- WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
- if (si->inuse_pages == si->pages) {
- si->lowest_bit = si->max;
- si->highest_bit = 0;
- del_from_avail_list(si);
-
- if (si->cluster_info && vm_swap_full())
- schedule_work(&si->reclaim_work);
+ /*
+ * If the device is full and SWAP_USAGE_OFFLIST_BIT is not set,
+ * remove it from the plist.
+ */
+ if (unlikely(val == si->pages)) {
+ del_from_avail_list(si, false);
+ return true;
}
+
+ return false;
}
-static void add_to_avail_list(struct swap_info_struct *si)
+static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
{
- int nid;
+ long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);
- spin_lock(&swap_avail_lock);
- for_each_node(nid)
- plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
- spin_unlock(&swap_avail_lock);
+ /*
+ * If the device is not full and SWAP_USAGE_OFFLIST_BIT is set,
+ * add it to the plist.
+ */
+ if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
+ add_to_avail_list(si, false);
+}
+
+static void swap_range_alloc(struct swap_info_struct *si,
+ unsigned int nr_entries)
+{
+ if (swap_usage_add(si, nr_entries)) {
+ if (vm_swap_full())
+ schedule_work(&si->reclaim_work);
+ }
+ atomic_long_sub(nr_entries, &nr_swap_pages);
}
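
To illustrate the counter protocol described above in isolation: inuse_pages packs the usage count and an off-list marker into a single atomic word, the filling side claims the marker with a cmpxchg exactly once, and the freeing side clears it exactly once. Below is a minimal userspace sketch of that protocol using C11 atomics; the bit position, the total, and the plist stubs are illustrative assumptions, not the kernel's definitions.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define OFFLIST_BIT (1UL << 63)           /* assumed top bit, standing in for SWAP_USAGE_OFFLIST_BIT */

static atomic_ulong inuse;                 /* models si->inuse_pages */
static const unsigned long total = 1024;   /* models si->pages */

static void plist_del_stub(void) { puts("device taken off the avail list"); }
static void plist_add_stub(void) { puts("device put back on the avail list"); }

/* models swap_usage_add(): take the device off-list exactly once when it fills up */
static bool usage_add(unsigned long nr)
{
	unsigned long val = atomic_fetch_add(&inuse, nr) + nr;

	if (val == total) {
		unsigned long expected = total;
		/* only the updater that sees a full, still-on-list counter wins */
		if (atomic_compare_exchange_strong(&inuse, &expected,
						   total | OFFLIST_BIT))
			plist_del_stub();
		return true;
	}
	return false;
}

/* models swap_usage_sub(): put the device back on-list once space is freed */
static void usage_sub(unsigned long nr)
{
	unsigned long val = atomic_fetch_sub(&inuse, nr) - nr;

	if (val & OFFLIST_BIT) {
		/* clear the bit; only the thread that actually clears it re-adds */
		unsigned long prev = atomic_fetch_and(&inuse, ~OFFLIST_BIT);
		if (prev & OFFLIST_BIT)
			plist_add_stub();
	}
}

int main(void)
{
	usage_add(total);        /* fills the device: goes off-list */
	usage_sub(16);           /* frees some slots: comes back on-list */
	printf("in use: %lu\n", atomic_load(&inuse) & ~OFFLIST_BIT);
	return 0;
}

The kernel version re-checks a couple of extra races (swapoff, a device that refilled immediately), but the claim/clear pairing is the same.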
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
@@ -968,18 +1129,11 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
* Use atomic clear_bit operations only on zeromap instead of non-atomic
* bitmap_clear to prevent adjacent bits corruption due to simultaneous writes.
*/
- for (i = 0; i < nr_entries; i++)
+ for (i = 0; i < nr_entries; i++) {
clear_bit(offset + i, si->zeromap);
-
- if (offset < si->lowest_bit)
- si->lowest_bit = offset;
- if (end > si->highest_bit) {
- bool was_full = !si->highest_bit;
-
- WRITE_ONCE(si->highest_bit, end);
- if (was_full && (si->flags & SWP_WRITEOK))
- add_to_avail_list(si);
+ zswap_invalidate(swp_entry(si->type, offset + i));
}
+
if (si->flags & SWP_BLKDEV)
swap_slot_free_notify =
si->bdev->bd_disk->fops->swap_slot_free_notify;
@@ -999,324 +1153,87 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
*/
smp_wmb();
atomic_long_add(nr_entries, &nr_swap_pages);
- WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
+ swap_usage_sub(si, nr_entries);
}
-static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
+static bool get_swap_device_info(struct swap_info_struct *si)
{
- unsigned long prev;
-
- if (!(si->flags & SWP_SOLIDSTATE)) {
- si->cluster_next = next;
- return;
- }
-
- prev = this_cpu_read(*si->cluster_next_cpu);
+ if (!percpu_ref_tryget_live(&si->users))
+ return false;
/*
- * Cross the swap address space size aligned trunk, choose
- * another trunk randomly to avoid lock contention on swap
- * address space if possible.
+ * Guarantee the si->users are checked before accessing other
+ * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is
+ * up to date.
+ *
+ * Paired with the spin_unlock() after setup_swap_info() in
+ * enable_swap_info(), and smp_wmb() in swapoff.
*/
- if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
- (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
- /* No free swap slots available */
- if (si->highest_bit <= si->lowest_bit)
- return;
- next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit);
- next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
- next = max_t(unsigned int, next, si->lowest_bit);
- }
- this_cpu_write(*si->cluster_next_cpu, next);
-}
-
-static bool swap_offset_available_and_locked(struct swap_info_struct *si,
- unsigned long offset)
-{
- if (data_race(!si->swap_map[offset])) {
- spin_lock(&si->lock);
- return true;
- }
-
- if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
- spin_lock(&si->lock);
- return true;
- }
-
- return false;
-}
-
-static int cluster_alloc_swap(struct swap_info_struct *si,
- unsigned char usage, int nr,
- swp_entry_t slots[], int order)
-{
- int n_ret = 0;
-
- VM_BUG_ON(!si->cluster_info);
-
- si->flags += SWP_SCANNING;
-
- while (n_ret < nr) {
- unsigned long offset = cluster_alloc_swap_entry(si, order, usage);
-
- if (!offset)
- break;
- slots[n_ret++] = swp_entry(si->type, offset);
- }
-
- si->flags -= SWP_SCANNING;
-
- return n_ret;
+ smp_rmb();
+ return true;
}
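
get_swap_device_info() relies on the classic publish/consume pairing: the writer initialises the fields and then publishes liveness, the reader checks liveness and then issues a read barrier before touching the fields. A small userspace analogue of that pattern, using C11 release/acquire in place of the kernel barriers (the structure and field names are illustrative):

#include <stdatomic.h>
#include <pthread.h>
#include <assert.h>
#include <stdio.h>

struct dev {
	int flags;                /* written before publishing, like si->flags */
	atomic_bool live;         /* models the "users are live" check */
};

static struct dev d;

static void *publisher(void *arg)
{
	(void)arg;
	d.flags = 0x1;                                   /* set up the fields first */
	atomic_store_explicit(&d.live, true,
			      memory_order_release);     /* then publish (spin_unlock/smp_wmb analogue) */
	return NULL;
}

static void *consumer(void *arg)
{
	(void)arg;
	while (!atomic_load_explicit(&d.live, memory_order_acquire))
		;                                        /* liveness check + smp_rmb analogue */
	assert(d.flags == 0x1);                          /* fields are guaranteed visible */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, publisher, NULL);
	pthread_create(&b, NULL, consumer, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	puts("consumer saw initialised flags");
	return 0;
}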
-static int scan_swap_map_slots(struct swap_info_struct *si,
- unsigned char usage, int nr,
- swp_entry_t slots[], int order)
+/*
+ * Fast path: try to get swap entries of the specified order from the
+ * current CPU's swap entry pool (a cluster).
+ */
+static bool swap_alloc_fast(swp_entry_t *entry,
+ int order)
{
- unsigned long offset;
- unsigned long scan_base;
- unsigned long last_in_cluster = 0;
- int latency_ration = LATENCY_LIMIT;
- unsigned int nr_pages = 1 << order;
- int n_ret = 0;
- bool scanned_many = false;
-
- /*
- * We try to cluster swap pages by allocating them sequentially
- * in swap. Once we've allocated SWAPFILE_CLUSTER pages this
- * way, however, we resort to first-free allocation, starting
- * a new cluster. This prevents us from scattering swap pages
- * all over the entire swap partition, so that we reduce
- * overall disk seek times between swap pages. -- sct
- * But we do now try to find an empty cluster. -Andrea
- * And we let swap pages go all over an SSD partition. Hugh
- */
-
- if (order > 0) {
- /*
- * Should not even be attempting large allocations when huge
- * page swap is disabled. Warn and fail the allocation.
- */
- if (!IS_ENABLED(CONFIG_THP_SWAP) ||
- nr_pages > SWAPFILE_CLUSTER) {
- VM_WARN_ON_ONCE(1);
- return 0;
- }
-
- /*
- * Swapfile is not block device or not using clusters so unable
- * to allocate large entries.
- */
- if (!(si->flags & SWP_BLKDEV) || !si->cluster_info)
- return 0;
- }
-
- if (si->cluster_info)
- return cluster_alloc_swap(si, usage, nr, slots, order);
-
- si->flags += SWP_SCANNING;
-
- /* For HDD, sequential access is more important. */
- scan_base = si->cluster_next;
- offset = scan_base;
-
- if (unlikely(!si->cluster_nr--)) {
- if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
- si->cluster_nr = SWAPFILE_CLUSTER - 1;
- goto checks;
- }
-
- spin_unlock(&si->lock);
-
- /*
- * If seek is expensive, start searching for new cluster from
- * start of partition, to minimize the span of allocated swap.
- */
- scan_base = offset = si->lowest_bit;
- last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
-
- /* Locate the first empty (unaligned) cluster */
- for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) {
- if (si->swap_map[offset])
- last_in_cluster = offset + SWAPFILE_CLUSTER;
- else if (offset == last_in_cluster) {
- spin_lock(&si->lock);
- offset -= SWAPFILE_CLUSTER - 1;
- si->cluster_next = offset;
- si->cluster_nr = SWAPFILE_CLUSTER - 1;
- goto checks;
- }
- if (unlikely(--latency_ration < 0)) {
- cond_resched();
- latency_ration = LATENCY_LIMIT;
- }
- }
-
- offset = scan_base;
- spin_lock(&si->lock);
- si->cluster_nr = SWAPFILE_CLUSTER - 1;
- }
-
-checks:
- if (!(si->flags & SWP_WRITEOK))
- goto no_page;
- if (!si->highest_bit)
- goto no_page;
- if (offset > si->highest_bit)
- scan_base = offset = si->lowest_bit;
-
- /* reuse swap entry of cache-only swap if not busy. */
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
- int swap_was_freed;
- spin_unlock(&si->lock);
- swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT);
- spin_lock(&si->lock);
- /* entry was freed successfully, try to use this again */
- if (swap_was_freed > 0)
- goto checks;
- goto scan; /* check next one */
- }
-
- if (si->swap_map[offset]) {
- if (!n_ret)
- goto scan;
- else
- goto done;
- }
- memset(si->swap_map + offset, usage, nr_pages);
-
- swap_range_alloc(si, offset, nr_pages);
- slots[n_ret++] = swp_entry(si->type, offset);
-
- /* got enough slots or reach max slots? */
- if ((n_ret == nr) || (offset >= si->highest_bit))
- goto done;
-
- /* search for next available slot */
-
- /* time to take a break? */
- if (unlikely(--latency_ration < 0)) {
- if (n_ret)
- goto done;
- spin_unlock(&si->lock);
- cond_resched();
- spin_lock(&si->lock);
- latency_ration = LATENCY_LIMIT;
- }
-
- if (si->cluster_nr && !si->swap_map[++offset]) {
- /* non-ssd case, still more slots in cluster? */
- --si->cluster_nr;
- goto checks;
- }
+ struct swap_cluster_info *ci;
+ struct swap_info_struct *si;
+ unsigned int offset, found = SWAP_ENTRY_INVALID;
/*
- * Even if there's no free clusters available (fragmented),
- * try to scan a little more quickly with lock held unless we
- * have scanned too many slots already.
+ * Once allocated, swap_info_struct will never be completely freed,
+ * so checking its liveness with get_swap_device_info() is enough.
*/
- if (!scanned_many) {
- unsigned long scan_limit;
-
- if (offset < scan_base)
- scan_limit = scan_base;
- else
- scan_limit = si->highest_bit;
- for (; offset <= scan_limit && --latency_ration > 0;
- offset++) {
- if (!si->swap_map[offset])
- goto checks;
- }
- }
-
-done:
- if (order == 0)
- set_cluster_next(si, offset + 1);
- si->flags -= SWP_SCANNING;
- return n_ret;
+ si = this_cpu_read(percpu_swap_cluster.si[order]);
+ offset = this_cpu_read(percpu_swap_cluster.offset[order]);
+ if (!si || !offset || !get_swap_device_info(si))
+ return false;
-scan:
- VM_WARN_ON(order > 0);
- spin_unlock(&si->lock);
- while (++offset <= READ_ONCE(si->highest_bit)) {
- if (unlikely(--latency_ration < 0)) {
- cond_resched();
- latency_ration = LATENCY_LIMIT;
- scanned_many = true;
- }
- if (swap_offset_available_and_locked(si, offset))
- goto checks;
- }
- offset = si->lowest_bit;
- while (offset < scan_base) {
- if (unlikely(--latency_ration < 0)) {
- cond_resched();
- latency_ration = LATENCY_LIMIT;
- scanned_many = true;
- }
- if (swap_offset_available_and_locked(si, offset))
- goto checks;
- offset++;
+ ci = lock_cluster(si, offset);
+ if (cluster_is_usable(ci, order)) {
+ if (cluster_is_empty(ci))
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset, order, SWAP_HAS_CACHE);
+ if (found)
+ *entry = swp_entry(si->type, found);
+ } else {
+ unlock_cluster(ci);
}
- spin_lock(&si->lock);
-no_page:
- si->flags -= SWP_SCANNING;
- return n_ret;
+ put_swap_device(si);
+ return !!found;
}
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
+/* Rotate the device and switch to a new cluster */
+static bool swap_alloc_slow(swp_entry_t *entry,
+ int order)
{
- int order = swap_entry_order(entry_order);
- unsigned long size = 1 << order;
- struct swap_info_struct *si, *next;
- long avail_pgs;
- int n_ret = 0;
int node;
+ unsigned long offset;
+ struct swap_info_struct *si, *next;
+ node = numa_node_id();
spin_lock(&swap_avail_lock);
-
- avail_pgs = atomic_long_read(&nr_swap_pages) / size;
- if (avail_pgs <= 0) {
- spin_unlock(&swap_avail_lock);
- goto noswap;
- }
-
- n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
-
- atomic_long_sub(n_goal * size, &nr_swap_pages);
-
start_over:
- node = numa_node_id();
plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
- /* requeue si to after same-priority siblings */
+ /* Rotate the device and switch to a new cluster */
plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock);
- spin_lock(&si->lock);
- if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
- spin_lock(&swap_avail_lock);
- if (plist_node_empty(&si->avail_lists[node])) {
- spin_unlock(&si->lock);
- goto nextsi;
+ if (get_swap_device_info(si)) {
+ offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
+ put_swap_device(si);
+ if (offset) {
+ *entry = swp_entry(si->type, offset);
+ return true;
}
- WARN(!si->highest_bit,
- "swap_info %d in list but !highest_bit\n",
- si->type);
- WARN(!(si->flags & SWP_WRITEOK),
- "swap_info %d in list but !SWP_WRITEOK\n",
- si->type);
- __del_from_avail_list(si);
- spin_unlock(&si->lock);
- goto nextsi;
+ if (order)
+ return false;
}
- n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
- n_goal, swp_entries, order);
- spin_unlock(&si->lock);
- if (n_ret || size > 1)
- goto check_out;
- cond_resched();
spin_lock(&swap_avail_lock);
-nextsi:
/*
* if we got here, it's likely that si was almost full before,
* and since scan_swap_map_slots() can drop the si->lock,
@@ -1331,15 +1248,76 @@ nextsi:
if (plist_node_empty(&next->avail_lists[node]))
goto start_over;
}
-
spin_unlock(&swap_avail_lock);
+ return false;
+}
+
+/**
+ * folio_alloc_swap - allocate swap space for a folio
+ * @folio: folio we want to move to swap
+ * @gfp: gfp mask for shadow nodes
+ *
+ * Allocate swap space for the folio and add the folio to the
+ * swap cache.
+ *
+ * Context: Caller needs to hold the folio lock.
+ * Return: Whether the folio was added to the swap cache.
+ */
+int folio_alloc_swap(struct folio *folio, gfp_t gfp)
+{
+ unsigned int order = folio_order(folio);
+ unsigned int size = 1 << order;
+ swp_entry_t entry = {};
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
+
+ if (order) {
+ /*
+ * Reject large allocation when THP_SWAP is disabled,
+ * the caller should split the folio and try again.
+ */
+ if (!IS_ENABLED(CONFIG_THP_SWAP))
+ return -EAGAIN;
+
+ /*
+ * Allocation size should never exceed cluster size
+ * (HPAGE_PMD_SIZE).
+ */
+ if (size > SWAPFILE_CLUSTER) {
+ VM_WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+ }
-check_out:
- if (n_ret < n_goal)
- atomic_long_add((long)(n_goal - n_ret) * size,
- &nr_swap_pages);
-noswap:
- return n_ret;
+ local_lock(&percpu_swap_cluster.lock);
+ if (!swap_alloc_fast(&entry, order))
+ swap_alloc_slow(&entry, order);
+ local_unlock(&percpu_swap_cluster.lock);
+
+ /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
+ if (mem_cgroup_try_charge_swap(folio, entry))
+ goto out_free;
+
+ if (!entry.val)
+ return -ENOMEM;
+
+ /*
+ * XArray node allocations from PF_MEMALLOC contexts could
+ * completely exhaust the page allocator. __GFP_NOMEMALLOC
+ * stops emergency reserves from being allocated.
+ *
+ * TODO: this could cause a theoretical memory reclaim
+ * deadlock in the swap out path.
+ */
+ if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL))
+ goto out_free;
+
+ return 0;
+
+out_free:
+ put_swap_folio(folio, entry);
+ return -ENOMEM;
}
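
folio_alloc_swap() now takes a two-tier approach under the local lock: the per-CPU cached cluster first, then the device plist walk. A compact sketch of that fast/slow split; the cache layout and helper names below are illustrative, not the kernel API:

#include <stdbool.h>
#include <stdio.h>

struct cache { long cached_slot; };         /* stands in for percpu_swap_cluster */

static struct cache percpu_cache = { .cached_slot = -1 };
static long next_free = 100;                /* stands in for the device scan state */

/* fast path: reuse the slot cached by the previous allocation on this CPU */
static bool alloc_fast(long *out)
{
	if (percpu_cache.cached_slot < 0)
		return false;
	*out = percpu_cache.cached_slot;
	percpu_cache.cached_slot = -1;
	return true;
}

/* slow path: scan the device and refill the per-CPU cache for the next caller */
static bool alloc_slow(long *out)
{
	*out = next_free++;
	percpu_cache.cached_slot = next_free++;   /* leave the following slot cached */
	return true;
}

static bool alloc_entry(long *out)
{
	/* the kernel does this under local_lock(&percpu_swap_cluster.lock) */
	if (alloc_fast(out))
		return true;
	return alloc_slow(out);
}

int main(void)
{
	long e;

	for (int i = 0; i < 4; i++)
		if (alloc_entry(&e))
			printf("allocated slot %ld\n", e);
	return 0;
}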
static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
@@ -1376,26 +1354,12 @@ out:
return NULL;
}
-static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
- struct swap_info_struct *q)
-{
- struct swap_info_struct *p;
-
- p = _swap_info_get(entry);
-
- if (p != q) {
- if (q != NULL)
- spin_unlock(&q->lock);
- if (p != NULL)
- spin_lock(&p->lock);
- }
- return p;
-}
-
-static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
- unsigned long offset,
- unsigned char usage)
+static unsigned char swap_entry_put_locked(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry,
+ unsigned char usage)
{
+ unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
@@ -1427,7 +1391,7 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
if (usage)
WRITE_ONCE(si->swap_map[offset], usage);
else
- WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE);
+ swap_entries_free(si, ci, entry, 1);
return usage;
}
@@ -1481,16 +1445,8 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
si = swp_swap_info(entry);
if (!si)
goto bad_nofile;
- if (!percpu_ref_tryget_live(&si->users))
+ if (!get_swap_device_info(si))
goto out;
- /*
- * Guarantee the si->users are checked before accessing other
- * fields of swap_info_struct.
- *
- * Paired with the spin_unlock() after setup_swap_info() in
- * enable_swap_info().
- */
- smp_rmb();
offset = swp_offset(entry);
if (offset >= si->max)
goto put_out;
@@ -1506,121 +1462,127 @@ put_out:
return NULL;
}
-static unsigned char __swap_entry_free(struct swap_info_struct *si,
- swp_entry_t entry)
+static void swap_entries_put_cache(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
{
- struct swap_cluster_info *ci;
unsigned long offset = swp_offset(entry);
- unsigned char usage;
-
- ci = lock_cluster_or_swap_info(si, offset);
- usage = __swap_entry_free_locked(si, offset, 1);
- unlock_cluster_or_swap_info(si, ci);
- if (!usage)
- free_swap_slot(entry);
+ struct swap_cluster_info *ci;
- return usage;
+ ci = lock_cluster(si, offset);
+ if (swap_only_has_cache(si, offset, nr))
+ swap_entries_free(si, ci, entry, nr);
+ else {
+ for (int i = 0; i < nr; i++, entry.val++)
+ swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
+ }
+ unlock_cluster(ci);
}
-static bool __swap_entries_free(struct swap_info_struct *si,
- swp_entry_t entry, int nr)
+static bool swap_entries_put_map(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
{
unsigned long offset = swp_offset(entry);
- unsigned int type = swp_type(entry);
struct swap_cluster_info *ci;
bool has_cache = false;
unsigned char count;
int i;
- if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1)
+ if (nr <= 1)
goto fallback;
- /* cross into another cluster */
- if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
+ count = swap_count(data_race(si->swap_map[offset]));
+ if (count != 1 && count != SWAP_MAP_SHMEM)
goto fallback;
- ci = lock_cluster_or_swap_info(si, offset);
+ ci = lock_cluster(si, offset);
if (!swap_is_last_map(si, offset, nr, &has_cache)) {
- unlock_cluster_or_swap_info(si, ci);
- goto fallback;
+ goto locked_fallback;
}
- for (i = 0; i < nr; i++)
- WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
- unlock_cluster_or_swap_info(si, ci);
-
- if (!has_cache) {
+ if (!has_cache)
+ swap_entries_free(si, ci, entry, nr);
+ else
for (i = 0; i < nr; i++)
- zswap_invalidate(swp_entry(si->type, offset + i));
- spin_lock(&si->lock);
- swap_entry_range_free(si, entry, nr);
- spin_unlock(&si->lock);
- }
+ WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
+ unlock_cluster(ci);
+
return has_cache;
fallback:
- for (i = 0; i < nr; i++) {
- if (data_race(si->swap_map[offset + i])) {
- count = __swap_entry_free(si, swp_entry(type, offset + i));
- if (count == SWAP_HAS_CACHE)
- has_cache = true;
- } else {
- WARN_ON_ONCE(1);
- }
+ ci = lock_cluster(si, offset);
+locked_fallback:
+ for (i = 0; i < nr; i++, entry.val++) {
+ count = swap_entry_put_locked(si, ci, entry, 1);
+ if (count == SWAP_HAS_CACHE)
+ has_cache = true;
}
+ unlock_cluster(ci);
return has_cache;
+
}
/*
- * Drop the last HAS_CACHE flag of swap entries, caller have to
- * ensure all entries belong to the same cgroup.
+ * Only functions with the "_nr" suffix can free entries that span
+ * multiple clusters; when freeing entries with the non-"_nr" variants,
+ * make sure the range stays within a single cluster.
*/
-static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
- unsigned int nr_pages)
+static bool swap_entries_put_map_nr(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
+{
+ int cluster_nr, cluster_rest;
+ unsigned long offset = swp_offset(entry);
+ bool has_cache = false;
+
+ cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER;
+ while (nr) {
+ cluster_nr = min(nr, cluster_rest);
+ has_cache |= swap_entries_put_map(si, entry, cluster_nr);
+ cluster_rest = SWAPFILE_CLUSTER;
+ nr -= cluster_nr;
+ entry.val += cluster_nr;
+ }
+
+ return has_cache;
+}
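
swap_entries_put_map_nr() walks an arbitrary range in per-cluster chunks: the first chunk is capped by the distance to the next cluster boundary, every later chunk by the cluster size. A standalone illustration of that chunking arithmetic (cluster size chosen arbitrarily):

#include <stdio.h>

#define CLUSTER_SIZE 512   /* illustrative stand-in for SWAPFILE_CLUSTER */

static void put_range(unsigned long offset, int nr)
{
	int chunk, rest = CLUSTER_SIZE - offset % CLUSTER_SIZE;

	while (nr) {
		chunk = nr < rest ? nr : rest;   /* min(nr, cluster_rest) */
		printf("free [%lu, %lu) within cluster %lu\n",
		       offset, offset + chunk, offset / CLUSTER_SIZE);
		rest = CLUSTER_SIZE;             /* later chunks start on a boundary */
		nr -= chunk;
		offset += chunk;
	}
}

int main(void)
{
	/* a range that starts mid-cluster and crosses several cluster boundaries */
	put_range(500, 1100);
	return 0;
}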
+
+/*
+ * Check if it's the last ref of a swap entry in the freeing path.
+ * Qualifying values are 1, SWAP_HAS_CACHE and SWAP_MAP_SHMEM.
+ */
+static inline bool __maybe_unused swap_is_last_ref(unsigned char count)
+{
+ return (count == SWAP_HAS_CACHE) || (count == 1) ||
+ (count == SWAP_MAP_SHMEM);
+}
+
+/*
+ * Drop the last ref of swap entries; the caller has to ensure all
+ * entries belong to the same cgroup and cluster.
+ */
+static void swap_entries_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr_pages)
{
unsigned long offset = swp_offset(entry);
unsigned char *map = si->swap_map + offset;
unsigned char *map_end = map + nr_pages;
- struct swap_cluster_info *ci;
- ci = lock_cluster(si, offset);
+ /* It should never free entries across different clusters */
+ VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1));
+ VM_BUG_ON(cluster_is_empty(ci));
+ VM_BUG_ON(ci->count < nr_pages);
+
+ ci->count -= nr_pages;
do {
- VM_BUG_ON(*map != SWAP_HAS_CACHE);
+ VM_BUG_ON(!swap_is_last_ref(*map));
*map = 0;
} while (++map < map_end);
- dec_cluster_info_page(si, ci, nr_pages);
- unlock_cluster(ci);
mem_cgroup_uncharge_swap(entry, nr_pages);
swap_range_free(si, offset, nr_pages);
-}
-
-static void cluster_swap_free_nr(struct swap_info_struct *si,
- unsigned long offset, int nr_pages,
- unsigned char usage)
-{
- struct swap_cluster_info *ci;
- DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 };
- int i, nr;
- ci = lock_cluster_or_swap_info(si, offset);
- while (nr_pages) {
- nr = min(BITS_PER_LONG, nr_pages);
- for (i = 0; i < nr; i++) {
- if (!__swap_entry_free_locked(si, offset + i, usage))
- bitmap_set(to_free, i, 1);
- }
- if (!bitmap_empty(to_free, BITS_PER_LONG)) {
- unlock_cluster_or_swap_info(si, ci);
- for_each_set_bit(i, to_free, BITS_PER_LONG)
- free_swap_slot(swp_entry(si->type, offset + i));
- if (nr == nr_pages)
- return;
- bitmap_clear(to_free, 0, BITS_PER_LONG);
- ci = lock_cluster_or_swap_info(si, offset);
- }
- offset += nr;
- nr_pages -= nr;
- }
- unlock_cluster_or_swap_info(si, ci);
+ if (!ci->count)
+ free_cluster(si, ci);
+ else
+ partial_free_cluster(si, ci);
}
/*
@@ -1639,7 +1601,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
while (nr_pages) {
nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
- cluster_swap_free_nr(sis, offset, nr, 1);
+ swap_entries_put_map(sis, swp_entry(sis->type, offset), nr);
offset += nr;
nr_pages -= nr;
}
@@ -1650,8 +1612,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
*/
void put_swap_folio(struct folio *folio, swp_entry_t entry)
{
- unsigned long offset = swp_offset(entry);
- struct swap_cluster_info *ci;
struct swap_info_struct *si;
int size = 1 << swap_entry_order(folio_order(folio));
@@ -1659,59 +1619,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
if (!si)
return;
- ci = lock_cluster_or_swap_info(si, offset);
- if (size > 1 && swap_is_has_cache(si, offset, size)) {
- unlock_cluster_or_swap_info(si, ci);
- spin_lock(&si->lock);
- swap_entry_range_free(si, entry, size);
- spin_unlock(&si->lock);
- return;
- }
- for (int i = 0; i < size; i++, entry.val++) {
- if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
- unlock_cluster_or_swap_info(si, ci);
- free_swap_slot(entry);
- if (i == size - 1)
- return;
- lock_cluster_or_swap_info(si, offset);
- }
- }
- unlock_cluster_or_swap_info(si, ci);
-}
-
-static int swp_entry_cmp(const void *ent1, const void *ent2)
-{
- const swp_entry_t *e1 = ent1, *e2 = ent2;
-
- return (int)swp_type(*e1) - (int)swp_type(*e2);
-}
-
-void swapcache_free_entries(swp_entry_t *entries, int n)
-{
- struct swap_info_struct *p, *prev;
- int i;
-
- if (n <= 0)
- return;
-
- prev = NULL;
- p = NULL;
-
- /*
- * Sort swap entries by swap device, so each lock is only taken once.
- * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
- * so low that it isn't necessary to optimize further.
- */
- if (nr_swapfiles > 1)
- sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
- for (i = 0; i < n; ++i) {
- p = swap_info_get_cont(entries[i], prev);
- if (p)
- swap_entry_range_free(p, entries[i], 1);
- prev = p;
- }
- if (p)
- spin_unlock(&p->lock);
+ swap_entries_put_cache(si, entry, size);
}
int __swap_count(swp_entry_t entry)
@@ -1727,16 +1635,16 @@ int __swap_count(swp_entry_t entry)
* This does not give an exact answer when swap count is continued,
* but does include the high COUNT_CONTINUED flag to allow for that.
*/
-int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
+bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
{
pgoff_t offset = swp_offset(entry);
struct swap_cluster_info *ci;
int count;
- ci = lock_cluster_or_swap_info(si, offset);
+ ci = lock_cluster(si, offset);
count = swap_count(si->swap_map[offset]);
- unlock_cluster_or_swap_info(si, ci);
- return count;
+ unlock_cluster(ci);
+ return !!count;
}
/*
@@ -1758,7 +1666,7 @@ int swp_swapcount(swp_entry_t entry)
offset = swp_offset(entry);
- ci = lock_cluster_or_swap_info(si, offset);
+ ci = lock_cluster(si, offset);
count = swap_count(si->swap_map[offset]);
if (!(count & COUNT_CONTINUED))
@@ -1781,7 +1689,7 @@ int swp_swapcount(swp_entry_t entry)
n *= (SWAP_CONT_MAX + 1);
} while (tmp_count & COUNT_CONTINUED);
out:
- unlock_cluster_or_swap_info(si, ci);
+ unlock_cluster(ci);
return count;
}
@@ -1796,8 +1704,8 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
int i;
bool ret = false;
- ci = lock_cluster_or_swap_info(si, offset);
- if (!ci || nr_pages == 1) {
+ ci = lock_cluster(si, offset);
+ if (nr_pages == 1) {
if (swap_count(map[roffset]))
ret = true;
goto unlock_out;
@@ -1809,7 +1717,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
}
}
unlock_out:
- unlock_cluster_or_swap_info(si, ci);
+ unlock_cluster(ci);
return ret;
}
@@ -1822,7 +1730,7 @@ static bool folio_swapped(struct folio *folio)
return false;
if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
- return swap_swapcount(si, entry) != 0;
+ return swap_entry_swapped(si, entry);
return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
}
@@ -1896,9 +1804,6 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
bool any_only_cache = false;
unsigned long offset;
- if (non_swap_entry(entry))
- return;
-
si = get_swap_device(entry);
if (!si)
return;
@@ -1909,7 +1814,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
/*
* First free all entries in the range.
*/
- any_only_cache = __swap_entries_free(si, entry, nr);
+ any_only_cache = swap_entries_put_map_nr(si, entry, nr);
/*
* Short-circuit the below loop if none of the entries had their
@@ -1919,13 +1824,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
goto out;
/*
- * Now go back over the range trying to reclaim the swap cache. This is
- * more efficient for large folios because we will only try to reclaim
- * the swap once per folio in the common case. If we do
- * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
- * latter will get a reference and lock the folio for every individual
- * page but will only succeed once the swap slot for every subpage is
- * zero.
+ * Now go back over the range trying to reclaim the swap cache.
*/
for (offset = start_offset; offset < end_offset; offset += nr) {
nr = 1;
@@ -1957,16 +1856,21 @@ out:
swp_entry_t get_swap_page_of_type(int type)
{
struct swap_info_struct *si = swap_type_to_swap_info(type);
+ unsigned long offset;
swp_entry_t entry = {0};
if (!si)
goto fail;
/* This is called for allocating swap entry, not cache */
- spin_lock(&si->lock);
- if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
- atomic_long_dec(&nr_swap_pages);
- spin_unlock(&si->lock);
+ if (get_swap_device_info(si)) {
+ if (si->flags & SWP_WRITEOK) {
+ offset = cluster_alloc_swap_entry(si, 0, 1);
+ if (offset)
+ entry = swp_entry(si->type, offset);
+ }
+ put_swap_device(si);
+ }
fail:
return entry;
}
@@ -2057,7 +1961,7 @@ unsigned int count_swap_pages(int type, int free)
if (sis->flags & SWP_WRITEOK) {
n = sis->pages;
if (free)
- n -= sis->inuse_pages;
+ n -= swap_usage_in_pages(sis);
}
spin_unlock(&sis->lock);
}
@@ -2337,6 +2241,8 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type)
VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
+ if (check_stable_address_space(mm))
+ goto unlock;
for_each_vma(vmi, vma) {
if (vma->anon_vma && !is_vm_hugetlb_page(vma)) {
ret = unuse_vma(vma, type);
@@ -2346,6 +2252,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type)
cond_resched();
}
+unlock:
mmap_read_unlock(mm);
return ret;
}
@@ -2392,7 +2299,7 @@ static int try_to_unuse(unsigned int type)
swp_entry_t entry;
unsigned int i;
- if (!READ_ONCE(si->inuse_pages))
+ if (!swap_usage_in_pages(si))
goto success;
retry:
@@ -2405,7 +2312,7 @@ retry:
spin_lock(&mmlist_lock);
p = &init_mm.mmlist;
- while (READ_ONCE(si->inuse_pages) &&
+ while (swap_usage_in_pages(si) &&
!signal_pending(current) &&
(p = p->next) != &init_mm.mmlist) {
@@ -2433,7 +2340,7 @@ retry:
mmput(prev_mm);
i = 0;
- while (READ_ONCE(si->inuse_pages) &&
+ while (swap_usage_in_pages(si) &&
!signal_pending(current) &&
(i = find_next_to_unuse(si, i)) != 0) {
@@ -2464,11 +2371,11 @@ retry:
* Limit the number of retries? No: when mmget_not_zero()
* above fails, that mm is likely to be freeing swap from
* exit_mmap(), which proceeds at its own independent pace;
- * and even shmem_writepage() could have been preempted after
+ * and even shmem_writeout() could have been preempted after
* folio_alloc_swap(), temporarily hiding that swap. It's easy
* and robust (though cpu-intensive) just to keep retrying.
*/
- if (READ_ONCE(si->inuse_pages)) {
+ if (swap_usage_in_pages(si)) {
if (!signal_pending(current))
goto retry;
return -EINTR;
@@ -2495,7 +2402,7 @@ static void drain_mmlist(void)
unsigned int type;
for (type = 0; type < nr_swapfiles; type++)
- if (swap_info[type]->inuse_pages)
+ if (swap_usage_in_pages(swap_info[type]))
return;
spin_lock(&mmlist_lock);
list_for_each_safe(p, next, &init_mm.mmlist)
@@ -2674,7 +2581,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio,
static void _enable_swap_info(struct swap_info_struct *si)
{
- si->flags |= SWP_WRITEOK;
atomic_long_add(si->pages, &nr_swap_pages);
total_swap_pages += si->pages;
@@ -2691,9 +2597,8 @@ static void _enable_swap_info(struct swap_info_struct *si)
*/
plist_add(&si->list, &swap_active_head);
- /* add to available list iff swap device is not full */
- if (si->highest_bit)
- add_to_avail_list(si);
+ /* Add back to available list */
+ add_to_avail_list(si, true);
}
static void enable_swap_info(struct swap_info_struct *si, int prio,
@@ -2727,21 +2632,46 @@ static void reinsert_swap_info(struct swap_info_struct *si)
spin_unlock(&swap_lock);
}
-static bool __has_usable_swap(void)
+/*
+ * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range
+ * see the updated flags, so there will be no more allocations.
+ */
+static void wait_for_allocation(struct swap_info_struct *si)
{
- return !plist_head_empty(&swap_active_head);
+ unsigned long offset;
+ unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER);
+ struct swap_cluster_info *ci;
+
+ BUG_ON(si->flags & SWP_WRITEOK);
+
+ for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
+ ci = lock_cluster(si, offset);
+ unlock_cluster(ci);
+ }
}
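
wait_for_allocation() drains in-flight allocators by taking and releasing every cluster lock once after SWP_WRITEOK has been cleared: anyone who saw the old flag must have left its critical section by the time its lock has been cycled. The same drain pattern as a small pthread sketch (the shard count and names are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NR_SHARDS 8

static pthread_mutex_t shard_lock[NR_SHARDS];
static atomic_bool writeok = true;           /* models SWP_WRITEOK */

/* an allocator works on one shard while holding that shard's lock */
static void *allocator(void *arg)
{
	int shard = (int)(long)arg;

	pthread_mutex_lock(&shard_lock[shard]);
	if (atomic_load(&writeok))
		printf("allocating from shard %d\n", shard);
	pthread_mutex_unlock(&shard_lock[shard]);
	return NULL;
}

/*
 * Models wait_for_allocation(): after clearing the flag, cycling every
 * shard lock guarantees no allocator that saw the old flag is still
 * inside its critical section.
 */
static void wait_for_allocators(void)
{
	atomic_store(&writeok, false);
	for (int i = 0; i < NR_SHARDS; i++) {
		pthread_mutex_lock(&shard_lock[i]);
		pthread_mutex_unlock(&shard_lock[i]);
	}
}

int main(void)
{
	pthread_t t[NR_SHARDS];

	for (int i = 0; i < NR_SHARDS; i++)
		pthread_mutex_init(&shard_lock[i], NULL);
	for (int i = 0; i < NR_SHARDS; i++)
		pthread_create(&t[i], NULL, allocator, (void *)(long)i);

	wait_for_allocators();
	puts("no allocator can still be relying on the old flag");

	for (int i = 0; i < NR_SHARDS; i++)
		pthread_join(t[i], NULL);
	return 0;
}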
-bool has_usable_swap(void)
+/*
+ * Called after the swap device's reference count is dead, so
+ * neither scanning nor allocation will use it.
+ */
+static void flush_percpu_swap_cluster(struct swap_info_struct *si)
{
- bool ret;
+ int cpu, i;
+ struct swap_info_struct **pcp_si;
- spin_lock(&swap_lock);
- ret = __has_usable_swap();
- spin_unlock(&swap_lock);
- return ret;
+ for_each_possible_cpu(cpu) {
+ pcp_si = per_cpu_ptr(percpu_swap_cluster.si, cpu);
+ /*
+ * Invalidate the percpu swap cluster cache; si->users
+ * is dead, so no new user will point to it. Just flush
+ * any existing users.
+ */
+ for (i = 0; i < SWAP_NR_ORDERS; i++)
+ cmpxchg(&pcp_si[i], si, NULL);
+ }
}
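
flush_percpu_swap_cluster() clears only the per-CPU cache slots that still point at the dying device, leaving slots that already point elsewhere untouched. The compare-and-swap pattern in isolation (the structures below are made up for the example):

#include <stdatomic.h>
#include <stdio.h>

struct device { const char *name; };

#define NR_CPUS 4

/* per-CPU cached device pointers, like percpu_swap_cluster.si[] */
static _Atomic(struct device *) cached[NR_CPUS];

static void flush_cache(struct device *dying)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		struct device *expected = dying;

		/* clear the slot only if it still points at the dying device */
		atomic_compare_exchange_strong(&cached[cpu], &expected, NULL);
	}
}

int main(void)
{
	struct device old = { "swap0" }, other = { "swap1" };

	cached[0] = &old;
	cached[1] = &other;
	cached[2] = &old;

	flush_cache(&old);

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d -> %s\n", cpu,
		       cached[cpu] ? cached[cpu]->name : "(none)");
	return 0;
}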
+
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
@@ -2791,7 +2721,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
goto out_dput;
}
spin_lock(&p->lock);
- del_from_avail_list(p);
+ del_from_avail_list(p, true);
if (p->prio < 0) {
struct swap_info_struct *si = p;
int nid;
@@ -2809,11 +2739,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
plist_del(&p->list, &swap_active_head);
atomic_long_sub(p->pages, &nr_swap_pages);
total_swap_pages -= p->pages;
- p->flags &= ~SWP_WRITEOK;
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
- disable_swap_slots_cache_lock();
+ wait_for_allocation(p);
set_current_oom_origin();
err = try_to_unuse(p->type);
@@ -2822,12 +2751,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (err) {
/* re-insert swap space back into swap_list */
reinsert_swap_info(p);
- reenable_swap_slots_cache_unlock();
goto out_dput;
}
- reenable_swap_slots_cache_unlock();
-
/*
* Wait for swap operations protected by get/put_swap_device()
* to complete. Because of synchronize_rcu() here, all swap
@@ -2842,6 +2768,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
flush_work(&p->discard_work);
flush_work(&p->reclaim_work);
+ flush_percpu_swap_cluster(p);
destroy_swap_extents(p);
if (p->flags & SWP_CONTINUED)
@@ -2855,16 +2782,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_lock(&p->lock);
drain_mmlist();
- /* wait for anyone still in scan_swap_map_slots */
- p->highest_bit = 0; /* cuts scans short */
- while (p->flags >= SWP_SCANNING) {
- spin_unlock(&p->lock);
- spin_unlock(&swap_lock);
- schedule_timeout_uninterruptible(1);
- spin_lock(&swap_lock);
- spin_lock(&p->lock);
- }
-
swap_file = p->swap_file;
p->swap_file = NULL;
p->max = 0;
@@ -2879,10 +2796,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
arch_swap_invalidate_area(p->type);
zswap_swapoff(p->type);
mutex_unlock(&swapon_mutex);
- free_percpu(p->percpu_cluster);
- p->percpu_cluster = NULL;
- free_percpu(p->cluster_next_cpu);
- p->cluster_next_cpu = NULL;
+ kfree(p->global_cluster);
+ p->global_cluster = NULL;
vfree(swap_map);
kvfree(zeromap);
kvfree(cluster_info);
@@ -2992,7 +2907,7 @@ static int swap_show(struct seq_file *swap, void *v)
}
bytes = K(si->pages);
- inuse = K(READ_ONCE(si->inuse_pages));
+ inuse = K(swap_usage_in_pages(si));
file = si->swap_file;
len = seq_file_path(swap, file, " \t\n\\");
@@ -3109,6 +3024,7 @@ static struct swap_info_struct *alloc_swap_info(void)
}
spin_lock_init(&p->lock);
spin_lock_init(&p->cont_lock);
+ atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT);
init_completion(&p->comp);
return p;
@@ -3193,10 +3109,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
return 0;
}
- si->lowest_bit = 1;
- si->cluster_next = 1;
- si->cluster_nr = 0;
-
maxpages = swapfile_maximum_size;
last_page = swap_header->info.last_page;
if (!last_page) {
@@ -3213,7 +3125,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
if ((unsigned int)maxpages == 0)
maxpages = UINT_MAX;
}
- si->highest_bit = maxpages - 1;
if (!maxpages)
return 0;
@@ -3230,61 +3141,47 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
return maxpages;
}
-#define SWAP_CLUSTER_INFO_COLS \
- DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
-#define SWAP_CLUSTER_SPACE_COLS \
- DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
-#define SWAP_CLUSTER_COLS \
- max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
-
-static int setup_swap_map_and_extents(struct swap_info_struct *si,
- union swap_header *swap_header,
- unsigned char *swap_map,
- unsigned long maxpages,
- sector_t *span)
+static int setup_swap_map(struct swap_info_struct *si,
+ union swap_header *swap_header,
+ unsigned char *swap_map,
+ unsigned long maxpages)
{
- unsigned int nr_good_pages;
unsigned long i;
- int nr_extents;
-
- nr_good_pages = maxpages - 1; /* omit header page */
+ swap_map[0] = SWAP_MAP_BAD; /* omit header page */
for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
if (page_nr == 0 || page_nr > swap_header->info.last_page)
return -EINVAL;
if (page_nr < maxpages) {
swap_map[page_nr] = SWAP_MAP_BAD;
- nr_good_pages--;
+ si->pages--;
}
}
- if (nr_good_pages) {
- swap_map[0] = SWAP_MAP_BAD;
- si->max = maxpages;
- si->pages = nr_good_pages;
- nr_extents = setup_swap_extents(si, span);
- if (nr_extents < 0)
- return nr_extents;
- nr_good_pages = si->pages;
- }
- if (!nr_good_pages) {
+ if (!si->pages) {
pr_warn("Empty swap-file\n");
return -EINVAL;
}
- return nr_extents;
+ return 0;
}
+#define SWAP_CLUSTER_INFO_COLS \
+ DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
+#define SWAP_CLUSTER_SPACE_COLS \
+ DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
+#define SWAP_CLUSTER_COLS \
+ max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
+
static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
union swap_header *swap_header,
unsigned long maxpages)
{
unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
- unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
struct swap_cluster_info *cluster_info;
- unsigned long i, j, k, idx;
- int cpu, err = -ENOMEM;
+ unsigned long i, j, idx;
+ int err = -ENOMEM;
cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
if (!cluster_info)
@@ -3293,38 +3190,31 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < nr_clusters; i++)
spin_lock_init(&cluster_info[i].lock);
- si->cluster_next_cpu = alloc_percpu(unsigned int);
- if (!si->cluster_next_cpu)
- goto err_free;
-
- /* Random start position to help with wear leveling */
- for_each_possible_cpu(cpu)
- per_cpu(*si->cluster_next_cpu, cpu) =
- get_random_u32_inclusive(1, si->highest_bit);
-
- si->percpu_cluster = alloc_percpu(struct percpu_cluster);
- if (!si->percpu_cluster)
- goto err_free;
-
- for_each_possible_cpu(cpu) {
- struct percpu_cluster *cluster;
-
- cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+ if (!(si->flags & SWP_SOLIDSTATE)) {
+ si->global_cluster = kmalloc(sizeof(*si->global_cluster),
+ GFP_KERNEL);
+ if (!si->global_cluster)
+ goto err_free;
for (i = 0; i < SWAP_NR_ORDERS; i++)
- cluster->next[i] = SWAP_NEXT_INVALID;
+ si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
+ spin_lock_init(&si->global_cluster_lock);
}
/*
* Mark unusable pages as unavailable. The clusters aren't
* marked free yet, so no list operations are involved yet.
*
- * See setup_swap_map_and_extents(): header page, bad pages,
+ * See setup_swap_map(): header page, bad pages,
* and the EOF part of the last cluster.
*/
inc_cluster_info_page(si, cluster_info, 0);
- for (i = 0; i < swap_header->info.nr_badpages; i++)
- inc_cluster_info_page(si, cluster_info,
- swap_header->info.badpages[i]);
+ for (i = 0; i < swap_header->info.nr_badpages; i++) {
+ unsigned int page_nr = swap_header->info.badpages[i];
+
+ if (page_nr >= maxpages)
+ continue;
+ inc_cluster_info_page(si, cluster_info, page_nr);
+ }
for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
inc_cluster_info_page(si, cluster_info, i);
@@ -3335,15 +3225,14 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < SWAP_NR_ORDERS; i++) {
INIT_LIST_HEAD(&si->nonfull_clusters[i]);
INIT_LIST_HEAD(&si->frag_clusters[i]);
- si->frag_cluster_nr[i] = 0;
+ atomic_long_set(&si->frag_cluster_nr[i], 0);
}
/*
* Reduce false cache line sharing between cluster_info and
* sharing same address space.
*/
- for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
- j = (k + col) % SWAP_CLUSTER_COLS;
+ for (j = 0; j < SWAP_CLUSTER_COLS; j++) {
for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
struct swap_cluster_info *ci;
idx = i * SWAP_CLUSTER_COLS + j;
@@ -3437,6 +3326,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
/*
+ * The swap subsystem needs a major overhaul to support this.
+ * It doesn't work yet so just disable it for now.
+ */
+ if (mapping_min_folio_order(mapping) > 0) {
+ error = -EINVAL;
+ goto bad_swap_unlock_inode;
+ }
+
+ /*
* Read the swap header.
*/
if (!mapping->a_ops->read_folio) {
@@ -3456,6 +3354,21 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap_unlock_inode;
}
+ si->max = maxpages;
+ si->pages = maxpages - 1;
+ nr_extents = setup_swap_extents(si, &span);
+ if (nr_extents < 0) {
+ error = nr_extents;
+ goto bad_swap_unlock_inode;
+ }
+ if (si->pages != si->max - 1) {
+ pr_err("swap:%u != (max:%u - 1)\n", si->pages, si->max);
+ error = -EINVAL;
+ goto bad_swap_unlock_inode;
+ }
+
+ maxpages = si->max;
+
/* OK, set up the swap map and apply the bad block list */
swap_map = vzalloc(maxpages);
if (!swap_map) {
@@ -3467,12 +3380,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (error)
goto bad_swap_unlock_inode;
- nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map,
- maxpages, &span);
- if (unlikely(nr_extents < 0)) {
- error = nr_extents;
+ error = setup_swap_map(si, swap_header, swap_map, maxpages);
+ if (error)
goto bad_swap_unlock_inode;
- }
/*
* Use kvmalloc_array instead of bitmap_zalloc as the allocation order might
@@ -3493,18 +3403,18 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (si->bdev && bdev_nonrot(si->bdev)) {
si->flags |= SWP_SOLIDSTATE;
-
- cluster_info = setup_clusters(si, swap_header, maxpages);
- if (IS_ERR(cluster_info)) {
- error = PTR_ERR(cluster_info);
- cluster_info = NULL;
- goto bad_swap_unlock_inode;
- }
} else {
atomic_inc(&nr_rotate_swap);
inced_nr_rotate_swap = true;
}
+ cluster_info = setup_clusters(si, swap_header, maxpages);
+ if (IS_ERR(cluster_info)) {
+ error = PTR_ERR(cluster_info);
+ cluster_info = NULL;
+ goto bad_swap_unlock_inode;
+ }
+
if ((swap_flags & SWAP_FLAG_DISCARD) &&
si->bdev && bdev_max_discard_sectors(si->bdev)) {
/*
@@ -3558,8 +3468,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
mutex_lock(&swapon_mutex);
prio = -1;
if (swap_flags & SWAP_FLAG_PREFER)
- prio =
- (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
+ prio = swap_flags & SWAP_FLAG_PRIO_MASK;
enable_swap_info(si, prio, swap_map, cluster_info, zeromap);
pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n",
@@ -3583,10 +3492,8 @@ free_swap_address_space:
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
- free_percpu(si->percpu_cluster);
- si->percpu_cluster = NULL;
- free_percpu(si->cluster_next_cpu);
- si->cluster_next_cpu = NULL;
+ kfree(si->global_cluster);
+ si->global_cluster = NULL;
inode = NULL;
destroy_swap_extents(si);
swap_cgroup_swapoff(si->type);
@@ -3608,8 +3515,6 @@ out:
putname(name);
if (inode)
inode_unlock(inode);
- if (!error)
- enable_swap_slots_cache();
return error;
}
@@ -3623,7 +3528,7 @@ void si_swapinfo(struct sysinfo *val)
struct swap_info_struct *si = swap_info[type];
if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
- nr_to_be_unused += READ_ONCE(si->inuse_pages);
+ nr_to_be_unused += swap_usage_in_pages(si);
}
val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
val->totalswap = total_swap_pages + nr_to_be_unused;
@@ -3636,7 +3541,6 @@ void si_swapinfo(struct sysinfo *val)
* Returns error code in following case.
* - success -> 0
* - swp_entry is invalid -> EINVAL
- * - swp_entry is migration entry -> EINVAL
* - swap-cache reference is requested but there is already one. -> EEXIST
* - swap-cache reference is requested but the entry is not used. -> ENOENT
* - swap-mapped reference requested but needs continued swap count. -> ENOMEM
@@ -3651,11 +3555,15 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
int err, i;
si = swp_swap_info(entry);
+ if (WARN_ON_ONCE(!si)) {
+ pr_err("%s%08lx\n", Bad_file, entry.val);
+ return -EINVAL;
+ }
offset = swp_offset(entry);
VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
VM_WARN_ON(usage == 1 && nr > 1);
- ci = lock_cluster_or_swap_info(si, offset);
+ ci = lock_cluster(si, offset);
err = 0;
for (i = 0; i < nr; i++) {
@@ -3710,7 +3618,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
}
unlock_out:
- unlock_cluster_or_swap_info(si, ci);
+ unlock_cluster(ci);
return err;
}
@@ -3752,11 +3660,13 @@ int swapcache_prepare(swp_entry_t entry, int nr)
return __swap_duplicate(entry, SWAP_HAS_CACHE, nr);
}
+/*
+ * The caller should ensure the entries belong to the same folio,
+ * so they won't cross a cluster boundary.
+ */
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
{
- unsigned long offset = swp_offset(entry);
-
- cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE);
+ swap_entries_put_cache(si, entry, nr);
}
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
@@ -3765,21 +3675,6 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry)
}
/*
- * out-of-line methods to avoid include hell.
- */
-struct address_space *swapcache_mapping(struct folio *folio)
-{
- return swp_swap_info(folio->swap)->swap_file->f_mapping;
-}
-EXPORT_SYMBOL_GPL(swapcache_mapping);
-
-pgoff_t __folio_swap_cache_index(struct folio *folio)
-{
- return swap_cache_index(folio->swap);
-}
-EXPORT_SYMBOL_GPL(__folio_swap_cache_index);
-
-/*
* add_swap_count_continuation - called when a swap count is duplicated
* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
* page of the original vmalloc'ed swap_map, to hold the continuation count
@@ -3819,7 +3714,6 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
*/
goto outer;
}
- spin_lock(&si->lock);
offset = swp_offset(entry);
@@ -3884,7 +3778,6 @@ out_unlock_cont:
spin_unlock(&si->cont_lock);
out:
unlock_cluster(ci);
- spin_unlock(&si->lock);
put_swap_device(si);
outer:
if (page)
@@ -3898,8 +3791,8 @@ outer:
* into, carry if so, or else fail until a new continuation page is allocated;
* when the original swap_map count is decremented from 0 with continuation,
* borrow from the continuation and report whether it still holds more.
- * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
- * lock.
+ * Called while __swap_duplicate() or caller of swap_entry_put_locked()
+ * holds cluster lock.
*/
static bool swap_count_continued(struct swap_info_struct *si,
pgoff_t offset, unsigned char count)
@@ -4004,6 +3897,11 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+static bool __has_usable_swap(void)
+{
+ return !plist_head_empty(&swap_active_head);
+}
+
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
struct swap_info_struct *si, *next;
diff --git a/mm/truncate.c b/mm/truncate.c
index 7c304d2f0052..3c5a50ae3274 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -78,8 +78,22 @@ static void truncate_folio_batch_exceptionals(struct address_space *mapping,
if (dax_mapping(mapping)) {
for (i = j; i < nr; i++) {
- if (xa_is_value(fbatch->folios[i]))
+ if (xa_is_value(fbatch->folios[i])) {
+ /*
+ * File systems should already have called
+ * dax_break_layout_entry() to remove all DAX
+ * entries while holding a lock to prevent
+ * establishing new entries. Therefore we
+ * shouldn't find any here.
+ */
+ WARN_ON_ONCE(1);
+
+ /*
+ * Delete the mapping so truncate_pagecache()
+ * doesn't loop forever.
+ */
dax_delete_mapping_entry(mapping, indices[i]);
+ }
}
goto out;
}
@@ -163,6 +177,32 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
return 0;
}
+static int try_folio_split_or_unmap(struct folio *folio, struct page *split_at,
+ unsigned long min_order)
+{
+ enum ttu_flags ttu_flags =
+ TTU_SYNC |
+ TTU_SPLIT_HUGE_PMD |
+ TTU_IGNORE_MLOCK;
+ int ret;
+
+ ret = try_folio_split_to_order(folio, split_at, min_order);
+
+ /*
+ * If the split fails, unmap the folio, so it will be refaulted
+ * with PTEs to respect SIGBUS semantics.
+ *
+ * Make an exception for shmem/tmpfs, which has long been
+ * intentionally mapped with PMDs across i_size.
+ */
+ if (ret && !shmem_mapping(folio->mapping)) {
+ try_to_unmap(folio, ttu_flags);
+ WARN_ON(folio_mapped(folio));
+ }
+
+ return ret;
+}
+
/*
* Handle partial folios. The folio may be entirely within the
* range if a split has raced with us. If not, we zero the part of the
@@ -177,20 +217,22 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
{
loff_t pos = folio_pos(folio);
+ size_t size = folio_size(folio);
unsigned int offset, length;
+ struct page *split_at, *split_at2;
+ unsigned int min_order;
if (pos < start)
offset = start - pos;
else
offset = 0;
- length = folio_size(folio);
- if (pos + length <= (u64)end)
- length = length - offset;
+ if (pos + size <= (u64)end)
+ length = size - offset;
else
length = end + 1 - pos - offset;
folio_wait_writeback(folio);
- if (length == folio_size(folio)) {
+ if (length == size) {
truncate_inode_folio(folio->mapping, folio);
return true;
}
@@ -207,8 +249,44 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
folio_invalidate(folio, offset, length);
if (!folio_test_large(folio))
return true;
- if (split_folio(folio) == 0)
+
+ min_order = mapping_min_folio_order(folio->mapping);
+ split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE);
+ if (!try_folio_split_or_unmap(folio, split_at, min_order)) {
+ /*
+ * Try to split at offset + length to make sure folios within
+ * the range can be dropped, especially to avoid memory waste
+ * on shmem truncation.
+ */
+ struct folio *folio2;
+
+ if (offset + length == size)
+ goto no_split;
+
+ split_at2 = folio_page(folio,
+ PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
+ folio2 = page_folio(split_at2);
+
+ if (!folio_try_get(folio2))
+ goto no_split;
+
+ if (!folio_test_large(folio2))
+ goto out;
+
+ if (!folio_trylock(folio2))
+ goto out;
+
+ /* make sure folio2 is still large and still has the same mapping */
+ if (folio_test_large(folio2) &&
+ folio2->mapping == folio->mapping)
+ try_folio_split_or_unmap(folio2, split_at2, min_order);
+
+ folio_unlock(folio2);
+out:
+ folio_put(folio2);
+no_split:
return true;
+ }
if (folio_test_dirty(folio))
return false;
truncate_inode_folio(folio->mapping, folio);
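
The offset/length computation above determines which part of a partially truncated folio gets zeroed: offset is where the hole starts within the folio, length is how far it runs before either the folio or the truncation range ends. A worked example of that arithmetic (byte values chosen arbitrarily):

#include <stdio.h>

/*
 * Mirror of the offset/length computation: the folio occupies
 * [pos, pos + size) and the truncated range is [start, end].
 */
static void partial_range(unsigned long long pos, size_t size,
			  unsigned long long start, unsigned long long end)
{
	size_t offset, length;

	offset = pos < start ? (size_t)(start - pos) : 0;
	if (pos + size <= end)
		length = size - offset;                      /* range covers the folio tail */
	else
		length = (size_t)(end + 1 - pos) - offset;   /* range ends inside the folio */

	printf("folio at %llu (%zu bytes): zero %zu bytes at offset %zu\n",
	       pos, size, length, offset);
}

int main(void)
{
	/*
	 * Truncating a file to 20000 bytes: the 64KiB folio at offset 0
	 * keeps its first 20000 bytes and has the remaining 45536 zeroed.
	 */
	partial_range(0, 65536, 20000, ~0ULL);
	return 0;
}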
@@ -372,7 +450,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
- /* We rely upon deletion not changing page->index */
+ /* We rely upon deletion not changing folio->index */
if (xa_is_value(folio))
continue;
@@ -525,6 +603,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
}
EXPORT_SYMBOL(invalidate_mapping_pages);
+static int folio_launder(struct address_space *mapping, struct folio *folio)
+{
+ if (!folio_test_dirty(folio))
+ return 0;
+ if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL)
+ return 0;
+ return mapping->a_ops->launder_folio(folio);
+}
+
/*
* This is like mapping_evict_folio(), except it ignores the folio's
* refcount. We do this because invalidate_inode_pages2() needs stronger
@@ -532,14 +619,24 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
* shrink_folio_list() has a temp ref on them, or because they're transiently
* sitting in the folio_add_lru() caches.
*/
-static int invalidate_complete_folio2(struct address_space *mapping,
- struct folio *folio)
+int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
+ gfp_t gfp)
{
- if (folio->mapping != mapping)
- return 0;
+ int ret;
- if (!filemap_release_folio(folio, GFP_KERNEL))
- return 0;
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (folio_mapped(folio))
+ unmap_mapping_folio(folio);
+ BUG_ON(folio_mapped(folio));
+
+ ret = folio_launder(mapping, folio);
+ if (ret)
+ return ret;
+ if (folio->mapping != mapping)
+ return -EBUSY;
+ if (!filemap_release_folio(folio, gfp))
+ return -EBUSY;
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
@@ -558,16 +655,7 @@ static int invalidate_complete_folio2(struct address_space *mapping,
failed:
xa_unlock_irq(&mapping->i_pages);
spin_unlock(&mapping->host->i_lock);
- return 0;
-}
-
-static int folio_launder(struct address_space *mapping, struct folio *folio)
-{
- if (!folio_test_dirty(folio))
- return 0;
- if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL)
- return 0;
- return mapping->a_ops->launder_folio(folio);
+ return -EBUSY;
}
/**
@@ -631,16 +719,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
}
VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
folio_wait_writeback(folio);
-
- if (folio_mapped(folio))
- unmap_mapping_folio(folio);
- BUG_ON(folio_mapped(folio));
-
- ret2 = folio_launder(mapping, folio);
- if (ret2 == 0) {
- if (!invalidate_complete_folio2(mapping, folio))
- ret2 = -EBUSY;
- }
+ ret2 = folio_unmap_invalidate(mapping, folio, GFP_KERNEL);
if (ret2 < 0)
ret = ret2;
folio_unlock(folio);
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 83c164aba6e0..dbdcc43964fb 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -17,7 +17,7 @@
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
-#include <linux/thread_info.h>
+#include <linux/ucopysize.h>
#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
@@ -201,7 +201,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
}
}
-static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
+DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,
+ validate_usercopy_range);
+EXPORT_SYMBOL(validate_usercopy_range);
/*
* Validates that the given object is:
@@ -212,9 +214,6 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
*/
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
{
- if (static_branch_unlikely(&bypass_usercopy_checks))
- return;
-
/* Skip all tests if size is zero. */
if (!n)
return;
@@ -255,7 +254,8 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
}
EXPORT_SYMBOL(__check_object_size);
-static bool enable_checks __initdata = true;
+static bool enable_checks __initdata =
+ IS_ENABLED(CONFIG_HARDENED_USERCOPY_DEFAULT_ON);
static int __init parse_hardened_usercopy(char *str)
{
@@ -269,8 +269,10 @@ __setup("hardened_usercopy=", parse_hardened_usercopy);
static int __init set_hardened_usercopy(void)
{
- if (enable_checks == false)
- static_branch_enable(&bypass_usercopy_checks);
+ if (enable_checks)
+ static_branch_enable(&validate_usercopy_range);
+ else
+ static_branch_disable(&validate_usercopy_range);
return 1;
}
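With the static key inverted, callers now test whether to validate rather than whether to bypass. A rough sketch of how a header-side wrapper consumes such a default-on key; the actual inline in <linux/ucopysize.h> may differ in detail:

#include <linux/jump_label.h>

DECLARE_STATIC_KEY_MAYBE(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, validate_usercopy_range);

static __always_inline void check_object_size_sketch(const void *ptr,
                                                     unsigned long n, bool to_user)
{
        /* Compile-time default follows the Kconfig option; the boot param flips it. */
        if (static_branch_maybe(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,
                                &validate_usercopy_range))
                __check_object_size(ptr, n, to_user);
}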
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 64e998f7d57c..aefdf3a812a1 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -18,6 +18,7 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
+#include "swap.h"
static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
@@ -85,14 +86,10 @@ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
mmap_read_lock(mm);
vma = find_vma_and_prepare_anon(mm, address);
if (!IS_ERR(vma)) {
- /*
- * We cannot use vma_start_read() as it may fail due to
- * false locked (see comment in vma_start_read()). We
- * can avoid that by directly locking vm_lock under
- * mmap_lock, which guarantees that nobody can lock the
- * vma for write (vma_start_write()) under us.
- */
- down_read(&vma->vm_lock->lock);
+ bool locked = vma_start_read_locked(vma);
+
+ if (!locked)
+ vma = ERR_PTR(-EAGAIN);
}
mmap_read_unlock(mm);
@@ -564,7 +561,7 @@ retry:
}
while (src_addr < src_start + len) {
- BUG_ON(dst_addr >= dst_start + len);
+ VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
/*
* Serialize via vma_lock and hugetlb_fault_mutex.
@@ -605,7 +602,7 @@ retry:
if (unlikely(err == -ENOENT)) {
up_read(&ctx->map_changing_lock);
uffd_mfill_unlock(dst_vma);
- BUG_ON(!folio);
+ VM_WARN_ON_ONCE(!folio);
err = copy_folio_from_user(folio,
(const void __user *)src_addr, true);
@@ -617,7 +614,7 @@ retry:
dst_vma = NULL;
goto retry;
} else
- BUG_ON(folio);
+ VM_WARN_ON_ONCE(folio);
if (!err) {
dst_addr += vma_hpagesize;
@@ -638,9 +635,9 @@ out_unlock_vma:
out:
if (folio)
folio_put(folio);
- BUG_ON(copied < 0);
- BUG_ON(err > 0);
- BUG_ON(!copied && !err);
+ VM_WARN_ON_ONCE(copied < 0);
+ VM_WARN_ON_ONCE(err > 0);
+ VM_WARN_ON_ONCE(!copied && !err);
return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
@@ -714,12 +711,12 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
/*
* Sanitize the command parameters:
*/
- BUG_ON(dst_start & ~PAGE_MASK);
- BUG_ON(len & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(len & ~PAGE_MASK);
/* Does the address range wrap, or is the span zero-sized? */
- BUG_ON(src_start + len <= src_start);
- BUG_ON(dst_start + len <= dst_start);
+ VM_WARN_ON_ONCE(src_start + len <= src_start);
+ VM_WARN_ON_ONCE(dst_start + len <= dst_start);
src_addr = src_start;
dst_addr = dst_start;
@@ -778,7 +775,7 @@ retry:
while (src_addr < src_start + len) {
pmd_t dst_pmdval;
- BUG_ON(dst_addr >= dst_start + len);
+ VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
if (unlikely(!dst_pmd)) {
@@ -798,8 +795,8 @@ retry:
* (This includes the case where the PMD used to be THP and
* changed back to none after __pte_alloc().)
*/
- if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) ||
- pmd_devmap(dst_pmdval))) {
+ if (unlikely(!pmd_present(dst_pmdval) ||
+ pmd_trans_huge(dst_pmdval))) {
err = -EEXIST;
break;
}
@@ -821,7 +818,7 @@ retry:
up_read(&ctx->map_changing_lock);
uffd_mfill_unlock(dst_vma);
- BUG_ON(!folio);
+ VM_WARN_ON_ONCE(!folio);
kaddr = kmap_local_folio(folio, 0);
err = copy_from_user(kaddr,
@@ -835,7 +832,7 @@ retry:
flush_dcache_folio(folio);
goto retry;
} else
- BUG_ON(folio);
+ VM_WARN_ON_ONCE(folio);
if (!err) {
dst_addr += PAGE_SIZE;
@@ -855,9 +852,9 @@ out_unlock:
out:
if (folio)
folio_put(folio);
- BUG_ON(copied < 0);
- BUG_ON(err > 0);
- BUG_ON(!copied && !err);
+ VM_WARN_ON_ONCE(copied < 0);
+ VM_WARN_ON_ONCE(err > 0);
+ VM_WARN_ON_ONCE(!copied && !err);
return copied ? copied : err;
}
@@ -943,11 +940,11 @@ int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
/*
* Sanitize the command parameters:
*/
- BUG_ON(start & ~PAGE_MASK);
- BUG_ON(len & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(len & ~PAGE_MASK);
/* Does the address range wrap, or is the span zero-sized? */
- BUG_ON(start + len <= start);
+ VM_WARN_ON_ONCE(start + len <= start);
mmap_read_lock(dst_mm);
@@ -1020,6 +1017,14 @@ void double_pt_unlock(spinlock_t *ptl1,
__release(ptl2);
}
+static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval)
+{
+ return pte_same(ptep_get(src_pte), orig_src_pte) &&
+ pte_same(ptep_get(dst_pte), orig_dst_pte) &&
+ pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
+}
static int move_present_pte(struct mm_struct *mm,
struct vm_area_struct *dst_vma,
@@ -1027,6 +1032,7 @@ static int move_present_pte(struct mm_struct *mm,
unsigned long dst_addr, unsigned long src_addr,
pte_t *dst_pte, pte_t *src_pte,
pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
spinlock_t *dst_ptl, spinlock_t *src_ptl,
struct folio *src_folio)
{
@@ -1034,8 +1040,8 @@ static int move_present_pte(struct mm_struct *mm,
double_pt_lock(dst_ptl, src_ptl);
- if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
- !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
+ if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
+ dst_pmd, dst_pmdval)) {
err = -EAGAIN;
goto out;
}
@@ -1057,9 +1063,14 @@ static int move_present_pte(struct mm_struct *mm,
folio_move_anon_rmap(src_folio, dst_vma);
src_folio->index = linear_page_index(dst_vma, dst_addr);
- orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
- /* Follow mremap() behavior and treat the entry dirty after the move */
- orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma);
+ orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
+ /* Set soft dirty bit so userspace can notice the pte was moved */
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
+#endif
+ if (pte_dirty(orig_src_pte))
+ orig_dst_pte = pte_mkdirty(orig_dst_pte);
+ orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
out:
@@ -1067,24 +1078,65 @@ out:
return err;
}
-static int move_swap_pte(struct mm_struct *mm,
+static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
unsigned long dst_addr, unsigned long src_addr,
pte_t *dst_pte, pte_t *src_pte,
pte_t orig_dst_pte, pte_t orig_src_pte,
- spinlock_t *dst_ptl, spinlock_t *src_ptl)
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
+ spinlock_t *dst_ptl, spinlock_t *src_ptl,
+ struct folio *src_folio,
+ struct swap_info_struct *si, swp_entry_t entry)
{
- if (!pte_swp_exclusive(orig_src_pte))
- return -EBUSY;
+ /*
+ * Check if the folio still belongs to the target swap entry after
+ * acquiring the lock. Folio can be freed in the swap cache while
+ * not locked.
+ */
+ if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
+ entry.val != src_folio->swap.val))
+ return -EAGAIN;
double_pt_lock(dst_ptl, src_ptl);
- if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
- !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
+ if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
+ dst_pmd, dst_pmdval)) {
double_pt_unlock(dst_ptl, src_ptl);
return -EAGAIN;
}
+ /*
+ * The src_folio resides in the swapcache, requiring an update to its
+ * index and mapping to align with the dst_vma, where a swap-in may
+ * occur and hit the swapcache after moving the PTE.
+ */
+ if (src_folio) {
+ folio_move_anon_rmap(src_folio, dst_vma);
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
+ } else {
+ /*
+ * Check if the swap entry is cached after acquiring the src_pte
+ * lock. Otherwise, we might miss a newly loaded swap cache folio.
+ *
+	 * Check swap_map directly to minimize overhead; READ_ONCE() is sufficient.
+ * We are trying to catch newly added swap cache, the only possible case is
+ * when a folio is swapped in and out again staying in swap cache, using the
+ * same entry before the PTE check above. The PTL is acquired and released
+ * twice, each time after updating the swap_map's flag. So holding
+ * the PTL here ensures we see the updated value. False positive is possible,
+ * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the
+ * cache, or during the tiny synchronization window between swap cache and
+ * swap_map, but it will be gone very quickly, worst result is retry jitters.
+ */
+ if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
+ double_pt_unlock(dst_ptl, src_ptl);
+ return -EAGAIN;
+ }
+ }
+
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
+#endif
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
@@ -1097,13 +1149,14 @@ static int move_zeropage_pte(struct mm_struct *mm,
unsigned long dst_addr, unsigned long src_addr,
pte_t *dst_pte, pte_t *src_pte,
pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
spinlock_t *dst_ptl, spinlock_t *src_ptl)
{
pte_t zero_pte;
double_pt_lock(dst_ptl, src_ptl);
- if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
- !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
+ if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
+ dst_pmd, dst_pmdval)) {
double_pt_unlock(dst_ptl, src_ptl);
return -EAGAIN;
}
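All three movers now follow the same recheck-under-lock discipline: sample the PTEs and the destination pmd without the lock, take both PTLs, and bail out with -EAGAIN if anything changed in between. A condensed sketch of that shared shape, not a literal copy of any one mover:

static int move_entry_once_stable(pte_t *dst_pte, pte_t *src_pte,
                                  pte_t orig_dst_pte, pte_t orig_src_pte,
                                  pmd_t *dst_pmd, pmd_t dst_pmdval,
                                  spinlock_t *dst_ptl, spinlock_t *src_ptl)
{
        double_pt_lock(dst_ptl, src_ptl);
        if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
                                 dst_pmd, dst_pmdval)) {
                double_pt_unlock(dst_ptl, src_ptl);
                return -EAGAIN; /* caller retries the whole PTE lookup */
        }
        /* PTEs and the destination PMD are unchanged; install the new entry here. */
        double_pt_unlock(dst_ptl, src_ptl);
        return 0;
}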
@@ -1130,12 +1183,14 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
__u64 mode)
{
swp_entry_t entry;
+ struct swap_info_struct *si = NULL;
pte_t orig_src_pte, orig_dst_pte;
pte_t src_folio_pte;
spinlock_t *src_ptl, *dst_ptl;
pte_t *src_pte = NULL;
pte_t *dst_pte = NULL;
pmd_t dummy_pmdval;
+ pmd_t dst_pmdval;
struct folio *src_folio = NULL;
struct anon_vma *src_anon_vma = NULL;
struct mmu_notifier_range range;
@@ -1148,11 +1203,11 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
retry:
/*
* Use the maywrite version to indicate that dst_pte will be modified,
- * but since we will use pte_same() to detect the change of the pte
- * entry, there is no need to get pmdval, so just pass a dummy variable
- * to it.
+ * since dst_pte needs to be none, the subsequent pte_same() check
+ * cannot prevent the dst_pte page from being freed concurrently, so we
+	 * also need to obtain dst_pmdval and recheck pmd_same() later.
*/
- dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dummy_pmdval,
+ dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval,
&dst_ptl);
/* Retry if a huge pmd materialized from under us */
@@ -1161,7 +1216,11 @@ retry:
goto out;
}
- /* same as dst_pte */
+ /*
+ * Unlike dst_pte, the subsequent pte_same() check can ensure the
+ * stability of the src_pte page, so there is no need to get pmdval,
+ * just pass a dummy variable to it.
+ */
src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval,
&src_ptl);
@@ -1177,8 +1236,8 @@ retry:
}
/* Sanity checks before the operation */
- if (WARN_ON_ONCE(pmd_none(*dst_pmd)) || WARN_ON_ONCE(pmd_none(*src_pmd)) ||
- WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) {
+ if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
+ pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
err = -EINVAL;
goto out;
}
@@ -1213,7 +1272,7 @@ retry:
err = move_zeropage_pte(mm, dst_vma, src_vma,
dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte,
- dst_ptl, src_ptl);
+ dst_pmd, dst_pmdval, dst_ptl, src_ptl);
goto out;
}
@@ -1264,8 +1323,8 @@ retry:
spin_unlock(src_ptl);
if (!locked) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
/* now we can block and wait */
folio_lock(src_folio);
@@ -1281,8 +1340,8 @@ retry:
/* at this point we have src_folio locked */
if (folio_test_large(src_folio)) {
/* split_folio() can block */
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
err = split_folio(src_folio);
if (err)
@@ -1307,8 +1366,8 @@ retry:
goto out;
}
if (!anon_vma_trylock_write(src_anon_vma)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
/* now we can block and wait */
anon_vma_lock_write(src_anon_vma);
@@ -1318,14 +1377,16 @@ retry:
err = move_present_pte(mm, dst_vma, src_vma,
dst_addr, src_addr, dst_pte, src_pte,
- orig_dst_pte, orig_src_pte,
- dst_ptl, src_ptl, src_folio);
+ orig_dst_pte, orig_src_pte, dst_pmd,
+ dst_pmdval, dst_ptl, src_ptl, src_folio);
} else {
+ struct folio *folio = NULL;
+
entry = pte_to_swp_entry(orig_src_pte);
if (non_swap_entry(entry)) {
if (is_migration_entry(entry)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
migration_entry_wait(mm, src_pmd, src_addr);
err = -EAGAIN;
@@ -1334,10 +1395,53 @@ retry:
goto out;
}
- err = move_swap_pte(mm, dst_addr, src_addr,
- dst_pte, src_pte,
- orig_dst_pte, orig_src_pte,
- dst_ptl, src_ptl);
+ if (!pte_swp_exclusive(orig_src_pte)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ si = get_swap_device(entry);
+ if (unlikely(!si)) {
+ err = -EAGAIN;
+ goto out;
+ }
+ /*
+ * Verify the existence of the swapcache. If present, the folio's
+ * index and mapping must be updated even when the PTE is a swap
+ * entry. The anon_vma lock is not taken during this process since
+ * the folio has already been unmapped, and the swap entry is
+ * exclusive, preventing rmap walks.
+ *
+ * For large folios, return -EBUSY immediately, as split_folio()
+ * also returns -EBUSY when attempting to split unmapped large
+ * folios in the swapcache. This issue needs to be resolved
+ * separately to allow proper handling.
+ */
+ if (!src_folio)
+ folio = filemap_get_folio(swap_address_space(entry),
+ swap_cache_index(entry));
+ if (!IS_ERR_OR_NULL(folio)) {
+ if (folio_test_large(folio)) {
+ err = -EBUSY;
+ folio_put(folio);
+ goto out;
+ }
+ src_folio = folio;
+ src_folio_pte = orig_src_pte;
+ if (!folio_trylock(src_folio)) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
+ src_pte = dst_pte = NULL;
+ put_swap_device(si);
+ si = NULL;
+ /* now we can block and wait */
+ folio_lock(src_folio);
+ goto retry;
+ }
+ }
+ err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
+ dst_ptl, src_ptl, src_folio, si, entry);
}
out:
@@ -1349,11 +1453,18 @@ out:
folio_unlock(src_folio);
folio_put(src_folio);
}
- if (dst_pte)
- pte_unmap(dst_pte);
+ /*
+ * Unmap in reverse order (LIFO) to maintain proper kmap_local
+ * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte
+ * first, then src_pte, so we must unmap src_pte first, then dst_pte.
+ */
if (src_pte)
pte_unmap(src_pte);
+ if (dst_pte)
+ pte_unmap(dst_pte);
mmu_notifier_invalidate_range_end(&range);
+ if (si)
+ put_swap_device(si);
return err;
}
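The reverse-order unmap above follows from the stack (LIFO) discipline of kmap_local mappings, which pte_offset_map*() uses when CONFIG_HIGHPTE is enabled. A stand-alone illustration of the rule itself, unrelated to the patch:

static void copy_page_contents_sketch(struct page *dst, struct page *src)
{
        void *d = kmap_local_page(dst); /* mapped first */
        void *s = kmap_local_page(src); /* mapped second */

        memcpy(d, s, PAGE_SIZE);

        kunmap_local(s);                /* must be unmapped first (LIFO) */
        kunmap_local(d);                /* then the earlier mapping */
}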
@@ -1490,16 +1601,24 @@ static int uffd_move_lock(struct mm_struct *mm,
mmap_read_lock(mm);
err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
- if (!err) {
- /*
- * See comment in uffd_lock_vma() as to why not using
- * vma_start_read() here.
- */
- down_read(&(*dst_vmap)->vm_lock->lock);
- if (*dst_vmap != *src_vmap)
- down_read_nested(&(*src_vmap)->vm_lock->lock,
- SINGLE_DEPTH_NESTING);
+ if (err)
+ goto out;
+
+ if (!vma_start_read_locked(*dst_vmap)) {
+ err = -EAGAIN;
+ goto out;
}
+
+ /* Nothing further to do if both vmas are locked. */
+ if (*dst_vmap == *src_vmap)
+ goto out;
+
+ if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) {
+ /* Undo dst_vmap locking if src_vmap failed to lock */
+ vma_end_read(*dst_vmap);
+ err = -EAGAIN;
+ }
+out:
mmap_read_unlock(mm);
return err;
}
@@ -1624,15 +1743,13 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
ssize_t moved = 0;
/* Sanitize the command parameters. */
- if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
- WARN_ON_ONCE(dst_start & ~PAGE_MASK) ||
- WARN_ON_ONCE(len & ~PAGE_MASK))
- goto out;
+ VM_WARN_ON_ONCE(src_start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(len & ~PAGE_MASK);
/* Does the address range wrap, or is the span zero-sized? */
- if (WARN_ON_ONCE(src_start + len <= src_start) ||
- WARN_ON_ONCE(dst_start + len <= dst_start))
- goto out;
+ VM_WARN_ON_ONCE(src_start + len < src_start);
+ VM_WARN_ON_ONCE(dst_start + len < dst_start);
err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
if (err)
@@ -1706,22 +1823,19 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
ptl = pmd_trans_huge_lock(src_pmd, src_vma);
if (ptl) {
- if (pmd_devmap(*src_pmd)) {
- spin_unlock(ptl);
- err = -ENOENT;
- break;
- }
-
/* Check if we can move the pmd without splitting it. */
if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
!pmd_none(dst_pmdval)) {
- struct folio *folio = pmd_folio(*src_pmd);
-
- if (!folio || (!is_huge_zero_folio(folio) &&
- !PageAnonExclusive(&folio->page))) {
- spin_unlock(ptl);
- err = -EBUSY;
- break;
+ /* Can be a migration entry */
+ if (pmd_present(*src_pmd)) {
+ struct folio *folio = pmd_folio(*src_pmd);
+
+ if (!is_huge_zero_folio(folio) &&
+ !PageAnonExclusive(&folio->page)) {
+ spin_unlock(ptl);
+ err = -EBUSY;
+ break;
+ }
}
spin_unlock(ptl);
@@ -1782,18 +1896,18 @@ out_unlock:
up_read(&ctx->map_changing_lock);
uffd_move_unlock(dst_vma, src_vma);
out:
- VM_WARN_ON(moved < 0);
- VM_WARN_ON(err > 0);
- VM_WARN_ON(!moved && !err);
+ VM_WARN_ON_ONCE(moved < 0);
+ VM_WARN_ON_ONCE(err > 0);
+ VM_WARN_ON_ONCE(!moved && !err);
return moved ? moved : err;
}
static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
- vm_flags_t flags)
+ vm_flags_t vm_flags)
{
- const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
+ const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP;
- vm_flags_reset(vma, flags);
+ vm_flags_reset(vma, vm_flags);
/*
* For shared mappings, we want to enable writenotify while
* userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
@@ -1805,12 +1919,12 @@ static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
static void userfaultfd_set_ctx(struct vm_area_struct *vma,
struct userfaultfd_ctx *ctx,
- unsigned long flags)
+ vm_flags_t vm_flags)
{
vma_start_write(vma);
vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
userfaultfd_set_vm_flags(vma,
- (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags);
+ (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags);
}
void userfaultfd_reset_ctx(struct vm_area_struct *vma)
@@ -1825,6 +1939,14 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
unsigned long end)
{
struct vm_area_struct *ret;
+ bool give_up_on_oom = false;
+
+ /*
+ * If we are modifying only and not splitting, just give up on the merge
+ * if OOM prevents us from merging successfully.
+ */
+ if (start == vma->vm_start && end == vma->vm_end)
+ give_up_on_oom = true;
/* Reset ptes for the whole vma range if wr-protected */
if (userfaultfd_wp(vma))
@@ -1832,7 +1954,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
vma->vm_flags & ~__VM_UFFD_FLAGS,
- NULL_VM_UFFD_CTX);
+ NULL_VM_UFFD_CTX, give_up_on_oom);
/*
* In the vma_merge() successful mprotect-like case 8:
@@ -1848,14 +1970,14 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
/* Assumes mmap write lock taken, and mm_struct pinned. */
int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
struct vm_area_struct *vma,
- unsigned long vm_flags,
+ vm_flags_t vm_flags,
unsigned long start, unsigned long end,
bool wp_async)
{
VMA_ITERATOR(vmi, ctx->mm, start);
struct vm_area_struct *prev = vma_prev(&vmi);
unsigned long vma_end;
- unsigned long new_flags;
+ vm_flags_t new_flags;
if (vma->vm_start < start)
prev = vma;
@@ -1863,10 +1985,10 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
- BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
- vma->vm_userfaultfd_ctx.ctx != ctx);
- WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
+ VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async));
+ VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx &&
+ vma->vm_userfaultfd_ctx.ctx != ctx);
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
/*
* Nothing to do: this vma is already registered into this
@@ -1883,7 +2005,8 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
new_flags,
- (struct vm_userfaultfd_ctx){ctx});
+ (struct vm_userfaultfd_ctx){ctx},
+ /* give_up_on_oom = */false);
if (IS_ERR(vma))
return PTR_ERR(vma);
@@ -1941,8 +2064,8 @@ void userfaultfd_release_all(struct mm_struct *mm,
prev = NULL;
for_each_vma(vmi, vma) {
cond_resched();
- BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
- !!(vma->vm_flags & __VM_UFFD_FLAGS));
+ VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^
+ !!(vma->vm_flags & __VM_UFFD_FLAGS));
if (vma->vm_userfaultfd_ctx.ctx != ctx) {
prev = vma;
continue;
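The BUG_ON() to VM_WARN_ON_ONCE() conversions throughout this file demote fatal assertions to one-shot warnings that are only compiled in on CONFIG_DEBUG_VM builds; roughly, paraphrasing <linux/mmdebug.h> (the real macros differ in detail):

#ifdef CONFIG_DEBUG_VM
#define VM_WARN_ON_ONCE(cond)   (void)WARN_ON_ONCE(cond)
#else
#define VM_WARN_ON_ONCE(cond)   BUILD_BUG_ON_INVALID(cond)
#endif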
diff --git a/mm/util.c b/mm/util.c
index 60aa40f612b8..98d301875b75 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -12,6 +12,7 @@
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/sysctl.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
@@ -23,6 +24,8 @@
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
+#include <linux/fsnotify.h>
+#include <linux/page_idle.h>
#include <linux/uaccess.h>
@@ -563,12 +566,15 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
{
+ loff_t off = (loff_t)pgoff << PAGE_SHIFT;
unsigned long ret;
struct mm_struct *mm = current->mm;
unsigned long populate;
LIST_HEAD(uf);
ret = security_mmap_file(file, prot, flag);
+ if (!ret)
+ ret = fsnotify_mmap_perm(file, prot, off, len);
if (!ret) {
if (mmap_write_lock_killable(mm))
return -EINTR;
@@ -582,6 +588,23 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
return ret;
}
+/*
+ * Perform a userland memory mapping into the current process address space. See
+ * the comment for do_mmap() for more details on this operation in general.
+ *
+ * This differs from do_mmap() in that:
+ *
+ * a. An offset parameter is provided rather than pgoff, which is checked
+ *    both for overflow and for page alignment.
+ * b. mmap locking is performed on the caller's behalf.
+ * c. Userfaultfd unmap events and memory population are handled.
+ *
+ * This means that this function performs essentially the same work as if
+ * userland were invoking mmap(2).
+ *
+ * Returns either an error, or the address at which the requested mapping has
+ * been performed.
+ */
unsigned long vm_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long offset)
@@ -595,168 +618,6 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
}
EXPORT_SYMBOL(vm_mmap);
-static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
-{
- /*
- * We want to attempt a large physically contiguous block first because
- * it is less likely to fragment multiple larger blocks and therefore
- * contribute to a long term fragmentation less than vmalloc fallback.
- * However make sure that larger requests are not too disruptive - no
- * OOM killer and no allocation failure warnings as we have a fallback.
- */
- if (size > PAGE_SIZE) {
- flags |= __GFP_NOWARN;
-
- if (!(flags & __GFP_RETRY_MAYFAIL))
- flags |= __GFP_NORETRY;
-
- /* nofail semantic is implemented by the vmalloc fallback */
- flags &= ~__GFP_NOFAIL;
- }
-
- return flags;
-}
-
-/**
- * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
- * failure, fall back to non-contiguous (vmalloc) allocation.
- * @size: size of the request.
- * @b: which set of kmalloc buckets to allocate from.
- * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
- * @node: numa node to allocate from
- *
- * Uses kmalloc to get the memory but if the allocation fails then falls back
- * to the vmalloc allocator. Use kvfree for freeing the memory.
- *
- * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
- * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
- * preferable to the vmalloc fallback, due to visible performance drawbacks.
- *
- * Return: pointer to the allocated memory of %NULL in case of failure
- */
-void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
-{
- void *ret;
-
- /*
- * It doesn't really make sense to fallback to vmalloc for sub page
- * requests
- */
- ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b),
- kmalloc_gfp_adjust(flags, size),
- node);
- if (ret || size <= PAGE_SIZE)
- return ret;
-
- /* non-sleeping allocations are not supported by vmalloc */
- if (!gfpflags_allow_blocking(flags))
- return NULL;
-
- /* Don't even allow crazy sizes */
- if (unlikely(size > INT_MAX)) {
- WARN_ON_ONCE(!(flags & __GFP_NOWARN));
- return NULL;
- }
-
- /*
- * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
- * since the callers already cannot assume anything
- * about the resulting pointer, and cannot play
- * protection games.
- */
- return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
- flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
- node, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(__kvmalloc_node_noprof);
-
-/**
- * kvfree() - Free memory.
- * @addr: Pointer to allocated memory.
- *
- * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
- * It is slightly more efficient to use kfree() or vfree() if you are certain
- * that you know which one to use.
- *
- * Context: Either preemptible task context or not-NMI interrupt.
- */
-void kvfree(const void *addr)
-{
- if (is_vmalloc_addr(addr))
- vfree(addr);
- else
- kfree(addr);
-}
-EXPORT_SYMBOL(kvfree);
-
-/**
- * kvfree_sensitive - Free a data object containing sensitive information.
- * @addr: address of the data object to be freed.
- * @len: length of the data object.
- *
- * Use the special memzero_explicit() function to clear the content of a
- * kvmalloc'ed object containing sensitive data to make sure that the
- * compiler won't optimize out the data clearing.
- */
-void kvfree_sensitive(const void *addr, size_t len)
-{
- if (likely(!ZERO_OR_NULL_PTR(addr))) {
- memzero_explicit((void *)addr, len);
- kvfree(addr);
- }
-}
-EXPORT_SYMBOL(kvfree_sensitive);
-
-/**
- * kvrealloc - reallocate memory; contents remain unchanged
- * @p: object to reallocate memory for
- * @size: the size to reallocate
- * @flags: the flags for the page level allocator
- *
- * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
- * and @p is not a %NULL pointer, the object pointed to is freed.
- *
- * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
- * initial memory allocation, every subsequent call to this API for the same
- * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
- * __GFP_ZERO is not fully honored by this API.
- *
- * In any case, the contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes.
- *
- * This function must not be called concurrently with itself or kvfree() for the
- * same memory allocation.
- *
- * Return: pointer to the allocated memory or %NULL in case of error
- */
-void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
-{
- void *n;
-
- if (is_vmalloc_addr(p))
- return vrealloc_noprof(p, size, flags);
-
- n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size));
- if (!n) {
- /* We failed to krealloc(), fall back to kvmalloc(). */
- n = kvmalloc_noprof(size, flags);
- if (!n)
- return NULL;
-
- if (p) {
- /* We already know that `p` is not a vmalloc address. */
- kasan_disable_current();
- memcpy(n, kasan_reset_tag(p), ksize(p));
- kasan_enable_current();
-
- kfree(p);
- }
- }
-
- return n;
-}
-EXPORT_SYMBOL(kvrealloc_noprof);
-
/**
* __vmalloc_array - allocate memory for a virtually contiguous array.
* @n: number of elements.
@@ -811,9 +672,9 @@ struct anon_vma *folio_anon_vma(const struct folio *folio)
{
unsigned long mapping = (unsigned long)folio->mapping;
- if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
return NULL;
- return (void *)(mapping - PAGE_MAPPING_ANON);
+ return (void *)(mapping - FOLIO_MAPPING_ANON);
}
/**
@@ -840,7 +701,7 @@ struct address_space *folio_mapping(struct folio *folio)
return swap_address_space(folio->swap);
mapping = folio->mapping;
- if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
+ if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS)
return NULL;
return mapping;
@@ -889,14 +750,16 @@ int folio_mc_copy(struct folio *dst, struct folio *src)
EXPORT_SYMBOL(folio_mc_copy);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
-int sysctl_overcommit_ratio __read_mostly = 50;
-unsigned long sysctl_overcommit_kbytes __read_mostly;
+static int sysctl_overcommit_ratio __read_mostly = 50;
+static unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
-int overcommit_ratio_handler(const struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
+#ifdef CONFIG_SYSCTL
+
+static int overcommit_ratio_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -911,8 +774,8 @@ static void sync_overcommit_as(struct work_struct *dummy)
percpu_counter_sync(&vm_committed_as);
}
-int overcommit_policy_handler(const struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
+static int overcommit_policy_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int new_policy = -1;
@@ -947,8 +810,8 @@ int overcommit_policy_handler(const struct ctl_table *table, int write, void *bu
return ret;
}
-int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
+static int overcommit_kbytes_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -958,6 +821,54 @@ int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *bu
return ret;
}
+static const struct ctl_table util_sysctl_table[] = {
+ {
+ .procname = "overcommit_memory",
+ .data = &sysctl_overcommit_memory,
+ .maxlen = sizeof(sysctl_overcommit_memory),
+ .mode = 0644,
+ .proc_handler = overcommit_policy_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "overcommit_ratio",
+ .data = &sysctl_overcommit_ratio,
+ .maxlen = sizeof(sysctl_overcommit_ratio),
+ .mode = 0644,
+ .proc_handler = overcommit_ratio_handler,
+ },
+ {
+ .procname = "overcommit_kbytes",
+ .data = &sysctl_overcommit_kbytes,
+ .maxlen = sizeof(sysctl_overcommit_kbytes),
+ .mode = 0644,
+ .proc_handler = overcommit_kbytes_handler,
+ },
+ {
+ .procname = "user_reserve_kbytes",
+ .data = &sysctl_user_reserve_kbytes,
+ .maxlen = sizeof(sysctl_user_reserve_kbytes),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "admin_reserve_kbytes",
+ .data = &sysctl_admin_reserve_kbytes,
+ .maxlen = sizeof(sysctl_admin_reserve_kbytes),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+};
+
+static int __init init_vm_util_sysctls(void)
+{
+ register_sysctl_init("vm", util_sysctl_table);
+ return 0;
+}
+subsys_initcall(init_vm_util_sysctls);
+#endif /* CONFIG_SYSCTL */
+
/*
* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
*/
@@ -1222,3 +1133,152 @@ void flush_dcache_folio(struct folio *folio)
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif
+
+/**
+ * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an
+ * existing VMA
+ * @file: The file which possesses an f_op->mmap_prepare() hook
+ * @vma: The VMA to apply the .mmap_prepare() hook to.
+ *
+ * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
+ * 'wrapper' file systems invoke a nested mmap hook of an underlying file.
+ *
+ * Until all filesystems are converted to use .mmap_prepare(), we must be
+ * conservative and continue to invoke these 'wrapper' filesystems using the
+ * deprecated .mmap() hook.
+ *
+ * However, we have a problem if the underlying file system possesses an
+ * .mmap_prepare() hook, as we are in a different context when we invoke the
+ * .mmap() hook, already having a VMA to deal with.
+ *
+ * compat_vma_mmap_prepare() is a compatibility function that takes VMA state,
+ * establishes a struct vm_area_desc descriptor, passes it to the underlying
+ * .mmap_prepare() hook and applies any changes performed by it.
+ *
+ * Once the conversion of filesystems is complete this function will no longer
+ * be required and will be removed.
+ *
+ * Returns: 0 on success or error.
+ */
+int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma)
+{
+ struct vm_area_desc desc;
+ int err;
+
+ err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc));
+ if (err)
+ return err;
+ set_vma_from_desc(vma, &desc);
+
+ return 0;
+}
+EXPORT_SYMBOL(compat_vma_mmap_prepare);
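A hedged sketch of the intended call site: a not-yet-converted 'wrapper' filesystem whose legacy .mmap() hook delegates to a backing file that already provides .mmap_prepare(). wrapperfs_backing_file() is a hypothetical helper, and file-reference bookkeeping (e.g. vma_set_file()) is omitted:

static int wrapperfs_mmap(struct file *file, struct vm_area_struct *vma)
{
        /* Hypothetical: resolve the real file this wrapper sits on top of. */
        struct file *backing = wrapperfs_backing_file(file);

        if (backing->f_op->mmap_prepare)
                return compat_vma_mmap_prepare(backing, vma);

        /* Backing file not converted yet: fall back to its legacy hook. */
        return backing->f_op->mmap(backing, vma);
}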
+
+static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
+ const struct page *page)
+{
+ /*
+ * Only the first page of a high-order buddy page has PageBuddy() set.
+ * So we have to check manually whether this page is part of a high-
+ * order buddy page.
+ */
+ if (PageBuddy(page))
+ ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
+ else if (page_count(page) == 0 && is_free_buddy_page(page))
+ ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
+
+ if (folio_test_idle(folio))
+ ps->flags |= PAGE_SNAPSHOT_PG_IDLE;
+}
+
+/**
+ * snapshot_page() - Create a snapshot of a struct page
+ * @ps: Pointer to a struct page_snapshot to store the page snapshot
+ * @page: The page to snapshot
+ *
+ * Create a snapshot of the page and store both its struct page and struct
+ * folio representations in @ps.
+ *
+ * A snapshot is marked as "faithful" if the compound state of @page was
+ * stable and allowed safe reconstruction of the folio representation. In
+ * rare cases where this is not possible (e.g. due to folio splitting),
+ * snapshot_page() falls back to treating @page as a single page and the
+ * snapshot is marked as "unfaithful". The snapshot_page_is_faithful()
+ * helper can be used to check for this condition.
+ */
+void snapshot_page(struct page_snapshot *ps, const struct page *page)
+{
+ unsigned long head, nr_pages = 1;
+ struct folio *foliop;
+ int loops = 5;
+
+ ps->pfn = page_to_pfn(page);
+ ps->flags = PAGE_SNAPSHOT_FAITHFUL;
+
+again:
+ memset(&ps->folio_snapshot, 0, sizeof(struct folio));
+ memcpy(&ps->page_snapshot, page, sizeof(*page));
+ head = ps->page_snapshot.compound_head;
+ if ((head & 1) == 0) {
+ ps->idx = 0;
+ foliop = (struct folio *)&ps->page_snapshot;
+ if (!folio_test_large(foliop)) {
+ set_ps_flags(ps, page_folio(page), page);
+ memcpy(&ps->folio_snapshot, foliop,
+ sizeof(struct page));
+ return;
+ }
+ foliop = (struct folio *)page;
+ } else {
+ foliop = (struct folio *)(head - 1);
+ ps->idx = folio_page_idx(foliop, page);
+ }
+
+ if (ps->idx < MAX_FOLIO_NR_PAGES) {
+ memcpy(&ps->folio_snapshot, foliop, 2 * sizeof(struct page));
+ nr_pages = folio_nr_pages(&ps->folio_snapshot);
+ if (nr_pages > 1)
+ memcpy(&ps->folio_snapshot.__page_2, &foliop->__page_2,
+ sizeof(struct page));
+ set_ps_flags(ps, foliop, page);
+ }
+
+ if (ps->idx > nr_pages) {
+ if (loops-- > 0)
+ goto again;
+ clear_compound_head(&ps->page_snapshot);
+ foliop = (struct folio *)&ps->page_snapshot;
+ memcpy(&ps->folio_snapshot, foliop, sizeof(struct page));
+ ps->flags = 0;
+ ps->idx = 0;
+ }
+}
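A short usage sketch for the new helper; snapshot_page_is_faithful() is the accessor mentioned in the kernel-doc above. Illustrative debugging use only, not part of this patch:

static void dump_pfn_state(unsigned long pfn)
{
        struct page_snapshot ps;

        snapshot_page(&ps, pfn_to_page(pfn));

        pr_info("pfn %#lx idx %lu faithful=%d large=%d\n",
                ps.pfn, (unsigned long)ps.idx,
                snapshot_page_is_faithful(&ps),
                folio_test_large(&ps.folio_snapshot));
}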
+
+#ifdef CONFIG_MMU
+/**
+ * folio_pte_batch - detect a PTE batch for a large folio
+ * @folio: The large folio to detect a PTE batch for.
+ * @ptep: Page table pointer for the first entry.
+ * @pte: Page table entry for the first page.
+ * @max_nr: The maximum number of table entries to consider.
+ *
+ * This is a simplified variant of folio_pte_batch_flags().
+ *
+ * Detect a PTE batch: consecutive (present) PTEs that map consecutive
+ * pages of the same large folio in a single VMA and a single page table.
+ *
+ * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
+ * the accessed bit, writable bit, dirty bit and soft-dirty bit.
+ *
+ * ptep must map any page of the folio. max_nr must be at least one and
+ * must be limited by the caller so scanning cannot exceed a single VMA and
+ * a single page table.
+ *
+ * Return: the number of table entries in the batch.
+ */
+unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
+ unsigned int max_nr)
+{
+ return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0);
+}
+#endif /* CONFIG_MMU */
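A hedged sketch of the typical caller-side loop for folio_pte_batch(): batch present PTEs that map one large folio within a single page table, with the caller bounding max_nr so the scan stays inside the VMA and the PTE page (pattern only, not lifted from any specific call site):

static void scan_ptes_sketch(struct vm_area_struct *vma, pte_t *pte,
                             unsigned long addr, unsigned long end)
{
        while (addr < end) {
                unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
                pte_t ptent = ptep_get(pte);
                unsigned int nr = 1;

                if (pte_present(ptent)) {
                        struct folio *folio = vm_normal_folio(vma, addr, ptent);

                        if (folio && folio_test_large(folio))
                                nr = folio_pte_batch(folio, pte, ptent, max_nr);
                        /* ... operate on 'nr' pages of 'folio' at once ... */
                }
                addr += nr * PAGE_SIZE;
                pte += nr;
        }
}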
diff --git a/mm/vma.c b/mm/vma.c
index b126683397fc..3b12c7579831 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -15,11 +15,15 @@ struct mmap_state {
unsigned long end;
pgoff_t pgoff;
unsigned long pglen;
- unsigned long flags;
+ vm_flags_t vm_flags;
struct file *file;
+ pgprot_t page_prot;
+
+ /* User-defined fields, perhaps updated by .mmap_prepare(). */
+ const struct vm_operations_struct *vm_ops;
+ void *vm_private_data;
unsigned long charged;
- bool retry_merge;
struct vm_area_struct *prev;
struct vm_area_struct *next;
@@ -28,9 +32,12 @@ struct mmap_state {
struct vma_munmap_struct vms;
struct ma_state mas_detach;
struct maple_tree mt_detach;
+
+ /* Determine if we can check KSM flags early in mmap() logic. */
+ bool check_ksm_early;
};
-#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \
+#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \
struct mmap_state name = { \
.mm = mm_, \
.vmi = vmi_, \
@@ -38,8 +45,9 @@ struct mmap_state {
.end = (addr_) + (len_), \
.pgoff = pgoff_, \
.pglen = PHYS_PFN(len_), \
- .flags = flags_, \
+ .vm_flags = vm_flags_, \
.file = file_, \
+ .page_prot = vm_get_page_prot(vm_flags_), \
}
#define VMG_MMAP_STATE(name, map_, vma_) \
@@ -48,16 +56,31 @@ struct mmap_state {
.vmi = (map_)->vmi, \
.start = (map_)->addr, \
.end = (map_)->end, \
- .flags = (map_)->flags, \
+ .vm_flags = (map_)->vm_flags, \
.pgoff = (map_)->pgoff, \
.file = (map_)->file, \
.prev = (map_)->prev, \
- .vma = vma_, \
+ .middle = vma_, \
.next = (vma_) ? NULL : (map_)->next, \
.state = VMA_MERGE_START, \
- .merge_flags = VMG_FLAG_DEFAULT, \
}
+/*
+ * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain
+ * more than one anon_vma_chain connecting it to more than one anon_vma. A merge
+ * would mean a wider range of folios sharing the root anon_vma lock, and thus
+ * potential lock contention; we do not wish to encourage merging such that this
+ * scales to a problem.
+ */
+static bool vma_had_uncowed_parents(struct vm_area_struct *vma)
+{
+ /*
+	 * The list_is_singular() test is to avoid merging VMAs cloned from
+	 * parents; such merges can hurt scalability due to anon_vma lock contention.
+ */
+ return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain);
+}
+
static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
{
struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
@@ -72,7 +95,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
* the kernel to generate new VMAs when old one could be
* extended instead.
*/
- if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
+ if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_SOFTDIRTY)
return false;
if (vma->vm_file != vmg->file)
return false;
@@ -83,53 +106,75 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
return true;
}
-static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
- struct anon_vma *anon_vma2, struct vm_area_struct *vma)
+static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next)
{
+ struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev;
+	struct vm_area_struct *src = vmg->middle; /* existing merge case. */
+ struct anon_vma *tgt_anon = tgt->anon_vma;
+ struct anon_vma *src_anon = vmg->anon_vma;
+
/*
- * The list_is_singular() test is to avoid merging VMA cloned from
- * parents. This can improve scalability caused by anon_vma lock.
+ * We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we
+	 * will remove the existing VMA's anon_vmas, so there are no scalability
+	 * concerns.
*/
- if ((!anon_vma1 || !anon_vma2) && (!vma ||
- list_is_singular(&vma->anon_vma_chain)))
- return true;
- return anon_vma1 == anon_vma2;
-}
+ VM_WARN_ON(src && src_anon != src->anon_vma);
-/* Are the anon_vma's belonging to each VMA compatible with one another? */
-static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1,
- struct vm_area_struct *vma2)
-{
- return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL);
+ /* Case 1 - we will dup_anon_vma() from src into tgt. */
+ if (!tgt_anon && src_anon)
+ return !vma_had_uncowed_parents(src);
+ /* Case 2 - we will simply use tgt's anon_vma. */
+ if (tgt_anon && !src_anon)
+ return !vma_had_uncowed_parents(tgt);
+ /* Case 3 - the anon_vma's are already shared. */
+ return src_anon == tgt_anon;
}
/*
* init_multi_vma_prep() - Initializer for struct vma_prepare
* @vp: The vma_prepare struct
* @vma: The vma that will be altered once locked
- * @next: The next vma if it is to be adjusted
- * @remove: The first vma to be removed
- * @remove2: The second vma to be removed
+ * @vmg: The merge state that will be used to determine adjustment and VMA
+ * removal.
*/
static void init_multi_vma_prep(struct vma_prepare *vp,
struct vm_area_struct *vma,
- struct vm_area_struct *next,
- struct vm_area_struct *remove,
- struct vm_area_struct *remove2)
+ struct vma_merge_struct *vmg)
{
+ struct vm_area_struct *adjust;
+ struct vm_area_struct **remove = &vp->remove;
+
memset(vp, 0, sizeof(struct vma_prepare));
vp->vma = vma;
vp->anon_vma = vma->anon_vma;
- vp->remove = remove;
- vp->remove2 = remove2;
- vp->adj_next = next;
- if (!vp->anon_vma && next)
- vp->anon_vma = next->anon_vma;
+
+ if (vmg && vmg->__remove_middle) {
+ *remove = vmg->middle;
+ remove = &vp->remove2;
+ }
+ if (vmg && vmg->__remove_next)
+ *remove = vmg->next;
+
+ if (vmg && vmg->__adjust_middle_start)
+ adjust = vmg->middle;
+ else if (vmg && vmg->__adjust_next_start)
+ adjust = vmg->next;
+ else
+ adjust = NULL;
+
+ vp->adj_next = adjust;
+ if (!vp->anon_vma && adjust)
+ vp->anon_vma = adjust->anon_vma;
+
+ VM_WARN_ON(vp->anon_vma && adjust && adjust->anon_vma &&
+ vp->anon_vma != adjust->anon_vma);
vp->file = vma->vm_file;
if (vp->file)
vp->mapping = vma->vm_file->f_mapping;
+ if (vmg && vmg->skip_vma_uprobe)
+ vp->skip_vma_uprobe = true;
}
/*
@@ -150,7 +195,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg)
pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
if (is_mergeable_vma(vmg, /* merge_next = */ true) &&
- is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
+ is_mergeable_anon_vma(vmg, /* merge_next = */ true)) {
if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
return true;
}
@@ -170,7 +215,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg)
static bool can_vma_merge_after(struct vma_merge_struct *vmg)
{
if (is_mergeable_vma(vmg, /* merge_next = */ false) &&
- is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
+ is_mergeable_anon_vma(vmg, /* merge_next = */ false)) {
if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
return true;
}
@@ -203,6 +248,38 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
}
/*
+ * vma has some anon_vma assigned, and is already inserted on that
+ * anon_vma's interval trees.
+ *
+ * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
+ * vma must be removed from the anon_vma's interval trees using
+ * anon_vma_interval_tree_pre_update_vma().
+ *
+ * After the update, the vma will be reinserted using
+ * anon_vma_interval_tree_post_update_vma().
+ *
+ * The entire update must be protected by exclusive mmap_lock and by
+ * the root anon_vma's mutex.
+ */
+static void
+anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc;
+
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
+}
+
+static void
+anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc;
+
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
+}
+
+/*
* vma_prepare() - Helper function for handling locking VMAs prior to altering
* @vp: The initialized vma_prepare struct
*/
@@ -274,7 +351,7 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
* us to insert it before dropping the locks
* (it may either follow vma or precede it).
*/
- vma_iter_store(vmi, vp->insert);
+ vma_iter_store_new(vmi, vp->insert);
mm->map_count++;
}
@@ -287,15 +364,18 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
if (vp->file) {
i_mmap_unlock_write(vp->mapping);
- uprobe_mmap(vp->vma);
- if (vp->adj_next)
- uprobe_mmap(vp->adj_next);
+ if (!vp->skip_vma_uprobe) {
+ uprobe_mmap(vp->vma);
+
+ if (vp->adj_next)
+ uprobe_mmap(vp->adj_next);
+ }
}
if (vp->remove) {
again:
- vma_mark_detached(vp->remove, true);
+ vma_mark_detached(vp->remove);
if (vp->file) {
uprobe_munmap(vp->remove, vp->remove->vm_start,
vp->remove->vm_end);
@@ -330,7 +410,7 @@ again:
*/
static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma)
{
- init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
+ init_multi_vma_prep(vp, vma, NULL);
}
/*
@@ -354,8 +434,10 @@ static bool can_vma_merge_left(struct vma_merge_struct *vmg)
static bool can_vma_merge_right(struct vma_merge_struct *vmg,
bool can_merge_left)
{
- if (!vmg->next || vmg->end != vmg->next->vm_start ||
- !can_vma_merge_before(vmg))
+ struct vm_area_struct *next = vmg->next;
+ struct vm_area_struct *prev;
+
+ if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg))
return false;
if (!can_merge_left)
@@ -368,23 +450,22 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg,
*
* We therefore check this in addition to mergeability to either side.
*/
- return are_anon_vmas_compatible(vmg->prev, vmg->next);
+ prev = vmg->prev;
+ return !prev->anon_vma || !next->anon_vma ||
+ prev->anon_vma == next->anon_vma;
}
/*
* Close a vm structure and free it.
*/
-void remove_vma(struct vm_area_struct *vma, bool unreachable)
+void remove_vma(struct vm_area_struct *vma)
{
might_sleep();
vma_close(vma);
if (vma->vm_file)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
- if (unreachable)
- __vm_area_free(vma);
- else
- vm_area_free(vma);
+ vm_area_free(vma);
}
/*
@@ -398,7 +479,6 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
- lru_add_drain();
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
@@ -415,8 +495,9 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
* has already been checked or doesn't make sense to fail.
* VMA Iterator will point to the original VMA.
*/
-static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long addr, int new_below)
+static __must_check int
+__split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long addr, int new_below)
{
struct vma_prepare vp;
struct vm_area_struct *new;
@@ -467,7 +548,14 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
- vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
+
+ /*
+ * Get rid of huge pages and shared page tables straddling the split
+ * boundary.
+ */
+ vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_split(vma, addr);
if (new_below) {
vma->vm_start = addr;
@@ -511,39 +599,9 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
}
/*
- * vma has some anon_vma assigned, and is already inserted on that
- * anon_vma's interval trees.
+ * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the
+ * instance that the destination VMA has no anon_vma but the source does.
*
- * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
- * vma must be removed from the anon_vma's interval trees using
- * anon_vma_interval_tree_pre_update_vma().
- *
- * After the update, the vma will be reinserted using
- * anon_vma_interval_tree_post_update_vma().
- *
- * The entire update must be protected by exclusive mmap_lock and by
- * the root anon_vma's mutex.
- */
-void
-anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
-{
- struct anon_vma_chain *avc;
-
- list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
- anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
-}
-
-void
-anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
-{
- struct anon_vma_chain *avc;
-
- list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
- anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
-}
-
-/*
- * dup_anon_vma() - Helper function to duplicate anon_vma
* @dst: The destination VMA
* @src: The source VMA
* @dup: Pointer to the destination VMA when successful.
@@ -554,9 +612,18 @@ static int dup_anon_vma(struct vm_area_struct *dst,
struct vm_area_struct *src, struct vm_area_struct **dup)
{
/*
- * Easily overlooked: when mprotect shifts the boundary, make sure the
- * expanding vma has anon_vma set if the shrinking vma had, to cover any
- * anon pages imported.
+ * There are three cases to consider for correctly propagating
+ * anon_vma's on merge.
+ *
+ * The first is trivial - neither VMA has anon_vma, we need not do
+ * anything.
+ *
+ * The second where both have anon_vma is also a no-op, as they must
+ * then be the same, so there is simply nothing to copy.
+ *
+ * Here we cover the third - if the destination VMA has no anon_vma,
+ * that is it is unfaulted, we need to ensure that the newly merged
+ * range is referenced by the anon_vma's of the source.
*/
if (src->anon_vma && !dst->anon_vma) {
int ret;
@@ -629,49 +696,75 @@ void validate_mm(struct mm_struct *mm)
}
#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
-/* Actually perform the VMA merge operation. */
-static int commit_merge(struct vma_merge_struct *vmg,
- struct vm_area_struct *adjust,
- struct vm_area_struct *remove,
- struct vm_area_struct *remove2,
- long adj_start,
- bool expanded)
+/*
+ * Based on the vmg flag indicating whether we need to adjust the vm_start field
+ * for the middle or next VMA, we calculate what the range of the newly adjusted
+ * VMA ought to be, and set the VMA's range accordingly.
+ */
+static void vmg_adjust_set_range(struct vma_merge_struct *vmg)
{
- struct vma_prepare vp;
+ struct vm_area_struct *adjust;
+ pgoff_t pgoff;
+
+ if (vmg->__adjust_middle_start) {
+ adjust = vmg->middle;
+ pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start);
+ } else if (vmg->__adjust_next_start) {
+ adjust = vmg->next;
+ pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end);
+ } else {
+ return;
+ }
- init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);
+ vma_set_range(adjust, vmg->end, adjust->vm_end, pgoff);
+}
- VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
- vp.anon_vma != adjust->anon_vma);
+/*
+ * Actually perform the VMA merge operation.
+ *
+ * IMPORTANT: We guarantee that, should vmg->give_up_on_oom be set, we will not
+ * modify any VMAs or cause inconsistent state should an OOM condition arise.
+ *
+ * Returns 0 on success, or an error value on failure.
+ */
+static int commit_merge(struct vma_merge_struct *vmg)
+{
+ struct vm_area_struct *vma;
+ struct vma_prepare vp;
- if (expanded) {
- /* Note: vma iterator must be pointing to 'start'. */
- vma_iter_config(vmg->vmi, vmg->start, vmg->end);
+ if (vmg->__adjust_next_start) {
+ /* We manipulate middle and adjust next, which is the target. */
+ vma = vmg->middle;
+ vma_iter_config(vmg->vmi, vmg->end, vmg->next->vm_end);
} else {
- vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
- adjust->vm_end);
+ vma = vmg->target;
+ /* Note: vma iterator must be pointing to 'start'. */
+ vma_iter_config(vmg->vmi, vmg->start, vmg->end);
}
- if (vma_iter_prealloc(vmg->vmi, vmg->vma))
+ init_multi_vma_prep(&vp, vma, vmg);
+
+ /*
+ * If vmg->give_up_on_oom is set, we're safe, because we don't actually
+ * manipulate any VMAs until we succeed at preallocation.
+ *
+ * Past this point, we will not return an error.
+ */
+ if (vma_iter_prealloc(vmg->vmi, vma))
return -ENOMEM;
vma_prepare(&vp);
- vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start);
- vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff);
-
- if (expanded)
- vma_iter_store(vmg->vmi, vmg->vma);
-
- if (adj_start) {
- adjust->vm_start += adj_start;
- adjust->vm_pgoff += PHYS_PFN(adj_start);
- if (adj_start < 0) {
- WARN_ON(expanded);
- vma_iter_store(vmg->vmi, adjust);
- }
- }
+ /*
+ * THP pages may need to do additional splits if we increase
+ * middle->vm_start.
+ */
+ vma_adjust_trans_huge(vma, vmg->start, vmg->end,
+ vmg->__adjust_middle_start ? vmg->middle : NULL);
+ vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
+ vmg_adjust_set_range(vmg);
+ vma_iter_store_overwrite(vmg->vmi, vmg->target);
- vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm);
+ vma_complete(&vp, vmg->vmi, vma->vm_mm);
return 0;
}
@@ -694,8 +787,9 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma)
* identical properties.
*
* This function checks for the existence of any such mergeable VMAs and updates
- * the maple tree describing the @vmg->vma->vm_mm address space to account for
- * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge.
+ * the maple tree describing the @vmg->middle->vm_mm address space to account
+ * for this, as well as any VMAs shrunk/expanded/deleted as a result of this
+ * merge.
*
* As part of this operation, if a merge occurs, the @vmg object will have its
* vma, start, end, and pgoff fields modified to execute the merge. Subsequent
@@ -704,43 +798,43 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma)
* Returns: The merged VMA if merge succeeds, or NULL otherwise.
*
* ASSUMPTIONS:
- * - The caller must assign the VMA to be modifed to @vmg->vma.
+ * - The caller must assign the VMA to be modified to @vmg->middle.
* - The caller must have set @vmg->prev to the previous VMA, if there is one.
* - The caller must not set @vmg->next, as we determine this.
* - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
- * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end).
+ * - vmi must be positioned within [@vmg->middle->vm_start, @vmg->middle->vm_end).
*/
-static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg)
+static __must_check struct vm_area_struct *vma_merge_existing_range(
+ struct vma_merge_struct *vmg)
{
- struct vm_area_struct *vma = vmg->vma;
+ struct vm_area_struct *middle = vmg->middle;
struct vm_area_struct *prev = vmg->prev;
- struct vm_area_struct *next, *res;
+ struct vm_area_struct *next;
struct vm_area_struct *anon_dup = NULL;
- struct vm_area_struct *adjust = NULL;
unsigned long start = vmg->start;
unsigned long end = vmg->end;
- bool left_side = vma && start == vma->vm_start;
- bool right_side = vma && end == vma->vm_end;
+ bool left_side = middle && start == middle->vm_start;
+ bool right_side = middle && end == middle->vm_end;
int err = 0;
- long adj_start = 0;
- bool merge_will_delete_vma, merge_will_delete_next;
bool merge_left, merge_right, merge_both;
- bool expanded;
mmap_assert_write_locked(vmg->mm);
- VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */
- VM_WARN_ON(vmg->next); /* We set this. */
- VM_WARN_ON(prev && start <= prev->vm_start);
- VM_WARN_ON(start >= end);
+ VM_WARN_ON_VMG(!middle, vmg); /* We are modifying a VMA, so caller must specify. */
+ VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */
+ VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg);
+ VM_WARN_ON_VMG(start >= end, vmg);
+
/*
- * If vma == prev, then we are offset into a VMA. Otherwise, if we are
+ * If middle == prev, then we are offset into a VMA. Otherwise, if we are
* not, we must span a portion of the VMA.
*/
- VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) ||
- vmg->end > vma->vm_end));
- /* The vmi must be positioned within vmg->vma. */
- VM_WARN_ON(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start &&
- vma_iter_addr(vmg->vmi) < vma->vm_end));
+ VM_WARN_ON_VMG(middle &&
+ ((middle != prev && vmg->start != middle->vm_start) ||
+ vmg->end > middle->vm_end), vmg);
+ /* The vmi must be positioned within vmg->middle. */
+ VM_WARN_ON_VMG(middle &&
+ !(vma_iter_addr(vmg->vmi) >= middle->vm_start &&
+ vma_iter_addr(vmg->vmi) < middle->vm_end), vmg);
vmg->state = VMA_MERGE_NOMERGE;
@@ -749,7 +843,7 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *
* furthermost left or right side of the VMA, then we have no chance of
* merging and should abort.
*/
- if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
+ if (vmg->vm_flags & VM_SPECIAL || (!left_side && !right_side))
return NULL;
if (left_side)
@@ -774,49 +868,52 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *
merge_both = merge_left && merge_right;
/* If we span the entire VMA, a merge implies it will be deleted. */
- merge_will_delete_vma = left_side && right_side;
+ vmg->__remove_middle = left_side && right_side;
/*
- * If we need to remove vma in its entirety but are unable to do so,
+ * If we need to remove middle in its entirety but are unable to do so,
* we have no sensible recourse but to abort the merge.
*/
- if (merge_will_delete_vma && !can_merge_remove_vma(vma))
+ if (vmg->__remove_middle && !can_merge_remove_vma(middle))
return NULL;
/*
* If we merge both VMAs, then next is also deleted. This implies
* merge_will_delete_vma also.
*/
- merge_will_delete_next = merge_both;
+ vmg->__remove_next = merge_both;
/*
* If we cannot delete next, then we can reduce the operation to merging
- * prev and vma (thereby deleting vma).
+ * prev and middle (thereby deleting middle).
*/
- if (merge_will_delete_next && !can_merge_remove_vma(next)) {
- merge_will_delete_next = false;
+ if (vmg->__remove_next && !can_merge_remove_vma(next)) {
+ vmg->__remove_next = false;
merge_right = false;
merge_both = false;
}
- /* No matter what happens, we will be adjusting vma. */
- vma_start_write(vma);
-
- if (merge_left)
- vma_start_write(prev);
+ /* No matter what happens, we will be adjusting middle. */
+ vma_start_write(middle);
- if (merge_right)
+ if (merge_right) {
vma_start_write(next);
+ vmg->target = next;
+ }
+
+ if (merge_left) {
+ vma_start_write(prev);
+ vmg->target = prev;
+ }
if (merge_both) {
/*
- * |<----->|
- * |-------*********-------|
- * prev vma next
- * extend delete delete
+ * |<-------------------->|
+ * |-------********-------|
+ * prev middle next
+ * extend delete delete
*/
- vmg->vma = prev;
vmg->start = prev->vm_start;
vmg->end = next->vm_end;
vmg->pgoff = prev->vm_pgoff;
@@ -824,97 +921,77 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *
/*
* We already ensured anon_vma compatibility above, so now it's
* simply a case of, if prev has no anon_vma object, which of
- * next or vma contains the anon_vma we must duplicate.
+ * next or middle contains the anon_vma we must duplicate.
*/
- err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup);
+ err = dup_anon_vma(prev, next->anon_vma ? next : middle,
+ &anon_dup);
} else if (merge_left) {
/*
- * |<----->| OR
- * |<--------->|
+ * |<------------>| OR
+ * |<----------------->|
* |-------*************
- * prev vma
+ * prev middle
* extend shrink/delete
*/
- vmg->vma = prev;
vmg->start = prev->vm_start;
vmg->pgoff = prev->vm_pgoff;
- if (!merge_will_delete_vma) {
- adjust = vma;
- adj_start = vmg->end - vma->vm_start;
- }
+ if (!vmg->__remove_middle)
+ vmg->__adjust_middle_start = true;
- err = dup_anon_vma(prev, vma, &anon_dup);
+ err = dup_anon_vma(prev, middle, &anon_dup);
} else { /* merge_right */
/*
- * |<----->| OR
- * |<--------->|
+ * |<------------->| OR
+ * |<----------------->|
* *************-------|
- * vma next
+ * middle next
* shrink/delete extend
*/
pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
- VM_WARN_ON(!merge_right);
- /* If we are offset into a VMA, then prev must be vma. */
- VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev);
+ VM_WARN_ON_VMG(!merge_right, vmg);
+ /* If we are offset into a VMA, then prev must be middle. */
+ VM_WARN_ON_VMG(vmg->start > middle->vm_start && prev && middle != prev, vmg);
- if (merge_will_delete_vma) {
- vmg->vma = next;
+ if (vmg->__remove_middle) {
vmg->end = next->vm_end;
vmg->pgoff = next->vm_pgoff - pglen;
} else {
- /*
- * We shrink vma and expand next.
- *
- * IMPORTANT: This is the ONLY case where the final
- * merged VMA is NOT vmg->vma, but rather vmg->next.
- */
-
- vmg->start = vma->vm_start;
+ /* We shrink middle and expand next. */
+ vmg->__adjust_next_start = true;
+ vmg->start = middle->vm_start;
vmg->end = start;
- vmg->pgoff = vma->vm_pgoff;
-
- adjust = next;
- adj_start = -(vma->vm_end - start);
+ vmg->pgoff = middle->vm_pgoff;
}
- err = dup_anon_vma(next, vma, &anon_dup);
+ err = dup_anon_vma(next, middle, &anon_dup);
}
- if (err)
+ if (err || commit_merge(vmg))
goto abort;
- /*
- * In nearly all cases, we expand vmg->vma. There is one exception -
- * merge_right where we partially span the VMA. In this case we shrink
- * the end of vmg->vma and adjust the start of vmg->next accordingly.
- */
- expanded = !merge_right || merge_will_delete_vma;
-
- if (commit_merge(vmg, adjust,
- merge_will_delete_vma ? vma : NULL,
- merge_will_delete_next ? next : NULL,
- adj_start, expanded)) {
- if (anon_dup)
- unlink_anon_vmas(anon_dup);
-
- vmg->state = VMA_MERGE_ERROR_NOMEM;
- return NULL;
- }
-
- res = merge_left ? prev : next;
- khugepaged_enter_vma(res, vmg->flags);
-
+ khugepaged_enter_vma(vmg->target, vmg->vm_flags);
vmg->state = VMA_MERGE_SUCCESS;
- return res;
+ return vmg->target;
abort:
vma_iter_set(vmg->vmi, start);
vma_iter_load(vmg->vmi);
- vmg->state = VMA_MERGE_ERROR_NOMEM;
+
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
+
+ /*
+ * This means we have failed to clone anon_vmas correctly, but no
+ * actual changes to VMAs have occurred, so no harm no foul - if the
+ * user doesn't want this reported and instead just wants to give up on
+ * the merge, allow it.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return NULL;
}
@@ -968,32 +1045,32 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
struct vm_area_struct *next = vmg->next;
unsigned long end = vmg->end;
bool can_merge_left, can_merge_right;
- bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND;
mmap_assert_write_locked(vmg->mm);
- VM_WARN_ON(vmg->vma);
+ VM_WARN_ON_VMG(vmg->middle, vmg);
+ VM_WARN_ON_VMG(vmg->target, vmg);
/* vmi must point at or before the gap. */
- VM_WARN_ON(vma_iter_addr(vmg->vmi) > end);
+ VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg);
vmg->state = VMA_MERGE_NOMERGE;
/* Special VMAs are unmergeable, also if no prev/next. */
- if ((vmg->flags & VM_SPECIAL) || (!prev && !next))
+ if ((vmg->vm_flags & VM_SPECIAL) || (!prev && !next))
return NULL;
can_merge_left = can_vma_merge_left(vmg);
- can_merge_right = !just_expand && can_vma_merge_right(vmg, can_merge_left);
+ can_merge_right = !vmg->just_expand && can_vma_merge_right(vmg, can_merge_left);
/* If we can merge with the next VMA, adjust vmg accordingly. */
if (can_merge_right) {
vmg->end = next->vm_end;
- vmg->vma = next;
+ vmg->target = next;
}
/* If we can merge with the previous VMA, adjust vmg accordingly. */
if (can_merge_left) {
vmg->start = prev->vm_start;
- vmg->vma = prev;
+ vmg->target = prev;
vmg->pgoff = prev->vm_pgoff;
/*
@@ -1005,7 +1082,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
vmg->end = end;
/* In expand-only case we are already positioned at prev. */
- if (!just_expand) {
+ if (!vmg->just_expand) {
/* Equivalent to going to the previous range. */
vma_prev(vmg->vmi);
}
@@ -1015,10 +1092,10 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
* Now try to expand adjacent VMA(s). This takes care of removing the
* following VMA if we have VMAs on both sides.
*/
- if (vmg->vma && !vma_expand(vmg)) {
- khugepaged_enter_vma(vmg->vma, vmg->flags);
+ if (vmg->target && !vma_expand(vmg)) {
+ khugepaged_enter_vma(vmg->target, vmg->vm_flags);
vmg->state = VMA_MERGE_SUCCESS;
- return vmg->vma;
+ return vmg->target;
}
return NULL;
@@ -1030,53 +1107,69 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
* @vmg: Describes a VMA expansion operation.
*
* Expand @vma to vmg->start and vmg->end. Can expand off the start and end.
- * Will expand over vmg->next if it's different from vmg->vma and vmg->end ==
- * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with
+ * Will expand over vmg->next if it's different from vmg->target and vmg->end ==
+ * vmg->next->vm_end. Checking if the vmg->target can expand and merge with
* vmg->next needs to be handled by the caller.
*
* Returns: 0 on success.
*
* ASSUMPTIONS:
- * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
- * - The caller must have set @vmg->vma and @vmg->next.
+ * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
+ * - The caller must have set @vmg->target and @vmg->next.
*/
int vma_expand(struct vma_merge_struct *vmg)
{
struct vm_area_struct *anon_dup = NULL;
bool remove_next = false;
- struct vm_area_struct *vma = vmg->vma;
+ struct vm_area_struct *target = vmg->target;
struct vm_area_struct *next = vmg->next;
+ VM_WARN_ON_VMG(!target, vmg);
+
mmap_assert_write_locked(vmg->mm);
- vma_start_write(vma);
- if (next && (vma != next) && (vmg->end == next->vm_end)) {
+ vma_start_write(target);
+ if (next && (target != next) && (vmg->end == next->vm_end)) {
int ret;
remove_next = true;
/* This should already have been checked by this point. */
- VM_WARN_ON(!can_merge_remove_vma(next));
+ VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg);
vma_start_write(next);
- ret = dup_anon_vma(vma, next, &anon_dup);
+ /*
+ * In this case we don't report OOM, so vmg->give_up_on_oom is
+ * safe.
+ */
+ ret = dup_anon_vma(target, next, &anon_dup);
if (ret)
return ret;
}
/* Not merging but overwriting any part of next is not handled. */
- VM_WARN_ON(next && !remove_next &&
- next != vma && vmg->end > next->vm_start);
+ VM_WARN_ON_VMG(next && !remove_next &&
+ next != target && vmg->end > next->vm_start, vmg);
/* Only handles expanding */
- VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end);
+ VM_WARN_ON_VMG(target->vm_start < vmg->start ||
+ target->vm_end > vmg->end, vmg);
+
+ if (remove_next)
+ vmg->__remove_next = true;
- if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true))
+ if (commit_merge(vmg))
goto nomem;
return 0;
nomem:
- vmg->state = VMA_MERGE_ERROR_NOMEM;
if (anon_dup)
unlink_anon_vmas(anon_dup);
+ /*
+ * If the user requests that we just give up on OOM, we are safe to do so
+ * here, as commit_merge() provides this contract to us. Nothing has been
+ * changed - no harm no foul, just don't report it.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return -ENOMEM;
}
@@ -1108,7 +1201,7 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
init_vma_prep(&vp, vma);
vma_prepare(&vp);
- vma_adjust_trans_huge(vma, start, end, 0);
+ vma_adjust_trans_huge(vma, start, end, NULL);
vma_iter_clear(vmi);
vma_set_range(vma, start, end, pgoff);
@@ -1130,7 +1223,6 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
* were isolated before we downgraded mmap_lock.
*/
mas_set(mas_detach, 1);
- lru_add_drain();
tlb_gather_mmu(&tlb, vms->vma->vm_mm);
update_hiwater_rss(vms->vma->vm_mm);
unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
@@ -1198,7 +1290,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
/* Remove and clean up vmas */
mas_set(mas_detach, 0);
mas_for_each(mas_detach, vma, ULONG_MAX)
- remove_vma(vma, /* unreachable = */ false);
+ remove_vma(vma);
vm_unacct_memory(vms->nr_accounted);
validate_mm(mm);
@@ -1220,7 +1312,7 @@ static void reattach_vmas(struct ma_state *mas_detach)
mas_set(mas_detach, 0);
mas_for_each(mas_detach, vma, ULONG_MAX)
- vma_mark_detached(vma, false);
+ vma_mark_attached(vma);
__mt_destroy(mas_detach->tree);
}
@@ -1259,7 +1351,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
}
/* Don't bother splitting the VMA if we can't unmap it anyway */
- if (!can_modify_vma(vms->vma)) {
+ if (vma_is_sealed(vms->vma)) {
error = -EPERM;
goto start_split_failed;
}
@@ -1279,7 +1371,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
for_each_vma_range(*(vms->vmi), next, vms->end) {
long nrpages;
- if (!can_modify_vma(next)) {
+ if (vma_is_sealed(next)) {
error = -EPERM;
goto modify_vma_failed;
}
@@ -1295,7 +1387,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
if (error)
goto munmap_gather_failed;
- vma_mark_detached(next, true);
+ vma_mark_detached(next);
nrpages = vma_pages(next);
vms->nr_pages += nrpages;
@@ -1507,7 +1599,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
*/
static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
{
- struct vm_area_struct *vma = vmg->vma;
+ struct vm_area_struct *vma = vmg->middle;
unsigned long start = vmg->start;
unsigned long end = vmg->end;
struct vm_area_struct *merged;
@@ -1519,6 +1611,13 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
if (vmg_nomem(vmg))
return ERR_PTR(-ENOMEM);
+ /*
+ * Split can fail for reasons other than OOM, so if the user requests
+ * this it's probably a mistake.
+ */
+ VM_WARN_ON(vmg->give_up_on_oom &&
+ (vma->vm_start != start || vma->vm_end != end));
+
/* Split any preceding portion of the VMA. */
if (vma->vm_start < start) {
int err = split_vma(vmg->vmi, vma, start, 1);
@@ -1541,27 +1640,25 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
struct vm_area_struct *vma_modify_flags(
struct vma_iterator *vmi, struct vm_area_struct *prev,
struct vm_area_struct *vma, unsigned long start, unsigned long end,
- unsigned long new_flags)
+ vm_flags_t vm_flags)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
- vmg.flags = new_flags;
+ vmg.vm_flags = vm_flags;
return vma_modify(&vmg);
}
struct vm_area_struct
-*vma_modify_flags_name(struct vma_iterator *vmi,
+*vma_modify_name(struct vma_iterator *vmi,
struct vm_area_struct *prev,
struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
- unsigned long new_flags,
struct anon_vma_name *new_name)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
- vmg.flags = new_flags;
vmg.anon_name = new_name;
return vma_modify(&vmg);
@@ -1586,13 +1683,16 @@ struct vm_area_struct
struct vm_area_struct *prev,
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx)
+ vm_flags_t vm_flags,
+ struct vm_userfaultfd_ctx new_ctx,
+ bool give_up_on_oom)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
- vmg.flags = new_flags;
+ vmg.vm_flags = vm_flags;
vmg.uffd_ctx = new_ctx;
+ if (give_up_on_oom)
+ vmg.give_up_on_oom = true;
return vma_modify(&vmg);
}
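Illustrative only: a caller that cannot usefully propagate -ENOMEM (for example a teardown path) might opt into the new give_up_on_oom behaviour as sketched below. The surrounding caller context is an assumption; the parameter semantics come from this patch.

	vma = vma_modify_flags_uffd(&vmi, prev, vma, start, end, vm_flags,
				    NULL_VM_UFFD_CTX,
				    /* give_up_on_oom = */ true);
	if (IS_ERR(vma))
		return PTR_ERR(vma);
	/* On OOM the merge is simply skipped and the unmerged VMA is returned. */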
@@ -1608,7 +1708,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta);
vmg.next = vma_iter_next_rewind(vmi, NULL);
- vmg.vma = NULL; /* We use the VMA to populate VMG fields only. */
+ vmg.middle = NULL; /* We use the VMA to populate VMG fields only. */
return vma_merge_new_range(&vmg);
}
@@ -1693,7 +1793,7 @@ int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
return -ENOMEM;
vma_start_write(vma);
- vma_iter_store(&vmi, vma);
+ vma_iter_store_new(&vmi, vma);
vma_link_file(vma);
mm->map_count++;
validate_mm(mm);
@@ -1725,11 +1825,19 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
faulted_in_anon_vma = false;
}
+ /*
+ * If the VMA we are copying might contain a uprobe PTE, ensure
+ * that we do not establish one upon merge. Otherwise, when mremap()
+ * moves page tables, it will orphan the newly created PTE.
+ */
+ if (vma->vm_file)
+ vmg.skip_vma_uprobe = true;
+
new_vma = find_vma_prev(mm, addr, &vmg.prev);
if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
- vmg.vma = NULL; /* New VMA range. */
+ vmg.middle = NULL; /* New VMA range. */
vmg.pgoff = pgoff;
vmg.next = vma_iter_next_rewind(&vmi, NULL);
new_vma = vma_merge_new_range(&vmg);
@@ -1776,6 +1884,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return new_vma;
out_vma_link:
+ fixup_hugetlb_reservations(new_vma);
vma_close(new_vma);
if (new_vma->vm_file)
@@ -2214,6 +2323,11 @@ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
vms_complete_munmap_vmas(vms, mas_detach);
}
+static void update_ksm_flags(struct mmap_state *map)
+{
+ map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags);
+}
+
/*
* __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be
* unmapped once the map operation is completed, check limits, account mapping
@@ -2256,11 +2370,11 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf)
}
/* Check against address space limit. */
- if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages))
+ if (!may_expand_vm(map->mm, map->vm_flags, map->pglen - vms->nr_pages))
return -ENOMEM;
/* Private writable mapping: check memory availability. */
- if (accountable_mapping(map->file, map->flags)) {
+ if (accountable_mapping(map->file, map->vm_flags)) {
map->charged = map->pglen;
map->charged -= vms->nr_accounted;
if (map->charged) {
@@ -2270,7 +2384,7 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf)
}
vms->nr_accounted = 0;
- map->flags |= VM_ACCOUNT;
+ map->vm_flags |= VM_ACCOUNT;
}
/*
@@ -2292,6 +2406,10 @@ static int __mmap_new_file_vma(struct mmap_state *map,
int error;
vma->vm_file = get_file(map->file);
+
+ if (!map->file->f_op->mmap)
+ return 0;
+
error = mmap_file(vma->vm_file, vma);
if (error) {
fput(vma->vm_file);
@@ -2310,13 +2428,12 @@ static int __mmap_new_file_vma(struct mmap_state *map,
* Drivers should not permit writability when previously it was
* disallowed.
*/
- VM_WARN_ON_ONCE(map->flags != vma->vm_flags &&
- !(map->flags & VM_MAYWRITE) &&
+ VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags &&
+ !(map->vm_flags & VM_MAYWRITE) &&
(vma->vm_flags & VM_MAYWRITE));
- /* If the flags change (and are mergeable), let's retry later. */
- map->retry_merge = vma->vm_flags != map->flags && !(vma->vm_flags & VM_SPECIAL);
- map->flags = vma->vm_flags;
+ map->file = vma->vm_file;
+ map->vm_flags = vma->vm_flags;
return 0;
}
@@ -2347,8 +2464,8 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
vma_iter_config(vmi, map->addr, map->end);
vma_set_range(vma, map->addr, map->end, map->pgoff);
- vm_flags_init(vma, map->flags);
- vma->vm_page_prot = vm_get_page_prot(map->flags);
+ vm_flags_init(vma, map->vm_flags);
+ vma->vm_page_prot = map->page_prot;
if (vma_iter_prealloc(vmi, vma)) {
error = -ENOMEM;
@@ -2357,7 +2474,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
if (map->file)
error = __mmap_new_file_vma(map, vma);
- else if (map->flags & VM_SHARED)
+ else if (map->vm_flags & VM_SHARED)
error = shmem_zero_setup(vma);
else
vma_set_anonymous(vma);
@@ -2365,14 +2482,19 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
if (error)
goto free_iter_vma;
+ if (!map->check_ksm_early) {
+ update_ksm_flags(map);
+ vm_flags_init(vma, map->vm_flags);
+ }
+
#ifdef CONFIG_SPARC64
/* TODO: Fix SPARC ADI! */
- WARN_ON_ONCE(!arch_validate_flags(map->flags));
+ WARN_ON_ONCE(!arch_validate_flags(map->vm_flags));
#endif
/* Lock the VMA since it is modified after insertion into VMA tree */
vma_start_write(vma);
- vma_iter_store(vmi, vma);
+ vma_iter_store_new(vmi, vma);
map->mm->map_count++;
vma_link_file(vma);
@@ -2380,8 +2502,8 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
* vma_merge_new_range() calls khugepaged_enter_vma() too, the below
* call covers the non-merge case.
*/
- khugepaged_enter_vma(vma, map->flags);
- ksm_add_vma(vma);
+ if (!vma_is_anonymous(vma))
+ khugepaged_enter_vma(vma, map->vm_flags);
*vmap = vma;
return 0;
@@ -2402,7 +2524,7 @@ free_vma:
static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
{
struct mm_struct *mm = map->mm;
- unsigned long vm_flags = vma->vm_flags;
+ vm_flags_t vm_flags = vma->vm_flags;
perf_event_mmap(vma);
@@ -2434,20 +2556,107 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
vma_set_page_prot(vma);
}
-unsigned long __mmap_region(struct file *file, unsigned long addr,
+/*
+ * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that
+ * specifies it.
+ *
+ * This is called prior to any merge attempt, and updates whitelisted fields
+ * that are permitted to be updated by the caller.
+ *
+ * All but user-defined fields will be pre-populated with original values.
+ *
+ * Returns 0 on success, or an error code otherwise.
+ */
+static int call_mmap_prepare(struct mmap_state *map)
+{
+ int err;
+ struct vm_area_desc desc = {
+ .mm = map->mm,
+ .start = map->addr,
+ .end = map->end,
+
+ .pgoff = map->pgoff,
+ .file = map->file,
+ .vm_flags = map->vm_flags,
+ .page_prot = map->page_prot,
+ };
+
+ /* Invoke the hook. */
+ err = vfs_mmap_prepare(map->file, &desc);
+ if (err)
+ return err;
+
+ /* Update fields permitted to be changed. */
+ map->pgoff = desc.pgoff;
+ map->file = desc.file;
+ map->vm_flags = desc.vm_flags;
+ map->page_prot = desc.page_prot;
+ /* User-defined fields. */
+ map->vm_ops = desc.vm_ops;
+ map->vm_private_data = desc.private_data;
+
+ return 0;
+}
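Illustrative only: a minimal sketch of the driver side of the .mmap_prepare() hook that call_mmap_prepare() above invokes. The struct vm_area_desc fields are those this patch populates; the driver itself, its example_vm_ops and example_state symbols, and the VM_DONTEXPAND policy are assumptions for illustration, not part of this change.

static int example_mmap_prepare(struct vm_area_desc *desc)
{
	/* Example policy, purely hypothetical: only accept shared mappings. */
	if (!(desc->vm_flags & VM_SHARED))
		return -EINVAL;

	/* Fields the hook is permitted to update. */
	desc->vm_flags |= VM_DONTEXPAND;
	desc->page_prot = vm_get_page_prot(desc->vm_flags);

	/* User-defined fields, applied later by set_vma_user_defined_fields(). */
	desc->vm_ops = &example_vm_ops;
	desc->private_data = &example_state;

	return 0;
}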
+
+static void set_vma_user_defined_fields(struct vm_area_struct *vma,
+ struct mmap_state *map)
+{
+ if (map->vm_ops)
+ vma->vm_ops = map->vm_ops;
+ vma->vm_private_data = map->vm_private_data;
+}
+
+/*
+ * Are we guaranteed no driver can change state such as to preclude KSM merging?
+ * If so, let's set the KSM mergeable flag early so we don't break VMA merging.
+ */
+static bool can_set_ksm_flags_early(struct mmap_state *map)
+{
+ struct file *file = map->file;
+
+ /* Anonymous mappings have no driver which can change them. */
+ if (!file)
+ return true;
+
+ /*
+ * If .mmap_prepare() is specified, then the driver will have already
+ * manipulated state prior to updating KSM flags. So no need to worry
+ * about mmap callbacks modifying VMA flags after the KSM flag has been
+ * updated here, which could otherwise affect KSM eligibility.
+ */
+ if (file->f_op->mmap_prepare)
+ return true;
+
+ /* shmem is safe. */
+ if (shmem_file(file))
+ return true;
+
+ /* Any other .mmap callback is not safe. */
+ return false;
+}
+
+static unsigned long __mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
int error;
+ bool have_mmap_prepare = file && file->f_op->mmap_prepare;
VMA_ITERATOR(vmi, mm, addr);
MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
+ map.check_ksm_early = can_set_ksm_flags_early(&map);
+
error = __mmap_prepare(&map, uf);
+ if (!error && have_mmap_prepare)
+ error = call_mmap_prepare(&map);
if (error)
goto abort_munmap;
+ if (map.check_ksm_early)
+ update_ksm_flags(&map);
+
/* Attempt to merge with adjacent VMAs... */
if (map.prev || map.next) {
VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL);
@@ -2462,16 +2671,8 @@ unsigned long __mmap_region(struct file *file, unsigned long addr,
goto unacct_error;
}
- /* If flags changed, we might be able to merge, so try again. */
- if (map.retry_merge) {
- struct vm_area_struct *merged;
- VMG_MMAP_STATE(vmg, &map, vma);
-
- vma_iter_config(map.vmi, map.addr, map.end);
- merged = vma_merge_existing_range(&vmg);
- if (merged)
- vma = merged;
- }
+ if (have_mmap_prepare)
+ set_vma_user_defined_fields(vma, &map);
__mmap_complete(&map, vma);
@@ -2485,3 +2686,518 @@ abort_munmap:
vms_abort_munmap_vmas(&map.vms, &map.mas_detach);
return error;
}
+
+/**
+ * mmap_region() - Actually perform the userland mapping of a VMA into
+ * current->mm with known, aligned and overflow-checked @addr and @len, and
+ * correctly determined VMA flags @vm_flags and page offset @pgoff.
+ *
+ * This is an internal memory management function, and should not be used
+ * directly.
+ *
+ * The caller must write-lock current->mm->mmap_lock.
+ *
+ * @file: If a file-backed mapping, a pointer to the struct file describing the
+ * file to be mapped, otherwise NULL.
+ * @addr: The page-aligned address at which to perform the mapping.
+ * @len: The page-aligned, non-zero, length of the mapping.
+ * @vm_flags: The VMA flags which should be applied to the mapping.
+ * @pgoff: If @file is specified, the page offset into the file, if not then
+ * the virtual page offset in memory of the anonymous mapping.
+ * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap
+ * events.
+ *
+ * Returns: Either an error, or the address at which the requested mapping has
+ * been performed.
+ */
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+ struct list_head *uf)
+{
+ unsigned long ret;
+ bool writable_file_mapping = false;
+
+ mmap_assert_write_locked(current->mm);
+
+ /* Check to see if MDWE is applicable. */
+ if (map_deny_write_exec(vm_flags, vm_flags))
+ return -EACCES;
+
+ /* Allow architectures to sanity-check the vm_flags. */
+ if (!arch_validate_flags(vm_flags))
+ return -EINVAL;
+
+ /* Map writable and ensure this isn't a sealed memfd. */
+ if (file && is_shared_maywrite(vm_flags)) {
+ int error = mapping_map_writable(file->f_mapping);
+
+ if (error)
+ return error;
+ writable_file_mapping = true;
+ }
+
+ ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);
+
+ /* Clear our write mapping regardless of error. */
+ if (writable_file_mapping)
+ mapping_unmap_writable(file->f_mapping);
+
+ validate_mm(current->mm);
+ return ret;
+}
+
+/*
+ * do_brk_flags() - Increase the brk vma if the flags match.
+ * @vmi: The vma iterator
+ * @addr: The start address
+ * @len: The length of the increase
+ * @vma: The vma,
+ * @vm_flags: The VMA Flags
+ *
+ * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
+ * do not match then create a new anonymous VMA. Eventually we may be able to
+ * do some brk-specific accounting here.
+ */
+int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long len, vm_flags_t vm_flags)
+{
+ struct mm_struct *mm = current->mm;
+
+ /*
+ * Check against address space limits by the changed size
+ * Note: This happens *after* clearing old mappings in some code paths.
+ */
+ vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+ vm_flags = ksm_vma_flags(mm, NULL, vm_flags);
+ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT))
+ return -ENOMEM;
+
+ if (mm->map_count > sysctl_max_map_count)
+ return -ENOMEM;
+
+ if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
+ return -ENOMEM;
+
+ /*
+ * Expand the existing vma if possible; Note that singular lists do not
+ * occur after forking, so the expand will only happen on new VMAs.
+ */
+ if (vma && vma->vm_end == addr) {
+ VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr));
+
+ vmg.prev = vma;
+ /* vmi is positioned at prev, which this mode expects. */
+ vmg.just_expand = true;
+
+ if (vma_merge_new_range(&vmg))
+ goto out;
+ else if (vmg_nomem(&vmg))
+ goto unacct_fail;
+ }
+
+ if (vma)
+ vma_iter_next_range(vmi);
+ /* create a vma struct for an anonymous mapping */
+ vma = vm_area_alloc(mm);
+ if (!vma)
+ goto unacct_fail;
+
+ vma_set_anonymous(vma);
+ vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
+ vm_flags_init(vma, vm_flags);
+ vma->vm_page_prot = vm_get_page_prot(vm_flags);
+ vma_start_write(vma);
+ if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
+ goto mas_store_fail;
+
+ mm->map_count++;
+ validate_mm(mm);
+out:
+ perf_event_mmap(vma);
+ mm->total_vm += len >> PAGE_SHIFT;
+ mm->data_vm += len >> PAGE_SHIFT;
+ if (vm_flags & VM_LOCKED)
+ mm->locked_vm += (len >> PAGE_SHIFT);
+ vm_flags_set(vma, VM_SOFTDIRTY);
+ return 0;
+
+mas_store_fail:
+ vm_area_free(vma);
+unacct_fail:
+ vm_unacct_memory(len >> PAGE_SHIFT);
+ return -ENOMEM;
+}
+
+/**
+ * unmapped_area() - Find an area between the low_limit and the high_limit with
+ * the correct alignment and offset, all from @info. Note: current->mm is used
+ * for the search.
+ *
+ * @info: The unmapped area information including the range [low_limit -
+ * high_limit), the alignment offset and mask.
+ *
+ * Return: A memory address or -ENOMEM.
+ */
+unsigned long unmapped_area(struct vm_unmapped_area_info *info)
+{
+ unsigned long length, gap;
+ unsigned long low_limit, high_limit;
+ struct vm_area_struct *tmp;
+ VMA_ITERATOR(vmi, current->mm, 0);
+
+ /* Adjust search length to account for worst case alignment overhead */
+ length = info->length + info->align_mask + info->start_gap;
+ if (length < info->length)
+ return -ENOMEM;
+
+ low_limit = info->low_limit;
+ if (low_limit < mmap_min_addr)
+ low_limit = mmap_min_addr;
+ high_limit = info->high_limit;
+retry:
+ if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
+ return -ENOMEM;
+
+ /*
+ * Adjust for the gap first so it doesn't interfere with the
+ * later alignment. The first step is the minimum needed to
+ * fulfill the start gap, the next step is the minimum to align
+ * that. It is the minimum needed to fulfill both.
+ */
+ gap = vma_iter_addr(&vmi) + info->start_gap;
+ gap += (info->align_offset - gap) & info->align_mask;
+ tmp = vma_next(&vmi);
+ if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
+ if (vm_start_gap(tmp) < gap + length - 1) {
+ low_limit = tmp->vm_end;
+ vma_iter_reset(&vmi);
+ goto retry;
+ }
+ } else {
+ tmp = vma_prev(&vmi);
+ if (tmp && vm_end_gap(tmp) > gap) {
+ low_limit = vm_end_gap(tmp);
+ vma_iter_reset(&vmi);
+ goto retry;
+ }
+ }
+
+ return gap;
+}
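Illustrative only: how a caller might fill struct vm_unmapped_area_info for the search above. The field names are the ones this function consumes; the limits and alignment policy shown are assumptions.

static unsigned long example_get_unmapped_area(unsigned long len,
					       unsigned long align)
{
	struct vm_unmapped_area_info info = {};

	info.length = len;
	info.low_limit = PAGE_SIZE;	/* assumed lower bound */
	info.high_limit = TASK_SIZE;	/* assumed upper bound */
	info.align_mask = align - 1;	/* align assumed to be a power of two */
	info.align_offset = 0;
	info.start_gap = 0;

	return unmapped_area(&info);	/* or unmapped_area_topdown(&info) */
}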
+
+/**
+ * unmapped_area_topdown() - Find an area between the low_limit and the
+ * high_limit with the correct alignment and offset at the highest available
+ * address, all from @info. Note: current->mm is used for the search.
+ *
+ * @info: The unmapped area information including the range [low_limit -
+ * high_limit), the alignment offset and mask.
+ *
+ * Return: A memory address or -ENOMEM.
+ */
+unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
+{
+ unsigned long length, gap, gap_end;
+ unsigned long low_limit, high_limit;
+ struct vm_area_struct *tmp;
+ VMA_ITERATOR(vmi, current->mm, 0);
+
+ /* Adjust search length to account for worst case alignment overhead */
+ length = info->length + info->align_mask + info->start_gap;
+ if (length < info->length)
+ return -ENOMEM;
+
+ low_limit = info->low_limit;
+ if (low_limit < mmap_min_addr)
+ low_limit = mmap_min_addr;
+ high_limit = info->high_limit;
+retry:
+ if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
+ return -ENOMEM;
+
+ gap = vma_iter_end(&vmi) - info->length;
+ gap -= (gap - info->align_offset) & info->align_mask;
+ gap_end = vma_iter_end(&vmi);
+ tmp = vma_next(&vmi);
+ if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
+ if (vm_start_gap(tmp) < gap_end) {
+ high_limit = vm_start_gap(tmp);
+ vma_iter_reset(&vmi);
+ goto retry;
+ }
+ } else {
+ tmp = vma_prev(&vmi);
+ if (tmp && vm_end_gap(tmp) > gap) {
+ high_limit = tmp->vm_start;
+ vma_iter_reset(&vmi);
+ goto retry;
+ }
+ }
+
+ return gap;
+}
+
+/*
+ * Verify that the stack growth is acceptable and
+ * update accounting. This is shared with both the
+ * grow-up and grow-down cases.
+ */
+static int acct_stack_growth(struct vm_area_struct *vma,
+ unsigned long size, unsigned long grow)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long new_start;
+
+ /* address space limit tests */
+ if (!may_expand_vm(mm, vma->vm_flags, grow))
+ return -ENOMEM;
+
+ /* Stack limit test */
+ if (size > rlimit(RLIMIT_STACK))
+ return -ENOMEM;
+
+ /* mlock limit tests */
+ if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
+ return -ENOMEM;
+
+ /* Check to ensure the stack will not grow into a hugetlb-only region */
+ new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
+ vma->vm_end - size;
+ if (is_hugepage_only_range(vma->vm_mm, new_start, size))
+ return -EFAULT;
+
+ /*
+ * Overcommit.. This must be the final test, as it will
+ * update security statistics.
+ */
+ if (security_vm_enough_memory_mm(mm, grow))
+ return -ENOMEM;
+
+ return 0;
+}
+
+#if defined(CONFIG_STACK_GROWSUP)
+/*
+ * PA-RISC uses this for its stack.
+ * vma is the last one with address > vma->vm_end. Have to extend vma.
+ */
+int expand_upwards(struct vm_area_struct *vma, unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *next;
+ unsigned long gap_addr;
+ int error = 0;
+ VMA_ITERATOR(vmi, mm, vma->vm_start);
+
+ if (!(vma->vm_flags & VM_GROWSUP))
+ return -EFAULT;
+
+ mmap_assert_write_locked(mm);
+
+ /* Guard against exceeding limits of the address space. */
+ address &= PAGE_MASK;
+ if (address >= (TASK_SIZE & PAGE_MASK))
+ return -ENOMEM;
+ address += PAGE_SIZE;
+
+ /* Enforce stack_guard_gap */
+ gap_addr = address + stack_guard_gap;
+
+ /* Guard against overflow */
+ if (gap_addr < address || gap_addr > TASK_SIZE)
+ gap_addr = TASK_SIZE;
+
+ next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+ if (next && vma_is_accessible(next)) {
+ if (!(next->vm_flags & VM_GROWSUP))
+ return -ENOMEM;
+ /* Check that both stack segments have the same anon_vma? */
+ }
+
+ if (next)
+ vma_iter_prev_range_limit(&vmi, address);
+
+ vma_iter_config(&vmi, vma->vm_start, address);
+ if (vma_iter_prealloc(&vmi, vma))
+ return -ENOMEM;
+
+ /* We must make sure the anon_vma is allocated. */
+ if (unlikely(anon_vma_prepare(vma))) {
+ vma_iter_free(&vmi);
+ return -ENOMEM;
+ }
+
+ /* Lock the VMA before expanding to prevent concurrent page faults */
+ vma_start_write(vma);
+ /* We update the anon VMA tree. */
+ anon_vma_lock_write(vma->anon_vma);
+
+ /* Somebody else might have raced and expanded it already */
+ if (address > vma->vm_end) {
+ unsigned long size, grow;
+
+ size = address - vma->vm_start;
+ grow = (address - vma->vm_end) >> PAGE_SHIFT;
+
+ error = -ENOMEM;
+ if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm += grow;
+ vm_stat_account(mm, vma->vm_flags, grow);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ vma->vm_end = address;
+ /* Overwrite old entry in mtree. */
+ vma_iter_store_overwrite(&vmi, vma);
+ anon_vma_interval_tree_post_update_vma(vma);
+
+ perf_event_mmap(vma);
+ }
+ }
+ }
+ anon_vma_unlock_write(vma->anon_vma);
+ vma_iter_free(&vmi);
+ validate_mm(mm);
+ return error;
+}
+#endif /* CONFIG_STACK_GROWSUP */
+
+/*
+ * vma is the first one with address < vma->vm_start. Have to extend vma.
+ * mmap_lock held for writing.
+ */
+int expand_downwards(struct vm_area_struct *vma, unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *prev;
+ int error = 0;
+ VMA_ITERATOR(vmi, mm, vma->vm_start);
+
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ return -EFAULT;
+
+ mmap_assert_write_locked(mm);
+
+ address &= PAGE_MASK;
+ if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
+ return -EPERM;
+
+ /* Enforce stack_guard_gap */
+ prev = vma_prev(&vmi);
+ /* Check that both stack segments have the same anon_vma? */
+ if (prev) {
+ if (!(prev->vm_flags & VM_GROWSDOWN) &&
+ vma_is_accessible(prev) &&
+ (address - prev->vm_end < stack_guard_gap))
+ return -ENOMEM;
+ }
+
+ if (prev)
+ vma_iter_next_range_limit(&vmi, vma->vm_start);
+
+ vma_iter_config(&vmi, address, vma->vm_end);
+ if (vma_iter_prealloc(&vmi, vma))
+ return -ENOMEM;
+
+ /* We must make sure the anon_vma is allocated. */
+ if (unlikely(anon_vma_prepare(vma))) {
+ vma_iter_free(&vmi);
+ return -ENOMEM;
+ }
+
+ /* Lock the VMA before expanding to prevent concurrent page faults */
+ vma_start_write(vma);
+ /* We update the anon VMA tree. */
+ anon_vma_lock_write(vma->anon_vma);
+
+ /* Somebody else might have raced and expanded it already */
+ if (address < vma->vm_start) {
+ unsigned long size, grow;
+
+ size = vma->vm_end - address;
+ grow = (vma->vm_start - address) >> PAGE_SHIFT;
+
+ error = -ENOMEM;
+ if (grow <= vma->vm_pgoff) {
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm += grow;
+ vm_stat_account(mm, vma->vm_flags, grow);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ vma->vm_start = address;
+ vma->vm_pgoff -= grow;
+ /* Overwrite old entry in mtree. */
+ vma_iter_store_overwrite(&vmi, vma);
+ anon_vma_interval_tree_post_update_vma(vma);
+
+ perf_event_mmap(vma);
+ }
+ }
+ }
+ anon_vma_unlock_write(vma->anon_vma);
+ vma_iter_free(&vmi);
+ validate_mm(mm);
+ return error;
+}
+
+int __vm_munmap(unsigned long start, size_t len, bool unlock)
+{
+ int ret;
+ struct mm_struct *mm = current->mm;
+ LIST_HEAD(uf);
+ VMA_ITERATOR(vmi, mm, start);
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock);
+ if (ret || !unlock)
+ mmap_write_unlock(mm);
+
+ userfaultfd_unmap_complete(mm, &uf);
+ return ret;
+}
+
+/* Insert vm structure into process list sorted by address
+ * and into the inode's i_mmap tree. If vm_file is non-NULL
+ * then i_mmap_rwsem is taken here.
+ */
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ unsigned long charged = vma_pages(vma);
+
+ if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
+ return -ENOMEM;
+
+ if ((vma->vm_flags & VM_ACCOUNT) &&
+ security_vm_enough_memory_mm(mm, charged))
+ return -ENOMEM;
+
+ /*
+ * The vm_pgoff of a purely anonymous vma should be irrelevant
+ * until its first write fault, when page's anon_vma and index
+ * are set. But now set the vm_pgoff it will almost certainly
+ * end up with (unless mremap moves it elsewhere before that
+ * first wfault), so /proc/pid/maps tells a consistent story.
+ *
+ * By setting it to reflect the virtual start address of the
+ * vma, merges and splits can happen in a seamless way, just
+ * using the existing file pgoff checks and manipulations.
+ * Similarly in do_mmap and in do_brk_flags.
+ */
+ if (vma_is_anonymous(vma)) {
+ BUG_ON(vma->anon_vma);
+ vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
+ }
+
+ if (vma_link(mm, vma)) {
+ if (vma->vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(charged);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
diff --git a/mm/vma.h b/mm/vma.h
index 388d34748674..b123a9cdedb0 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -19,6 +19,8 @@ struct vma_prepare {
struct vm_area_struct *insert;
struct vm_area_struct *remove;
struct vm_area_struct *remove2;
+
+ bool skip_vma_uprobe :1;
};
struct unlink_vma_file_batch {
@@ -58,35 +60,96 @@ enum vma_merge_state {
VMA_MERGE_SUCCESS,
};
-enum vma_merge_flags {
- VMG_FLAG_DEFAULT = 0,
- /*
- * If we can expand, simply do so. We know there is nothing to merge to
- * the right. Does not reset state upon failure to merge. The VMA
- * iterator is assumed to be positioned at the previous VMA, rather than
- * at the gap.
- */
- VMG_FLAG_JUST_EXPAND = 1 << 0,
-};
-
-/* Represents a VMA merge operation. */
+/*
+ * Describes a VMA merge operation and is threaded throughout it.
+ *
+ * Any of the fields may be mutated by the merge operation, so no guarantees are
+ * made to the contents of this structure after a merge operation has completed.
+ */
struct vma_merge_struct {
struct mm_struct *mm;
struct vma_iterator *vmi;
- pgoff_t pgoff;
+ /*
+ * Adjacent VMAs, any of which may be NULL if not present:
+ *
+ * |------|--------|------|
+ * | prev | middle | next |
+ * |------|--------|------|
+ *
+ * middle may not yet exist in the case of a proposed new VMA being
+ * merged, or it may be an existing VMA.
+ *
+ * next may be assigned by the caller.
+ */
struct vm_area_struct *prev;
- struct vm_area_struct *next; /* Modified by vma_merge(). */
- struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */
+ struct vm_area_struct *middle;
+ struct vm_area_struct *next;
+ /* This is the VMA we ultimately target to become the merged VMA. */
+ struct vm_area_struct *target;
+ /*
+ * Initially, the start, end, pgoff fields are provided by the caller
+ * and describe the proposed new VMA range, whether modifying an
+ * existing VMA (which will be 'middle'), or adding a new one.
+ *
+ * During the merge process these fields are updated to describe the new
+ * range _including those VMAs which will be merged_.
+ */
unsigned long start;
unsigned long end;
- unsigned long flags;
+ pgoff_t pgoff;
+
+ vm_flags_t vm_flags;
struct file *file;
struct anon_vma *anon_vma;
struct mempolicy *policy;
struct vm_userfaultfd_ctx uffd_ctx;
struct anon_vma_name *anon_name;
- enum vma_merge_flags merge_flags;
enum vma_merge_state state;
+
+ /* Flags which callers can use to modify merge behaviour: */
+
+ /*
+ * If we can expand, simply do so. We know there is nothing to merge to
+ * the right. Does not reset state upon failure to merge. The VMA
+ * iterator is assumed to be positioned at the previous VMA, rather than
+ * at the gap.
+ */
+ bool just_expand :1;
+
+ /*
+ * If a merge is possible, but an OOM error occurs, give up and don't
+ * execute the merge, returning NULL.
+ */
+ bool give_up_on_oom :1;
+
+ /*
+ * If set, skip uprobe_mmap upon merged vma.
+ */
+ bool skip_vma_uprobe :1;
+
+ /* Internal flags set during merge process: */
+
+ /*
+ * Internal flag indicating the merge increases vmg->middle->vm_start
+ * (and thereby, vmg->prev->vm_end).
+ */
+ bool __adjust_middle_start :1;
+ /*
+ * Internal flag indicating the merge decreases vmg->next->vm_start
+ * (and thereby, vmg->middle->vm_end).
+ */
+ bool __adjust_next_start :1;
+ /*
+ * Internal flag used during the merge operation to indicate we will
+ * remove vmg->middle.
+ */
+ bool __remove_middle :1;
+ /*
+ * Internal flag used during the merge operation to indicate we will
+ * remove vmg->next.
+ */
+ bool __remove_next :1;
+
};
static inline bool vmg_nomem(struct vma_merge_struct *vmg)
@@ -101,16 +164,15 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start);
}
-#define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_) \
+#define VMG_STATE(name, mm_, vmi_, start_, end_, vm_flags_, pgoff_) \
struct vma_merge_struct name = { \
.mm = mm_, \
.vmi = vmi_, \
.start = start_, \
.end = end_, \
- .flags = flags_, \
+ .vm_flags = vm_flags_, \
.pgoff = pgoff_, \
.state = VMA_MERGE_START, \
- .merge_flags = VMG_FLAG_DEFAULT, \
}
#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \
@@ -118,11 +180,11 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.mm = vma_->vm_mm, \
.vmi = vmi_, \
.prev = prev_, \
+ .middle = vma_, \
.next = NULL, \
- .vma = vma_, \
.start = start_, \
.end = end_, \
- .flags = vma_->vm_flags, \
+ .vm_flags = vma_->vm_flags, \
.pgoff = vma_pgoff_offset(vma_, start_), \
.file = vma_->vm_file, \
.anon_vma = vma_->anon_vma, \
@@ -130,7 +192,6 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.uffd_ctx = vma_->vm_userfaultfd_ctx, \
.anon_name = anon_vma_name(vma_), \
.state = VMA_MERGE_START, \
- .merge_flags = VMG_FLAG_DEFAULT, \
}
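Illustrative only: the typical way VMG_VMA_STATE() is driven when modifying part of an existing VMA, mirroring the vma_modify_flags() caller earlier in this patch; the wrapper function itself is hypothetical.

static struct vm_area_struct *example_modify_flags(struct vma_iterator *vmi,
		struct vm_area_struct *prev, struct vm_area_struct *vma,
		unsigned long start, unsigned long end, vm_flags_t vm_flags)
{
	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);

	vmg.vm_flags = vm_flags;	/* the property being changed */

	/* Merges with prev/next where possible, splitting vma if needed. */
	return vma_modify(&vmg);
}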
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
@@ -139,15 +200,10 @@ void validate_mm(struct mm_struct *mm);
#define validate_mm(mm) do { } while (0)
#endif
-/* Required for expand_downwards(). */
-void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
-
-/* Required for expand_downwards(). */
-void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
-
-int vma_expand(struct vma_merge_struct *vmg);
-int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, pgoff_t pgoff);
+__must_check int vma_expand(struct vma_merge_struct *vmg);
+__must_check int vma_shrink(struct vma_iterator *vmi,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, pgoff_t pgoff);
static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
struct vm_area_struct *vma, gfp_t gfp)
@@ -162,9 +218,57 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
if (unlikely(mas_is_err(&vmi->mas)))
return -ENOMEM;
+ vma_mark_attached(vma);
return 0;
}
+
+/*
+ * Temporary helper functions for file systems which wrap an invocation of
+ * f_op->mmap() but which might have an underlying file system which implements
+ * f_op->mmap_prepare().
+ */
+
+static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma,
+ struct vm_area_desc *desc)
+{
+ desc->mm = vma->vm_mm;
+ desc->start = vma->vm_start;
+ desc->end = vma->vm_end;
+
+ desc->pgoff = vma->vm_pgoff;
+ desc->file = vma->vm_file;
+ desc->vm_flags = vma->vm_flags;
+ desc->page_prot = vma->vm_page_prot;
+
+ desc->vm_ops = NULL;
+ desc->private_data = NULL;
+
+ return desc;
+}
+
+static inline void set_vma_from_desc(struct vm_area_struct *vma,
+ struct vm_area_desc *desc)
+{
+ /*
+ * Since we're invoking .mmap_prepare() despite having a partially
+ * established VMA, we must take care to handle setting fields
+ * correctly.
+ */
+
+ /* Mutable fields. Populated with initial state. */
+ vma->vm_pgoff = desc->pgoff;
+ if (vma->vm_file != desc->file)
+ vma_set_file(vma, desc->file);
+ if (vma->vm_flags != desc->vm_flags)
+ vm_flags_set(vma, desc->vm_flags);
+ vma->vm_page_prot = desc->page_prot;
+
+ /* User-defined fields. */
+ vma->vm_ops = desc->vm_ops;
+ vma->vm_private_data = desc->private_data;
+}
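Illustrative only: the wrapper pattern these two helpers enable, letting a stacked file system's legacy ->mmap() invoke an underlying ->mmap_prepare(). The wrapper name below is hypothetical; the helpers and the hook come from this patch.

static int example_compat_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct vm_area_desc desc;
	int err;

	err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc));
	if (err)
		return err;

	set_vma_from_desc(vma, &desc);
	return 0;
}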
+
int
do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm, unsigned long start,
@@ -174,29 +278,29 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
bool unlock);
-void remove_vma(struct vm_area_struct *vma, bool unreachable);
+void remove_vma(struct vm_area_struct *vma);
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct vm_area_struct *next);
/* We are about to modify the VMA's flags. */
-struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
+__must_check struct vm_area_struct
+*vma_modify_flags(struct vma_iterator *vmi,
struct vm_area_struct *prev, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- unsigned long new_flags);
+ vm_flags_t vm_flags);
-/* We are about to modify the VMA's flags and/or anon_name. */
-struct vm_area_struct
-*vma_modify_flags_name(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- unsigned long new_flags,
- struct anon_vma_name *new_name);
+/* We are about to modify the VMA's anon_name. */
+__must_check struct vm_area_struct
+*vma_modify_name(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ struct anon_vma_name *new_name);
/* We are about to modify the VMA's memory policy. */
-struct vm_area_struct
+__must_check struct vm_area_struct
*vma_modify_policy(struct vma_iterator *vmi,
struct vm_area_struct *prev,
struct vm_area_struct *vma,
@@ -204,19 +308,22 @@ struct vm_area_struct
struct mempolicy *new_pol);
/* We are about to modify the VMA's flags and/or uffd context. */
-struct vm_area_struct
+__must_check struct vm_area_struct
*vma_modify_flags_uffd(struct vma_iterator *vmi,
struct vm_area_struct *prev,
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx);
+ vm_flags_t vm_flags,
+ struct vm_userfaultfd_ctx new_ctx,
+ bool give_up_on_oom);
-struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg);
+__must_check struct vm_area_struct
+*vma_merge_new_range(struct vma_merge_struct *vmg);
-struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
- struct vm_area_struct *vma,
- unsigned long delta);
+__must_check struct vm_area_struct
+*vma_merge_extend(struct vma_iterator *vmi,
+ struct vm_area_struct *vma,
+ unsigned long delta);
void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb);
@@ -243,10 +350,16 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
int mm_take_all_locks(struct mm_struct *mm);
void mm_drop_all_locks(struct mm_struct *mm);
-unsigned long __mmap_region(struct file *file, unsigned long addr,
+unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf);
+int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma,
+ unsigned long addr, unsigned long request, unsigned long flags);
+
+unsigned long unmapped_area(struct vm_unmapped_area_info *info);
+unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info);
+
static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
{
/*
@@ -261,7 +374,7 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma
}
#ifdef CONFIG_MMU
-static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
+static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, vm_flags_t vm_flags)
{
return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
}
@@ -360,9 +473,10 @@ static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
}
/* Store a VMA with preallocated memory */
-static inline void vma_iter_store(struct vma_iterator *vmi,
- struct vm_area_struct *vma)
+static inline void vma_iter_store_overwrite(struct vma_iterator *vmi,
+ struct vm_area_struct *vma)
{
+ vma_assert_attached(vma);
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
@@ -387,6 +501,13 @@ static inline void vma_iter_store(struct vma_iterator *vmi,
mas_store_prealloc(&vmi->mas, vma);
}
+static inline void vma_iter_store_new(struct vma_iterator *vmi,
+ struct vm_area_struct *vma)
+{
+ vma_mark_attached(vma);
+ vma_iter_store_overwrite(vmi, vma);
+}
+
static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
{
return vmi->mas.index;
@@ -438,38 +559,38 @@ struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi,
}
#ifdef CONFIG_64BIT
-
static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
return (vma->vm_flags & VM_SEALED);
}
-
-/*
- * check if a vma is sealed for modification.
- * return true, if modification is allowed.
- */
-static inline bool can_modify_vma(struct vm_area_struct *vma)
+#else
+static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
- if (unlikely(vma_is_sealed(vma)))
- return false;
-
- return true;
+ return false;
}
+#endif
-bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior);
+#if defined(CONFIG_STACK_GROWSUP)
+int expand_upwards(struct vm_area_struct *vma, unsigned long address);
+#endif
-#else
+int expand_downwards(struct vm_area_struct *vma, unsigned long address);
-static inline bool can_modify_vma(struct vm_area_struct *vma)
-{
- return true;
-}
+int __vm_munmap(unsigned long start, size_t len, bool unlock);
-static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
-{
- return true;
-}
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);
+/* vma_init.h, shared between CONFIG_MMU and nommu. */
+void __init vma_state_init(void);
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm);
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig);
+void vm_area_free(struct vm_area_struct *vma);
+
+/* vma_exec.c */
+#ifdef CONFIG_MMU
+int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+ unsigned long *top_mem_p);
+int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
#endif
#endif /* __MM_VMA_H */
diff --git a/mm/vma_exec.c b/mm/vma_exec.c
new file mode 100644
index 000000000000..922ee51747a6
--- /dev/null
+++ b/mm/vma_exec.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Functions explicitly implemented for exec functionality which however are
+ * explicitly VMA-only logic.
+ */
+
+#include "vma_internal.h"
+#include "vma.h"
+
+/*
+ * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
+ * this VMA and its relocated range, which will now reside at [vma->vm_start -
+ * shift, vma->vm_end - shift).
+ *
+ * This function is almost certainly NOT what you want for anything other than
+ * early executable temporary stack relocation.
+ */
+int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
+{
+ /*
+ * The process proceeds as follows:
+ *
+ * 1) Use shift to calculate the new vma endpoints.
+ * 2) Extend vma to cover both the old and new ranges. This ensures the
+ * arguments passed to subsequent functions are consistent.
+ * 3) Move vma's page tables to the new range.
+ * 4) Free up any cleared pgd range.
+ * 5) Shrink the vma to cover only the new range.
+ */
+
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long old_start = vma->vm_start;
+ unsigned long old_end = vma->vm_end;
+ unsigned long length = old_end - old_start;
+ unsigned long new_start = old_start - shift;
+ unsigned long new_end = old_end - shift;
+ VMA_ITERATOR(vmi, mm, new_start);
+ VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
+ struct vm_area_struct *next;
+ struct mmu_gather tlb;
+ PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
+
+ BUG_ON(new_start > new_end);
+
+ /*
+ * ensure there are no vmas between where we want to go
+ * and where we are
+ */
+ if (vma != vma_next(&vmi))
+ return -EFAULT;
+
+ vma_iter_prev_range(&vmi);
+ /*
+ * cover the whole range: [new_start, old_end)
+ */
+ vmg.target = vma;
+ if (vma_expand(&vmg))
+ return -ENOMEM;
+
+ /*
+ * move the page tables downwards, on failure we rely on
+ * process cleanup to remove whatever mess we made.
+ */
+ pmc.for_stack = true;
+ if (length != move_page_tables(&pmc))
+ return -ENOMEM;
+
+ tlb_gather_mmu(&tlb, mm);
+ next = vma_next(&vmi);
+ if (new_end > old_start) {
+ /*
+ * when the old and new regions overlap clear from new_end.
+ */
+ free_pgd_range(&tlb, new_end, old_end, new_end,
+ next ? next->vm_start : USER_PGTABLES_CEILING);
+ } else {
+ /*
+ * otherwise, clean from old_start; this is done to not touch
+ * the address space in [new_end, old_start), as some architectures
+ * have constraints on va-space that make this illegal (IA64) -
+ * for the others it's just a little faster.
+ */
+ free_pgd_range(&tlb, old_start, old_end, new_end,
+ next ? next->vm_start : USER_PGTABLES_CEILING);
+ }
+ tlb_finish_mmu(&tlb);
+
+ vma_prev(&vmi);
+ /* Shrink the vma to just the new range */
+ return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
+}
+
+/*
+ * Establish the stack VMA in an execve'd process, located temporarily at the
+ * maximum stack address provided by the architecture.
+ *
+ * We later relocate this downwards in relocate_vma_down().
+ *
+ * This function is almost certainly NOT what you want for anything other than
+ * early executable initialisation.
+ *
+ * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the
+ * maximum addressable location in the stack (that is capable of storing a
+ * system word of data).
+ */
+int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+ unsigned long *top_mem_p)
+{
+ int err;
+ struct vm_area_struct *vma = vm_area_alloc(mm);
+
+ if (!vma)
+ return -ENOMEM;
+
+ vma_set_anonymous(vma);
+
+ if (mmap_write_lock_killable(mm)) {
+ err = -EINTR;
+ goto err_free;
+ }
+
+ /*
+ * Need to be called with mmap write lock
+ * held, to avoid race with ksmd.
+ */
+ err = ksm_execve(mm);
+ if (err)
+ goto err_ksm;
+
+ /*
+ * Place the stack at the largest stack address the architecture
+ * supports. Later, we'll move this to an appropriate place. We don't
+ * use STACK_TOP because that can depend on attributes which aren't
+ * configured yet.
+ */
+ BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
+ vma->vm_end = STACK_TOP_MAX;
+ vma->vm_start = vma->vm_end - PAGE_SIZE;
+ vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+
+ err = insert_vm_struct(mm, vma);
+ if (err)
+ goto err;
+
+ mm->stack_vm = mm->total_vm = 1;
+ mmap_write_unlock(mm);
+ *vmap = vma;
+ *top_mem_p = vma->vm_end - sizeof(void *);
+ return 0;
+
+err:
+ ksm_exit(mm);
+err_ksm:
+ mmap_write_unlock(mm);
+err_free:
+ *vmap = NULL;
+ vm_area_free(vma);
+ return err;
+}
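
Taken together, the two helpers above are meant to bracket the argument-copying step in exec. The sketch below is illustrative only: the wrapper function, the stack_top parameter and the shift calculation are assumptions, locking and the actual argument copying are elided, and only create_init_stack_vma()/relocate_vma_down() themselves come from this patch.

/* Hypothetical caller flow, not part of the patch. */
static int setup_exec_stack_sketch(struct mm_struct *mm, unsigned long stack_top)
{
	struct vm_area_struct *vma;
	unsigned long top_mem;
	int err;

	/*
	 * 1) Temporary stack VMA at STACK_TOP_MAX; top_mem is the highest
	 *    word-sized slot, where early exec state can be staged.
	 */
	err = create_init_stack_vma(mm, &vma, &top_mem);
	if (err)
		return err;

	/* 2) ... copy argv/envp/auxv into the temporary stack here ... */

	/*
	 * 3) Slide the stack down to its final home; real callers hold the
	 *    mmap write lock around this step.
	 */
	return relocate_vma_down(vma, vma->vm_end - stack_top);
}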
diff --git a/mm/vma_init.c b/mm/vma_init.c
new file mode 100644
index 000000000000..8e53c7943561
--- /dev/null
+++ b/mm/vma_init.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/*
+ * Functions for initialising, allocating, freeing and duplicating VMAs. Shared
+ * between CONFIG_MMU and non-CONFIG_MMU kernel configurations.
+ */
+
+#include "vma_internal.h"
+#include "vma.h"
+
+/* SLAB cache for vm_area_struct structures */
+static struct kmem_cache *vm_area_cachep;
+
+void __init vma_state_init(void)
+{
+ struct kmem_cache_args args = {
+ .use_freeptr_offset = true,
+ .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
+ };
+
+ vm_area_cachep = kmem_cache_create("vm_area_struct",
+ sizeof(struct vm_area_struct), &args,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
+ SLAB_ACCOUNT);
+}
+
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ if (!vma)
+ return NULL;
+
+ vma_init(vma, mm);
+
+ return vma;
+}
+
+static void vm_area_init_from(const struct vm_area_struct *src,
+ struct vm_area_struct *dest)
+{
+ dest->vm_mm = src->vm_mm;
+ dest->vm_ops = src->vm_ops;
+ dest->vm_start = src->vm_start;
+ dest->vm_end = src->vm_end;
+ dest->anon_vma = src->anon_vma;
+ dest->vm_pgoff = src->vm_pgoff;
+ dest->vm_file = src->vm_file;
+ dest->vm_private_data = src->vm_private_data;
+ vm_flags_init(dest, src->vm_flags);
+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
+ sizeof(dest->vm_page_prot));
+ /*
+ * src->shared.rb may be modified concurrently when called from
+ * dup_mmap(), but the clone will reinitialize it.
+ */
+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
+ memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
+ sizeof(dest->vm_userfaultfd_ctx));
+#ifdef CONFIG_ANON_VMA_NAME
+ dest->anon_name = src->anon_name;
+#endif
+#ifdef CONFIG_SWAP
+ memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
+ sizeof(dest->swap_readahead_info));
+#endif
+#ifndef CONFIG_MMU
+ dest->vm_region = src->vm_region;
+#endif
+#ifdef CONFIG_NUMA
+ dest->vm_policy = src->vm_policy;
+#endif
+#ifdef __HAVE_PFNMAP_TRACKING
+ dest->pfnmap_track_ctx = NULL;
+#endif
+}
+
+#ifdef __HAVE_PFNMAP_TRACKING
+static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig,
+ struct vm_area_struct *new)
+{
+ struct pfnmap_track_ctx *ctx = orig->pfnmap_track_ctx;
+
+ if (likely(!ctx))
+ return 0;
+
+ /*
+ * We don't expect to ever hit this. If ever required, we would have
+ * to duplicate the tracking.
+ */
+ if (unlikely(kref_read(&ctx->kref) >= REFCOUNT_MAX))
+ return -ENOMEM;
+ kref_get(&ctx->kref);
+ new->pfnmap_track_ctx = ctx;
+ return 0;
+}
+
+static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma)
+{
+ struct pfnmap_track_ctx *ctx = vma->pfnmap_track_ctx;
+
+ if (likely(!ctx))
+ return;
+
+ kref_put(&ctx->kref, pfnmap_track_ctx_release);
+ vma->pfnmap_track_ctx = NULL;
+}
+#else
+static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig,
+ struct vm_area_struct *new)
+{
+ return 0;
+}
+static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma)
+{
+}
+#endif
+
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
+{
+ struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+
+ if (!new)
+ return NULL;
+
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ vm_area_init_from(orig, new);
+
+ if (vma_pfnmap_track_ctx_dup(orig, new)) {
+ kmem_cache_free(vm_area_cachep, new);
+ return NULL;
+ }
+ vma_lock_init(new, true);
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+ vma_numab_state_init(new);
+ dup_anon_vma_name(orig, new);
+
+ return new;
+}
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+ /* The vma should be detached while being destroyed. */
+ vma_assert_detached(vma);
+ vma_numab_state_free(vma);
+ free_anon_vma_name(vma);
+ vma_pfnmap_track_ctx_release(vma);
+ kmem_cache_free(vm_area_cachep, vma);
+}
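
As a rough usage sketch of the dup/free pair above (the split helper and its range arithmetic are invented for illustration; real splitting goes through __split_vma() and also links the copy into the VMA tree):

/* Illustrative only: duplicate a VMA to form the upper half of a split. */
static struct vm_area_struct *dup_upper_half_sketch(struct vm_area_struct *orig,
						    unsigned long addr)
{
	struct vm_area_struct *new = vm_area_dup(orig);

	if (!new)
		return NULL;

	new->vm_start = addr;
	new->vm_pgoff += (addr - orig->vm_start) >> PAGE_SHIFT;

	/* On any later failure, vm_area_free(new) while it is still detached. */
	return new;
}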
diff --git a/mm/vma_internal.h b/mm/vma_internal.h
index fc5f172a36bd..2f05735ff190 100644
--- a/mm/vma_internal.h
+++ b/mm/vma_internal.h
@@ -35,6 +35,7 @@
#include <linux/mutex.h>
#include <linux/pagemap.h>
#include <linux/perf_event.h>
+#include <linux/personality.h>
#include <linux/pfn.h>
#include <linux/rcupdate.h>
#include <linux/rmap.h>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b6e8e39af653..5edd536ba9d2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -104,6 +104,9 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+
do {
if (unlikely(!pte_none(ptep_get(pte)))) {
if (pfn_valid(pfn)) {
@@ -127,6 +130,8 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte += PFN_DOWN(size), addr += size, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
@@ -350,12 +355,30 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
{
pte_t *pte;
+ pte_t ptent;
+ unsigned long size = PAGE_SIZE;
pte = pte_offset_kernel(pmd, addr);
+ arch_enter_lazy_mmu_mode();
+
do {
- pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
+#ifdef CONFIG_HUGETLB_PAGE
+ size = arch_vmap_pte_range_unmap_size(addr, pte);
+ if (size != PAGE_SIZE) {
+ if (WARN_ON(!IS_ALIGNED(addr, size))) {
+ addr = ALIGN_DOWN(addr, size);
+ pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT));
+ }
+ ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size);
+ if (WARN_ON(end - addr < size))
+ size = end - addr;
+ } else
+#endif
+ ptent = ptep_get_and_clear(&init_mm, addr, pte);
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
}
@@ -374,8 +397,10 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
if (cleared || pmd_bad(*pmd))
*mask |= PGTBL_PMD_MODIFIED;
- if (cleared)
+ if (cleared) {
+ WARN_ON(next - addr < PMD_SIZE);
continue;
+ }
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next, mask);
@@ -399,8 +424,10 @@ static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
if (cleared || pud_bad(*pud))
*mask |= PGTBL_PUD_MODIFIED;
- if (cleared)
+ if (cleared) {
+ WARN_ON(next - addr < PUD_SIZE);
continue;
+ }
if (pud_none_or_clear_bad(pud))
continue;
vunmap_pmd_range(pud, addr, next, mask);
@@ -487,6 +514,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
+ int err = 0;
pte_t *pte;
/*
@@ -497,21 +525,33 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+
do {
struct page *page = pages[*nr];
- if (WARN_ON(!pte_none(ptep_get(pte))))
- return -EBUSY;
- if (WARN_ON(!page))
- return -ENOMEM;
- if (WARN_ON(!pfn_valid(page_to_pfn(page))))
- return -EINVAL;
+ if (WARN_ON(!pte_none(ptep_get(pte)))) {
+ err = -EBUSY;
+ break;
+ }
+ if (WARN_ON(!page)) {
+ err = -ENOMEM;
+ break;
+ }
+ if (WARN_ON(!pfn_valid(page_to_pfn(page)))) {
+ err = -EINVAL;
+ break;
+ }
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
- return 0;
+
+ return err;
}
static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
@@ -900,6 +940,11 @@ static struct vmap_node *vmap_nodes = &single;
static __read_mostly unsigned int nr_vmap_nodes = 1;
static __read_mostly unsigned int vmap_zone_size = 1;
+/* A simple iterator over all vmap-nodes. */
+#define for_each_vmap_node(vn) \
+ for ((vn) = &vmap_nodes[0]; \
+ (vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++)
+
static inline unsigned int
addr_to_node_id(unsigned long addr)
{
@@ -918,6 +963,19 @@ id_to_node(unsigned int id)
return &vmap_nodes[id % nr_vmap_nodes];
}
+static inline unsigned int
+node_to_id(struct vmap_node *node)
+{
+ /* Pointer arithmetic. */
+ unsigned int id = node - vmap_nodes;
+
+ if (likely(id < nr_vmap_nodes))
+ return id;
+
+	WARN_ONCE(1, "Address 0x%p is out of bounds.\n", node);
+ return 0;
+}
+
/*
* We use the value 0 to represent "no node", that is why
* an encoded value will be the node-id incremented by 1.
@@ -990,7 +1048,8 @@ static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static void drain_vmap_area_work(struct work_struct *work);
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
-static atomic_long_t nr_vmalloc_pages;
+static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages;
+static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr;
unsigned long vmalloc_nr_pages(void)
{
@@ -1056,12 +1115,11 @@ find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
{
unsigned long va_start_lowest;
struct vmap_node *vn;
- int i;
repeat:
- for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ va_start_lowest = 0;
+ for_each_vmap_node(vn) {
spin_lock(&vn->busy.lock);
*va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
@@ -1698,7 +1756,7 @@ va_clip(struct rb_root *root, struct list_head *head,
*/
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
if (!lva)
- return -1;
+ return -ENOMEM;
}
/*
@@ -1712,7 +1770,7 @@ va_clip(struct rb_root *root, struct list_head *head,
*/
va->va_start = nva_start_addr + size;
} else {
- return -1;
+ return -EINVAL;
}
if (type != FL_FIT_TYPE) {
@@ -1741,19 +1799,19 @@ va_alloc(struct vmap_area *va,
/* Check the "vend" restriction. */
if (nva_start_addr + size > vend)
- return vend;
+ return -ERANGE;
/* Update the free vmap_area. */
ret = va_clip(root, head, va, nva_start_addr, size);
if (WARN_ON_ONCE(ret))
- return vend;
+ return ret;
return nva_start_addr;
}
/*
* Returns a start address of the newly allocated area, if success.
- * Otherwise a vend is returned that indicates failure.
+ * Otherwise an error value is returned that indicates failure.
*/
static __always_inline unsigned long
__alloc_vmap_area(struct rb_root *root, struct list_head *head,
@@ -1778,14 +1836,13 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
if (unlikely(!va))
- return vend;
+ return -ENOENT;
nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);
- if (nva_start_addr == vend)
- return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
- find_vmap_lowest_match_check(root, head, size, align);
+ if (!IS_ERR_VALUE(nva_start_addr))
+ find_vmap_lowest_match_check(root, head, size, align);
#endif
return nva_start_addr;
@@ -1915,7 +1972,7 @@ node_alloc(unsigned long size, unsigned long align,
struct vmap_area *va;
*vn_id = 0;
- *addr = vend;
+ *addr = -EINVAL;
/*
* Fallback to a global heap if not vmalloc or there
@@ -1940,7 +1997,7 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm,
{
vm->flags = flags;
vm->addr = (void *)va->va_start;
- vm->size = va_size(va);
+ vm->size = vm->requested_size = va_size(va);
vm->caller = caller;
va->vm = vm;
}
@@ -1969,6 +2026,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
if (unlikely(!vmap_initialized))
return ERR_PTR(-EBUSY);
+ /* Only reclaim behaviour flags are relevant. */
+ gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
might_sleep();
/*
@@ -1981,8 +2040,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
*/
va = node_alloc(size, align, vstart, vend, &addr, &vn_id);
if (!va) {
- gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
-
va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
@@ -1995,20 +2052,20 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
}
retry:
- if (addr == vend) {
+ if (IS_ERR_VALUE(addr)) {
preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
size, align, vstart, vend);
spin_unlock(&free_vmap_area_lock);
}
- trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
+ trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr));
/*
- * If an allocation fails, the "vend" address is
+ * If an allocation fails, the error value is
* returned. Therefore trigger the overflow path.
*/
- if (unlikely(addr == vend))
+ if (IS_ERR_VALUE(addr))
goto overflow;
va->va_start = addr;
@@ -2032,7 +2089,7 @@ retry:
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
- ret = kasan_populate_vmalloc(addr, size);
+ ret = kasan_populate_vmalloc(addr, size, gfp_mask);
if (ret) {
free_vmap_area(va);
return ERR_PTR(ret);
@@ -2100,8 +2157,6 @@ static unsigned long lazy_max_pages(void)
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}
-static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
-
/*
* Serialize vmap purging. There is no actual critical section protected
* by this lock, but we want to avoid concurrent calls for performance
@@ -2111,7 +2166,6 @@ static DEFINE_MUTEX(vmap_purge_lock);
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
-static cpumask_t purge_nodes;
static void
reclaim_list_global(struct list_head *head)
@@ -2134,7 +2188,7 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
LIST_HEAD(decay_list);
struct rb_root decay_root = RB_ROOT;
struct vmap_area *va, *nva;
- unsigned long n_decay;
+ unsigned long n_decay, pool_len;
int i;
for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
@@ -2148,22 +2202,20 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
list_replace_init(&vn->pool[i].head, &tmp_list);
spin_unlock(&vn->pool_lock);
- if (full_decay)
- WRITE_ONCE(vn->pool[i].len, 0);
+ pool_len = n_decay = vn->pool[i].len;
+ WRITE_ONCE(vn->pool[i].len, 0);
/* Decay a pool by ~25% out of left objects. */
- n_decay = vn->pool[i].len >> 2;
+ if (!full_decay)
+ n_decay >>= 2;
+ pool_len -= n_decay;
list_for_each_entry_safe(va, nva, &tmp_list, list) {
+ if (!n_decay--)
+ break;
+
list_del_init(&va->list);
merge_or_add_vmap_area(va, &decay_root, &decay_list);
-
- if (!full_decay) {
- WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1);
-
- if (!--n_decay)
- break;
- }
}
/*
@@ -2172,9 +2224,10 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
* can populate the pool therefore a simple list replace
* operation takes place here.
*/
- if (!full_decay && !list_empty(&tmp_list)) {
+ if (!list_empty(&tmp_list)) {
spin_lock(&vn->pool_lock);
list_replace_init(&tmp_list, &vn->pool[i].head);
+ WRITE_ONCE(vn->pool[i].len, pool_len);
spin_unlock(&vn->pool_lock);
}
}
@@ -2244,6 +2297,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
{
unsigned long nr_purged_areas = 0;
unsigned int nr_purge_helpers;
+ static cpumask_t purge_nodes;
unsigned int nr_purge_nodes;
struct vmap_node *vn;
int i;
@@ -2255,9 +2309,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
*/
purge_nodes = CPU_MASK_NONE;
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
-
+ for_each_vmap_node(vn) {
INIT_LIST_HEAD(&vn->purge_list);
vn->skip_populate = full_pool_decay;
decay_va_pool_node(vn, full_pool_decay);
@@ -2276,7 +2328,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
end = max(end, list_last_entry(&vn->purge_list,
struct vmap_area, list)->va_end);
- cpumask_set_cpu(i, &purge_nodes);
+ cpumask_set_cpu(node_to_id(vn), &purge_nodes);
}
nr_purge_nodes = cpumask_weight(&purge_nodes);
@@ -2355,7 +2407,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
if (WARN_ON_ONCE(!list_empty(&va->list)))
return;
- nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT,
+ nr_lazy = atomic_long_add_return_relaxed(va_size(va) >> PAGE_SHIFT,
&vmap_lazy_nr);
/*
@@ -2421,7 +2473,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
if (va)
return va;
- } while ((i = (i + 1) % nr_vmap_nodes) != j);
+ } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
return NULL;
}
@@ -2447,7 +2499,7 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
if (va)
return va;
- } while ((i = (i + 1) % nr_vmap_nodes) != j);
+ } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
return NULL;
}
@@ -2916,10 +2968,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
*/
void vm_unmap_aliases(void)
{
- unsigned long start = ULONG_MAX, end = 0;
- int flush = 0;
-
- _vm_unmap_aliases(start, end, flush);
+ _vm_unmap_aliases(ULONG_MAX, 0, 0);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
@@ -3100,7 +3149,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
/*
* Before removing VM_UNINITIALIZED,
* we should make sure that vm has proper values.
- * Pair with smp_rmb() in show_numa_info().
+ * Pair with smp_rmb() in vread_iter() and vmalloc_info_show().
*/
smp_wmb();
vm->flags &= ~VM_UNINITIALIZED;
@@ -3133,6 +3182,7 @@ struct vm_struct *__get_vm_area_node(unsigned long size,
area->flags = flags;
area->caller = caller;
+ area->requested_size = requested_size;
va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
if (IS_ERR(va)) {
@@ -3370,12 +3420,13 @@ void vfree(const void *addr)
if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
vm_reset_perms(vm);
+	/* All pages of vm should be charged to the same memcg, so use the first one. */
+ if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES))
+ mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages);
for (i = 0; i < vm->nr_pages; i++) {
struct page *page = vm->pages[i];
BUG_ON(!page);
- if (!(vm->flags & VM_MAP_PUT_PAGES))
- mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
/*
* High-order allocs for huge vmallocs are split, so
* can be freed as an array of order-0 allocations
@@ -3562,11 +3613,11 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* but mempolicy wants to alloc memory by interleaving.
*/
if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
- nr = alloc_pages_bulk_array_mempolicy_noprof(gfp,
+ nr = alloc_pages_bulk_mempolicy_noprof(gfp,
nr_pages_request,
pages + nr_allocated);
else
- nr = alloc_pages_bulk_array_node_noprof(gfp, nid,
+ nr = alloc_pages_bulk_node_noprof(gfp, nid,
nr_pages_request,
pages + nr_allocated);
@@ -3671,12 +3722,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
node, page_order, nr_small_pages, area->pages);
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (gfp_mask & __GFP_ACCOUNT) {
- int i;
-
- for (i = 0; i < area->nr_pages; i++)
- mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
- }
+	/* All pages of vm should be charged to the same memcg, so use the first one. */
+ if (gfp_mask & __GFP_ACCOUNT && area->nr_pages)
+ mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC,
+ area->nr_pages);
/*
* If not enough pages were obtained to accomplish an
@@ -3771,8 +3820,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
struct vm_struct *area;
void *ret;
kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
- unsigned long real_size = size;
- unsigned long real_align = align;
+ unsigned long original_align = align;
unsigned int shift = PAGE_SHIFT;
if (WARN_ON_ONCE(!size))
@@ -3781,7 +3829,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
if ((size >> PAGE_SHIFT) > totalram_pages()) {
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, exceeds total pages",
- real_size);
+ size);
return NULL;
}
@@ -3798,19 +3846,18 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
else
shift = arch_vmap_pte_supported_shift(size);
- align = max(real_align, 1UL << shift);
- size = ALIGN(real_size, 1UL << shift);
+ align = max(original_align, 1UL << shift);
}
again:
- area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
+ area = __get_vm_area_node(size, align, shift, VM_ALLOC |
VM_UNINITIALIZED | vm_flags, start, end, node,
gfp_mask, caller);
if (!area) {
bool nofail = gfp_mask & __GFP_NOFAIL;
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, vm_struct allocation failed%s",
- real_size, (nofail) ? ". Retrying." : "");
+ size, (nofail) ? ". Retrying." : "");
if (nofail) {
schedule_timeout_uninterruptible(1);
goto again;
@@ -3860,7 +3907,7 @@ again:
(gfp_mask & __GFP_SKIP_ZERO))
kasan_flags |= KASAN_VMALLOC_INIT;
/* KASAN_VMALLOC_PROT_NORMAL already set if required. */
- area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
+ area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags);
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -3869,17 +3916,15 @@ again:
*/
clear_vm_uninitialized_flag(area);
- size = PAGE_ALIGN(size);
if (!(vm_flags & VM_DEFER_KMEMLEAK))
- kmemleak_vmalloc(area, size, gfp_mask);
+ kmemleak_vmalloc(area, PAGE_ALIGN(size), gfp_mask);
return area->addr;
fail:
if (shift > PAGE_SHIFT) {
shift = PAGE_SHIFT;
- align = real_align;
- size = real_size;
+ align = original_align;
goto again;
}
@@ -3947,9 +3992,10 @@ void *vmalloc_noprof(unsigned long size)
EXPORT_SYMBOL(vmalloc_noprof);
/**
- * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
+ * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages
* @size: allocation size
* @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
*
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
@@ -3958,13 +4004,13 @@ EXPORT_SYMBOL(vmalloc_noprof);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask)
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{
return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
- gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
- NUMA_NO_NODE, __builtin_return_address(0));
+ gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ node, __builtin_return_address(0));
}
-EXPORT_SYMBOL_GPL(vmalloc_huge_noprof);
+EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof);
/**
* vzalloc - allocate virtually contiguous memory with zero fill
@@ -4067,6 +4113,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof);
*/
void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
{
+ struct vm_struct *vm = NULL;
+ size_t alloced_size = 0;
size_t old_size = 0;
void *n;
@@ -4076,15 +4124,17 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
}
if (p) {
- struct vm_struct *vm;
-
vm = find_vm_area(p);
if (unlikely(!vm)) {
WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p);
return NULL;
}
- old_size = get_vm_area_size(vm);
+ alloced_size = get_vm_area_size(vm);
+ old_size = vm->requested_size;
+ if (WARN(alloced_size < old_size,
+ "vrealloc() has mismatched area vs requested sizes (%p)\n", p))
+ return NULL;
}
/*
@@ -4092,11 +4142,26 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
* would be a good heuristic for when to shrink the vm_area?
*/
if (size <= old_size) {
- /* Zero out spare memory. */
- if (want_init_on_alloc(flags))
+ /* Zero out "freed" memory, potentially for future realloc. */
+ if (want_init_on_free() || want_init_on_alloc(flags))
memset((void *)p + size, 0, old_size - size);
+ vm->requested_size = size;
kasan_poison_vmalloc(p + size, old_size - size);
- kasan_unpoison_vmalloc(p, size, KASAN_VMALLOC_PROT_NORMAL);
+ return (void *)p;
+ }
+
+ /*
+ * We already have the bytes available in the allocation; use them.
+ */
+ if (size <= alloced_size) {
+ kasan_unpoison_vmalloc(p + old_size, size - old_size,
+ KASAN_VMALLOC_PROT_NORMAL);
+ /*
+ * No need to zero memory here, as unused memory will have
+ * already been zeroed at initial allocation time or during
+ * realloc shrink time.
+ */
+ vm->requested_size = size;
return (void *)p;
}
@@ -4761,7 +4826,7 @@ retry:
/* populate the kasan shadow space */
for (area = 0; area < nr_vms; area++) {
- if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
+ if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL))
goto err_free_shadow;
}
@@ -4918,39 +4983,37 @@ bool vmalloc_dump_obj(void *object)
#endif
#ifdef CONFIG_PROC_FS
-static void show_numa_info(struct seq_file *m, struct vm_struct *v)
-{
- if (IS_ENABLED(CONFIG_NUMA)) {
- unsigned int nr, *counters = m->private;
- unsigned int step = 1U << vm_area_page_order(v);
- if (!counters)
- return;
+/*
+ * Print the number of pages allocated on each memory node.
+ *
+ * This function can only be called if CONFIG_NUMA is enabled
+ * and the VM_UNINITIALIZED bit in v->flags is cleared.
+ */
+static void show_numa_info(struct seq_file *m, struct vm_struct *v,
+ unsigned int *counters)
+{
+ unsigned int nr;
+ unsigned int step = 1U << vm_area_page_order(v);
- if (v->flags & VM_UNINITIALIZED)
- return;
- /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
- smp_rmb();
+ if (!counters)
+ return;
- memset(counters, 0, nr_node_ids * sizeof(unsigned int));
+ memset(counters, 0, nr_node_ids * sizeof(unsigned int));
- for (nr = 0; nr < v->nr_pages; nr += step)
- counters[page_to_nid(v->pages[nr])] += step;
- for_each_node_state(nr, N_HIGH_MEMORY)
- if (counters[nr])
- seq_printf(m, " N%u=%u", nr, counters[nr]);
- }
+ for (nr = 0; nr < v->nr_pages; nr += step)
+ counters[page_to_nid(v->pages[nr])] += step;
+ for_each_node_state(nr, N_HIGH_MEMORY)
+ if (counters[nr])
+ seq_printf(m, " N%u=%u", nr, counters[nr]);
}
static void show_purge_info(struct seq_file *m)
{
struct vmap_node *vn;
struct vmap_area *va;
- int i;
-
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ for_each_vmap_node(vn) {
spin_lock(&vn->lazy.lock);
list_for_each_entry(va, &vn->lazy.head, list) {
seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
@@ -4966,11 +5029,12 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
struct vmap_node *vn;
struct vmap_area *va;
struct vm_struct *v;
- int i;
+ unsigned int *counters;
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ if (IS_ENABLED(CONFIG_NUMA))
+ counters = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
+ for_each_vmap_node(vn) {
spin_lock(&vn->busy.lock);
list_for_each_entry(va, &vn->busy.head, list) {
if (!va->vm) {
@@ -4983,6 +5047,11 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
}
v = va->vm;
+ if (v->flags & VM_UNINITIALIZED)
+ continue;
+
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
@@ -5017,7 +5086,9 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
if (is_vmalloc_addr(v->pages))
seq_puts(m, " vpages");
- show_numa_info(m, v);
+ if (IS_ENABLED(CONFIG_NUMA))
+ show_numa_info(m, v, counters);
+
seq_putc(m, '\n');
}
spin_unlock(&vn->busy.lock);
@@ -5027,19 +5098,14 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
* As a final step, dump "unpurged" areas.
*/
show_purge_info(m);
+ if (IS_ENABLED(CONFIG_NUMA))
+ kfree(counters);
return 0;
}
static int __init proc_vmalloc_init(void)
{
- void *priv_data = NULL;
-
- if (IS_ENABLED(CONFIG_NUMA))
- priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
-
- proc_create_single_data("vmallocinfo",
- 0400, NULL, vmalloc_info_show, priv_data);
-
+ proc_create_single("vmallocinfo", 0400, NULL, vmalloc_info_show);
return 0;
}
module_init(proc_vmalloc_init);
@@ -5091,7 +5157,7 @@ static void __init vmap_init_free_space(void)
static void vmap_init_nodes(void)
{
struct vmap_node *vn;
- int i, n;
+ int i;
#if BITS_PER_LONG == 64
/*
@@ -5108,7 +5174,7 @@ static void vmap_init_nodes(void)
* set of cores. Therefore a per-domain purging is supposed to
* be added as well as a per-domain balancing.
*/
- n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
+ int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
if (n > 1) {
vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
@@ -5123,8 +5189,7 @@ static void vmap_init_nodes(void)
}
#endif
- for (n = 0; n < nr_vmap_nodes; n++) {
- vn = &vmap_nodes[n];
+ for_each_vmap_node(vn) {
vn->busy.root = RB_ROOT;
INIT_LIST_HEAD(&vn->busy.head);
spin_lock_init(&vn->busy.lock);
@@ -5145,15 +5210,13 @@ static void vmap_init_nodes(void)
static unsigned long
vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
- unsigned long count;
+ unsigned long count = 0;
struct vmap_node *vn;
- int i, j;
-
- for (count = 0, i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ int i;
- for (j = 0; j < MAX_VA_SIZE_PAGES; j++)
- count += READ_ONCE(vn->pool[j].len);
+ for_each_vmap_node(vn) {
+ for (i = 0; i < MAX_VA_SIZE_PAGES; i++)
+ count += READ_ONCE(vn->pool[i].len);
}
return count ? count : SHRINK_EMPTY;
@@ -5162,10 +5225,10 @@ vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
static unsigned long
vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- int i;
+ struct vmap_node *vn;
- for (i = 0; i < nr_vmap_nodes; i++)
- decay_va_pool_node(&vmap_nodes[i], true);
+ for_each_vmap_node(vn)
+ decay_va_pool_node(vn, true);
return SHRINK_STOP;
}
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index bd5183dfd879..c197ed47bcc4 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -316,7 +316,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
* asserted for a second in which subsequent
* pressure events can occur.
*/
- WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
+ mem_cgroup_set_socket_pressure(memcg);
}
}
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c22175120f5d..674999999cd0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -57,6 +57,7 @@
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <linux/mmu_notifier.h>
+#include <linux/parser.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -93,10 +94,8 @@ struct scan_control {
unsigned long anon_cost;
unsigned long file_cost;
-#ifdef CONFIG_MEMCG
/* Swappiness value for proactive reclaim. Always use sc_swappiness()! */
int *proactive_swappiness;
-#endif
/* Can active folios be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
@@ -120,7 +119,7 @@ struct scan_control {
/* Has cache_trim_mode failed at least once? */
unsigned int cache_trim_mode_failed:1;
- /* Proactive reclaim invoked by userspace through memory.reclaim */
+ /* Proactive reclaim invoked by userspace */
unsigned int proactive:1;
/*
@@ -271,6 +270,25 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
}
#endif
+/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
+ * and including the specified highidx
+ * @zone: The current zone in the iterator
+ * @pgdat: The pgdat which node_zones are being iterated
+ * @idx: The index variable
+ * @highidx: The index of the highest zone to return
+ *
+ * This macro iterates through all managed zones up to and including the specified highidx.
+ * The zone iterator enters an invalid state after the macro call and must be reinitialized
+ * before it can be used again.
+ */
+#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \
+ for ((idx) = 0, (zone) = (pgdat)->node_zones; \
+ (idx) <= (highidx); \
+ (idx)++, (zone)++) \
+ if (!managed_zone(zone)) \
+ continue; \
+ else
+
static void set_task_reclaim_state(struct task_struct *task,
struct reclaim_state *rs)
{
@@ -323,16 +341,22 @@ static void flush_reclaim_state(struct scan_control *sc)
}
}
-static bool can_demote(int nid, struct scan_control *sc)
+static bool can_demote(int nid, struct scan_control *sc,
+ struct mem_cgroup *memcg)
{
+ int demotion_nid;
+
if (!numa_demotion_enabled)
return false;
if (sc && sc->no_demotion)
return false;
- if (next_demotion_node(nid) == NUMA_NO_NODE)
+
+ demotion_nid = next_demotion_node(nid);
+ if (demotion_nid == NUMA_NO_NODE)
return false;
- return true;
+ /* If demotion node isn't in the cgroup's mems_allowed, fall back */
+ return mem_cgroup_node_allowed(memcg, demotion_nid);
}
static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -357,7 +381,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
*
* Can it be reclaimed from this node via demotion?
*/
- return can_demote(nid, sc);
+ return can_demote(nid, sc, memcg);
}
/*
@@ -396,13 +420,9 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
{
unsigned long size = 0;
int zid;
+ struct zone *zone;
- for (zid = 0; zid <= zone_idx; zid++) {
- struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) {
if (!mem_cgroup_disabled())
size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
else
@@ -441,21 +461,26 @@ void drop_slab(void)
} while ((freed >> shift++) > 1);
}
-static int reclaimer_offset(void)
+#define CHECK_RECLAIMER_OFFSET(type) \
+ do { \
+ BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \
+ PGDEMOTE_##type - PGDEMOTE_KSWAPD); \
+ BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \
+ PGSCAN_##type - PGSCAN_KSWAPD); \
+ } while (0)
+
+static int reclaimer_offset(struct scan_control *sc)
{
- BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
- PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
- PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
- PGSCAN_DIRECT - PGSCAN_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
- PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
+ CHECK_RECLAIMER_OFFSET(DIRECT);
+ CHECK_RECLAIMER_OFFSET(KHUGEPAGED);
+ CHECK_RECLAIMER_OFFSET(PROACTIVE);
if (current_is_kswapd())
return 0;
if (current_is_khugepaged())
return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
+ if (sc->proactive)
+ return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD;
return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}
@@ -495,7 +520,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
{
int reclaimable = 0, write_pending = 0;
int i;
-
+ struct zone *zone;
/*
* If kswapd is disabled, reschedule if necessary but do not
* throttle as the system is likely near OOM.
@@ -508,12 +533,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
* throttle as throttling will occur when the folios cycle
* towards the end of the LRU if still under writeback.
*/
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) {
reclaimable += zone_reclaimable_pages(zone);
write_pending += zone_page_state_snapshot(zone,
NR_ZONE_WRITE_PENDING);
@@ -631,23 +651,53 @@ typedef enum {
PAGE_CLEAN,
} pageout_t;
+static pageout_t writeout(struct folio *folio, struct address_space *mapping,
+ struct swap_iocb **plug, struct list_head *folio_list)
+{
+ int res;
+
+ folio_set_reclaim(folio);
+
+ /*
+ * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled
+ * or we failed to allocate contiguous swap entries, in which case
+ * the split out folios get added back to folio_list.
+ */
+ if (shmem_mapping(mapping))
+ res = shmem_writeout(folio, plug, folio_list);
+ else
+ res = swap_writeout(folio, plug);
+
+ if (res < 0)
+ handle_write_error(mapping, folio, res);
+ if (res == AOP_WRITEPAGE_ACTIVATE) {
+ folio_clear_reclaim(folio);
+ return PAGE_ACTIVATE;
+ }
+
+ /* synchronous write? */
+ if (!folio_test_writeback(folio))
+ folio_clear_reclaim(folio);
+
+ trace_mm_vmscan_write_folio(folio);
+ node_stat_add_folio(folio, NR_VMSCAN_WRITE);
+ return PAGE_SUCCESS;
+}
+
/*
* pageout is called by shrink_folio_list() for each dirty folio.
- * Calls ->writepage().
*/
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
struct swap_iocb **plug, struct list_head *folio_list)
{
/*
- * If the folio is dirty, only perform writeback if that write
- * will be non-blocking. To prevent this allocation from being
- * stalled by pagecache activity. But note that there may be
- * stalls if we need to run get_block(). We could test
- * PagePrivate for that.
- *
- * If this process is currently in __generic_file_write_iter() against
- * this folio's queue, we can perform writeback even if that
- * will block.
+	 * We no longer attempt to write back filesystem folios here, other
+ * than tmpfs/shmem. That's taken care of in page-writeback.
+ * If we find a dirty filesystem folio at the end of the LRU list,
+ * typically that means the filesystem is saturating the storage
+ * with contiguous writes and telling it to write a folio here
+ * would only make the situation worse by injecting an element
+ * of random access.
*
* If the folio is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
@@ -670,47 +720,12 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
return PAGE_KEEP;
}
- if (mapping->a_ops->writepage == NULL)
- return PAGE_ACTIVATE;
-
- if (folio_clear_dirty_for_io(folio)) {
- int res;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = SWAP_CLUSTER_MAX,
- .range_start = 0,
- .range_end = LLONG_MAX,
- .for_reclaim = 1,
- .swap_plug = plug,
- };
-
- /*
- * The large shmem folio can be split if CONFIG_THP_SWAP is
- * not enabled or contiguous swap entries are failed to
- * allocate.
- */
- if (shmem_mapping(mapping) && folio_test_large(folio))
- wbc.list = folio_list;
-
- folio_set_reclaim(folio);
- res = mapping->a_ops->writepage(&folio->page, &wbc);
- if (res < 0)
- handle_write_error(mapping, folio, res);
- if (res == AOP_WRITEPAGE_ACTIVATE) {
- folio_clear_reclaim(folio);
- return PAGE_ACTIVATE;
- }
- if (!folio_test_writeback(folio)) {
- /* synchronous write or broken a_ops? */
- folio_clear_reclaim(folio);
- }
- trace_mm_vmscan_write_folio(folio);
- node_stat_add_folio(folio, NR_VMSCAN_WRITE);
- return PAGE_SUCCESS;
- }
-
- return PAGE_CLEAN;
+ if (!shmem_mapping(mapping) && !folio_test_anon(folio))
+ return PAGE_ACTIVATE;
+ if (!folio_clear_dirty_for_io(folio))
+ return PAGE_CLEAN;
+ return writeout(folio, mapping, plug, folio_list);
}
/*
@@ -769,7 +784,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
__delete_from_swap_cache(folio, swap, shadow);
- mem_cgroup_swapout(folio, swap);
+ memcg1_swapout(folio, swap);
xa_unlock_irq(&mapping->i_pages);
put_swap_folio(folio, swap);
} else {
@@ -862,15 +877,39 @@ enum folio_references {
FOLIOREF_ACTIVATE,
};
+#ifdef CONFIG_LRU_GEN
+/*
+ * Only used on a mapped folio in the eviction (rmap walk) path, where promotion
+ * needs to be done by taking the folio off the LRU list and then adding it back
+ * with PG_active set. In contrast, the aging (page table walk) path uses
+ * folio_update_gen().
+ */
+static bool lru_gen_set_refs(struct folio *folio)
+{
+ /* see the comment on LRU_REFS_FLAGS */
+ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ return false;
+ }
+
+ set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset));
+ return true;
+}
+#else
+static bool lru_gen_set_refs(struct folio *folio)
+{
+ return false;
+}
+#endif /* CONFIG_LRU_GEN */
+
static enum folio_references folio_check_references(struct folio *folio,
struct scan_control *sc)
{
int referenced_ptes, referenced_folio;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
&vm_flags);
- referenced_folio = folio_test_clear_referenced(folio);
/*
* The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
@@ -888,6 +927,15 @@ static enum folio_references folio_check_references(struct folio *folio,
if (referenced_ptes == -1)
return FOLIOREF_KEEP;
+ if (lru_gen_enabled()) {
+ if (!referenced_ptes)
+ return FOLIOREF_RECLAIM;
+
+ return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP;
+ }
+
+ referenced_folio = folio_test_clear_referenced(folio);
+
if (referenced_ptes) {
/*
* All mapped folios start out with page table
@@ -957,7 +1005,8 @@ static void folio_check_dirty_writeback(struct folio *folio,
mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
}
-struct folio *alloc_migrate_folio(struct folio *src, unsigned long private)
+static struct folio *alloc_demote_folio(struct folio *src,
+ unsigned long private)
{
struct folio *dst;
nodemask_t *allowed_mask;
@@ -1020,7 +1069,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
node_get_allowed_targets(pgdat, &allowed_mask);
/* Demotion ignores all cpuset and mempolicy settings */
- migrate_pages(demote_folios, alloc_migrate_folio, NULL,
+ migrate_pages(demote_folios, alloc_demote_folio, NULL,
(unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
@@ -1048,7 +1097,8 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
*/
static unsigned int shrink_folio_list(struct list_head *folio_list,
struct pglist_data *pgdat, struct scan_control *sc,
- struct reclaim_stat *stat, bool ignore_references)
+ struct reclaim_stat *stat, bool ignore_references,
+ struct mem_cgroup *memcg)
{
struct folio_batch free_folios;
LIST_HEAD(ret_folios);
@@ -1061,7 +1111,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
folio_batch_init(&free_folios);
memset(stat, 0, sizeof(*stat));
cond_resched();
- do_demote_pass = can_demote(pgdat->node_id, sc);
+ do_demote_pass = can_demote(pgdat->node_id, sc, memcg);
retry:
while (!list_empty(folio_list)) {
@@ -1079,6 +1129,21 @@ retry:
if (!folio_trylock(folio))
goto keep;
+ if (folio_contain_hwpoisoned_page(folio)) {
+ /*
+			 * unmap_poisoned_folio() can't handle a large
+			 * folio, so just skip it. memory_failure() will
+ * handle it if the UCE is triggered again.
+ */
+ if (folio_test_large(folio))
+ goto keep_locked;
+
+ unmap_poisoned_folio(folio, folio_pfn(folio), false);
+ folio_unlock(folio);
+ folio_put(folio);
+ continue;
+ }
+
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
nr_pages = folio_nr_pages(folio);
@@ -1092,11 +1157,6 @@ retry:
if (!sc->may_unmap && folio_mapped(folio))
goto keep_locked;
- /* folio_update_gen() tried to promote this page? */
- if (lru_gen_enabled() && !ignore_references &&
- folio_mapped(folio) && folio_test_referenced(folio))
- goto keep_locked;
-
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested. kswapd will stall and start writing
@@ -1137,8 +1197,10 @@ retry:
* 2) Global or new memcg reclaim encounters a folio that is
* not marked for immediate reclaim, or the caller does not
* have __GFP_FS (or __GFP_IO if it's simply going to swap,
- * not to fs). In this case mark the folio for immediate
- * reclaim and continue scanning.
+ * not to fs), or the folio belongs to a mapping where
+ * waiting on writeback during reclaim may lead to a deadlock.
+ * In this case mark the folio for immediate reclaim and
+ * continue scanning.
*
* Require may_enter_fs() because we would wait on fs, which
* may not have submitted I/O yet. And the loop driver might
@@ -1163,6 +1225,8 @@ retry:
* takes to write them to disk.
*/
if (folio_test_writeback(folio)) {
+ mapping = folio_mapping(folio);
+
/* Case 1 above */
if (current_is_kswapd() &&
folio_test_reclaim(folio) &&
@@ -1173,7 +1237,9 @@ retry:
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
!folio_test_reclaim(folio) ||
- !may_enter_fs(folio, sc->gfp_mask)) {
+ !may_enter_fs(folio, sc->gfp_mask) ||
+ (mapping &&
+ mapping_writeback_may_deadlock_on_reclaim(mapping))) {
/*
* This is slightly racy -
* folio_end_writeback() might have
@@ -1251,7 +1317,7 @@ retry:
split_folio_to_list(folio, folio_list))
goto activate_locked;
}
- if (!add_to_swap(folio)) {
+ if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) {
int __maybe_unused order = folio_order(folio);
if (!folio_test_large(folio))
@@ -1267,9 +1333,21 @@ retry:
}
#endif
count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
- if (!add_to_swap(folio))
+ if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN))
goto activate_locked_split;
}
+ /*
+			 * Normally the folio will be dirtied in unmap because its
+			 * pte should be dirty. A special case is an MADV_FREE page:
+			 * its pte could have the dirty bit cleared while the folio's
+			 * SwapBacked flag is still set, because clearing the dirty
+			 * bit and the SwapBacked flag is not protected by any lock.
+			 * For such a folio, unmap will not set the dirty bit, so
+			 * folio reclaim will not write the folio out. This can cause
+			 * data corruption when the folio is swapped in later. Always
+			 * setting the dirty flag for the folio solves the problem.
+ */
+ folio_mark_dirty(folio);
}
}
@@ -1580,9 +1658,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
unsigned int noreclaim_flag;
list_for_each_entry_safe(folio, next, folio_list, lru) {
+ /* TODO: these pages should not even appear in this list. */
+ if (page_has_movable_ops(&folio->page))
+ continue;
if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
- !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
- !folio_test_unevictable(folio)) {
+ !folio_test_dirty(folio) && !folio_test_unevictable(folio)) {
folio_clear_active(folio);
list_move(&folio->lru, &clean_folios);
}
@@ -1596,7 +1676,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
*/
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
- &stat, true);
+ &stat, true, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
list_splice(&clean_folios, folio_list);
@@ -1663,12 +1743,11 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
unsigned long nr_taken = 0;
unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
- unsigned long skipped = 0;
- unsigned long scan, total_scan, nr_pages;
+ unsigned long skipped = 0, total_scan = 0, scan = 0;
+ unsigned long nr_pages;
+ unsigned long max_nr_skipped = 0;
LIST_HEAD(folios_skipped);
- total_scan = 0;
- scan = 0;
while (scan < nr_to_scan && !list_empty(src)) {
struct list_head *move_to = src;
struct folio *folio;
@@ -1679,9 +1758,12 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
nr_pages = folio_nr_pages(folio);
total_scan += nr_pages;
- if (folio_zonenum(folio) > sc->reclaim_idx) {
+		/* Using max_nr_skipped to prevent a hard lockup */
+ if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED &&
+ (folio_zonenum(folio) > sc->reclaim_idx)) {
nr_skipped[folio_zonenum(folio)] += nr_pages;
move_to = &folios_skipped;
+ max_nr_skipped++;
goto move;
}
@@ -1954,10 +2036,10 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- item = PGSCAN_KSWAPD + reclaimer_offset();
+ item = PGSCAN_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
- __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
__count_vm_events(PGSCAN_ANON + file, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
@@ -1965,22 +2047,23 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
+ nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
+ lruvec_memcg(lruvec));
spin_lock_irq(&lruvec->lru_lock);
move_folios_to_lru(lruvec, &folio_list);
- __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
+ __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
stat.nr_demoted);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- item = PGSTEAL_KSWAPD + reclaimer_offset();
+ item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
- __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
- spin_unlock_irq(&lruvec->lru_lock);
- lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
+ lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout,
+ nr_scanned - nr_reclaimed);
/*
* If dirty folios are scanned that are not queued for IO, it
@@ -2046,7 +2129,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
{
unsigned long nr_taken;
unsigned long nr_scanned;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
LIST_HEAD(l_hold); /* The folios which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
@@ -2066,7 +2149,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
if (!cgroup_reclaim(sc))
__count_vm_events(PGREFILL, nr_scanned);
- __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2123,13 +2206,11 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
__count_vm_events(PGDEACTIVATE, nr_deactivate);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&lruvec->lru_lock);
- if (nr_rotated)
- lru_note_cost(lruvec, file, 0, nr_rotated);
+ lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
}
@@ -2148,7 +2229,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
.no_demotion = 1,
};
- nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL);
while (!list_empty(folio_list)) {
folio = lru_to_folio(folio_list);
list_del(&folio->lru);
@@ -2340,17 +2421,13 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
unsigned long total_high_wmark = 0;
unsigned long free, anon;
int z;
+ struct zone *zone;
free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
file = node_page_state(pgdat, NR_ACTIVE_FILE) +
node_page_state(pgdat, NR_INACTIVE_FILE);
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = &pgdat->node_zones[z];
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) {
total_high_wmark += high_wmark_pages(zone);
}
@@ -2368,6 +2445,106 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
}
}
+static inline void calculate_pressure_balance(struct scan_control *sc,
+ int swappiness, u64 *fraction, u64 *denominator)
+{
+ unsigned long anon_cost, file_cost, total_cost;
+ unsigned long ap, fp;
+
+ /*
+ * Calculate the pressure balance between anon and file pages.
+ *
+ * The amount of pressure we put on each LRU is inversely
+ * proportional to the cost of reclaiming each list, as
+ * determined by the share of pages that are refaulting, times
+ * the relative IO cost of bringing back a swapped out
+ * anonymous page vs reloading a filesystem page (swappiness).
+ *
+ * Although we limit that influence to ensure no list gets
+ * left behind completely: at least a third of the pressure is
+ * applied, before swappiness.
+ *
+ * With swappiness at 100, anon and file have equal IO cost.
+ */
+ total_cost = sc->anon_cost + sc->file_cost;
+ anon_cost = total_cost + sc->anon_cost;
+ file_cost = total_cost + sc->file_cost;
+ total_cost = anon_cost + file_cost;
+
+ ap = swappiness * (total_cost + 1);
+ ap /= anon_cost + 1;
+
+ fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
+ fp /= file_cost + 1;
+
+ fraction[WORKINGSET_ANON] = ap;
+ fraction[WORKINGSET_FILE] = fp;
+ *denominator = ap + fp;
+}
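
A worked example of the factored-out balance calculation, with invented costs and MAX_SWAPPINESS assumed to be 200:

/*
 * Illustration only.  With sc->anon_cost = 100, sc->file_cost = 300 and
 * swappiness = 60:
 *
 *   total_cost = 100 + 300 = 400
 *   anon_cost  = 400 + 100 = 500
 *   file_cost  = 400 + 300 = 700
 *   total_cost = 500 + 700 = 1200
 *
 *   ap = 60  * 1201 / 501 = 143
 *   fp = 140 * 1201 / 701 = 239
 *
 * so anon gets roughly 143/382 ~= 37% of the scan pressure and file ~63%:
 * the swappiness of 60 tilts reclaim toward the file list even though the
 * file list's recent refault cost is higher.
 */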
+
+static unsigned long apply_proportional_protection(struct mem_cgroup *memcg,
+ struct scan_control *sc, unsigned long scan)
+{
+ unsigned long min, low;
+
+ mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low);
+
+ if (min || low) {
+ /*
+ * Scale a cgroup's reclaim pressure by proportioning
+ * its current usage to its memory.low or memory.min
+ * setting.
+ *
+ * This is important, as otherwise scanning aggression
+ * becomes extremely binary -- from nothing as we
+ * approach the memory protection threshold, to totally
+ * nominal as we exceed it. This results in requiring
+ * setting extremely liberal protection thresholds. It
+ * also means we simply get no protection at all if we
+ * set it too low, which is not ideal.
+ *
+ * If there is any protection in place, we reduce scan
+ * pressure by how much of the total memory used is
+ * within protection thresholds.
+ *
+ * There is one special case: in the first reclaim pass,
+ * we skip over all groups that are within their low
+ * protection. If that fails to reclaim enough pages to
+ * satisfy the reclaim goal, we come back and override
+ * the best-effort low protection. However, we still
+ * ideally want to honor how well-behaved groups are in
+ * that case instead of simply punishing them all
+ * equally. As such, we reclaim them based on how much
+ * memory they are using, reducing the scan pressure
+ * again by how much of the total memory used is under
+ * hard protection.
+ */
+ unsigned long cgroup_size = mem_cgroup_size(memcg);
+ unsigned long protection;
+
+ /* memory.low scaling, make sure we retry before OOM */
+ if (!sc->memcg_low_reclaim && low > min) {
+ protection = low;
+ sc->memcg_low_skipped = 1;
+ } else {
+ protection = min;
+ }
+
+ /* Avoid TOCTOU with earlier protection check */
+ cgroup_size = max(cgroup_size, protection);
+
+ scan -= scan * protection / (cgroup_size + 1);
+
+ /*
+ * Minimally target SWAP_CLUSTER_MAX pages to keep
+ * reclaim moving forwards, avoiding decrementing
+ * sc->priority further than desirable.
+ */
+ scan = max(scan, SWAP_CLUSTER_MAX);
+ }
+ return scan;
+}
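
And a worked example of the protection scaling this helper now encapsulates (all figures invented):

/*
 * Illustration only.  Suppose a cgroup uses 1000 pages (cgroup_size),
 * memory.min protects 600 of them (protection = 600) and the caller asked to
 * scan 512 pages:
 *
 *   scan -= 512 * 600 / (1000 + 1)   ->   scan = 512 - 306 = 206
 *   scan  = max(206, SWAP_CLUSTER_MAX)
 *
 * i.e. roughly the unprotected 40% of the usage is still scanned, and the
 * result never drops below SWAP_CLUSTER_MAX so reclaim keeps making forward
 * progress.
 */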
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned.
@@ -2380,12 +2557,10 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- unsigned long anon_cost, file_cost, total_cost;
int swappiness = sc_swappiness(sc, memcg);
u64 fraction[ANON_AND_FILE];
u64 denominator = 0; /* gcc */
enum scan_balance scan_balance;
- unsigned long ap, fp;
enum lru_list lru;
/* If we have no swap space, do not bother scanning anon folios. */
@@ -2406,6 +2581,13 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
goto out;
}
+ /* Proactive reclaim initiated by userspace for anonymous memory only */
+ if (swappiness == SWAPPINESS_ANON_ONLY) {
+ WARN_ON_ONCE(!sc->proactive);
+ scan_balance = SCAN_ANON;
+ goto out;
+ }
+
/*
* Do not apply any pressure balancing cleverness when the
* system is close to OOM, scan both anon and file equally
@@ -2426,7 +2608,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
/*
* If there is enough inactive page cache, we do not reclaim
- * anything from the anonymous working right now.
+	 * anything from the anonymous working set right now to make sure
+ * a streaming file access pattern doesn't cause swapping.
*/
if (sc->cache_trim_mode) {
scan_balance = SCAN_FILE;
@@ -2434,103 +2617,16 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
}
scan_balance = SCAN_FRACT;
- /*
- * Calculate the pressure balance between anon and file pages.
- *
- * The amount of pressure we put on each LRU is inversely
- * proportional to the cost of reclaiming each list, as
- * determined by the share of pages that are refaulting, times
- * the relative IO cost of bringing back a swapped out
- * anonymous page vs reloading a filesystem page (swappiness).
- *
- * Although we limit that influence to ensure no list gets
- * left behind completely: at least a third of the pressure is
- * applied, before swappiness.
- *
- * With swappiness at 100, anon and file have equal IO cost.
- */
- total_cost = sc->anon_cost + sc->file_cost;
- anon_cost = total_cost + sc->anon_cost;
- file_cost = total_cost + sc->file_cost;
- total_cost = anon_cost + file_cost;
-
- ap = swappiness * (total_cost + 1);
- ap /= anon_cost + 1;
+ calculate_pressure_balance(sc, swappiness, fraction, &denominator);
- fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
- fp /= file_cost + 1;
-
- fraction[0] = ap;
- fraction[1] = fp;
- denominator = ap + fp;
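
calculate_pressure_balance() presumably encapsulates the block removed above; a self-contained sketch of that arithmetic, with MAX_SWAPPINESS assumed to be 200 as in include/linux/swap.h:

#include <stdio.h>

#define MAX_SWAPPINESS 200

/*
 * Split scan pressure between anon and file in inverse proportion to their
 * reclaim cost, weighted by swappiness; fraction[0]/denominator is the anon
 * share.  Adding the combined cost to each side keeps at least ~1/3 of the
 * pressure on either list.
 */
static void pressure_balance(unsigned long long anon_cost, unsigned long long file_cost,
			     int swappiness, unsigned long long fraction[2],
			     unsigned long long *denominator)
{
	unsigned long long total = anon_cost + file_cost;
	unsigned long long ap, fp;

	anon_cost = total + anon_cost;
	file_cost = total + file_cost;
	total = anon_cost + file_cost;

	ap = swappiness * (total + 1) / (anon_cost + 1);
	fp = (MAX_SWAPPINESS - swappiness) * (total + 1) / (file_cost + 1);

	fraction[0] = ap;
	fraction[1] = fp;
	*denominator = ap + fp;
}

int main(void)
{
	unsigned long long fraction[2], denominator;

	/* file refaults three times as hard as anon; swappiness 100 = equal IO cost */
	pressure_balance(100, 300, 100, fraction, &denominator);
	printf("anon %llu/%llu, file %llu/%llu\n",
	       fraction[0], denominator, fraction[1], denominator);
	return 0;
}
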
out:
for_each_evictable_lru(lru) {
bool file = is_file_lru(lru);
unsigned long lruvec_size;
- unsigned long low, min;
unsigned long scan;
lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- mem_cgroup_protection(sc->target_mem_cgroup, memcg,
- &min, &low);
-
- if (min || low) {
- /*
- * Scale a cgroup's reclaim pressure by proportioning
- * its current usage to its memory.low or memory.min
- * setting.
- *
- * This is important, as otherwise scanning aggression
- * becomes extremely binary -- from nothing as we
- * approach the memory protection threshold, to totally
- * nominal as we exceed it. This results in requiring
- * setting extremely liberal protection thresholds. It
- * also means we simply get no protection at all if we
- * set it too low, which is not ideal.
- *
- * If there is any protection in place, we reduce scan
- * pressure by how much of the total memory used is
- * within protection thresholds.
- *
- * There is one special case: in the first reclaim pass,
- * we skip over all groups that are within their low
- * protection. If that fails to reclaim enough pages to
- * satisfy the reclaim goal, we come back and override
- * the best-effort low protection. However, we still
- * ideally want to honor how well-behaved groups are in
- * that case instead of simply punishing them all
- * equally. As such, we reclaim them based on how much
- * memory they are using, reducing the scan pressure
- * again by how much of the total memory used is under
- * hard protection.
- */
- unsigned long cgroup_size = mem_cgroup_size(memcg);
- unsigned long protection;
-
- /* memory.low scaling, make sure we retry before OOM */
- if (!sc->memcg_low_reclaim && low > min) {
- protection = low;
- sc->memcg_low_skipped = 1;
- } else {
- protection = min;
- }
-
- /* Avoid TOCTOU with earlier protection check */
- cgroup_size = max(cgroup_size, protection);
-
- scan = lruvec_size - lruvec_size * protection /
- (cgroup_size + 1);
-
- /*
- * Minimally target SWAP_CLUSTER_MAX pages to keep
- * reclaim moving forwards, avoiding decrementing
- * sc->priority further than desirable.
- */
- scan = max(scan, SWAP_CLUSTER_MAX);
- } else {
- scan = lruvec_size;
- }
-
+ scan = apply_proportional_protection(memcg, sc, lruvec_size);
scan >>= sc->priority;
/*
@@ -2576,7 +2672,7 @@ out:
* Anonymous LRU management is a waste if there is
* ultimately no way to reclaim the memory.
*/
-static bool can_age_anon_pages(struct pglist_data *pgdat,
+static bool can_age_anon_pages(struct lruvec *lruvec,
struct scan_control *sc)
{
/* Aging the anon LRU is valuable if swap is present: */
@@ -2584,7 +2680,8 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
return true;
/* Also valuable if anon pages can be demoted: */
- return can_demote(pgdat->node_id, sc);
+ return can_demote(lruvec_pgdat(lruvec)->node_id, sc,
+ lruvec_memcg(lruvec));
}
#ifdef CONFIG_LRU_GEN
@@ -2620,11 +2717,21 @@ static bool should_clear_pmd_young(void)
READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
}
+/* Get the min/max evictable type based on swappiness */
+#define min_type(swappiness) (!(swappiness))
+#define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY)
+
+#define evictable_min_seq(min_seq, swappiness) \
+ min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)])
+
#define for_each_gen_type_zone(gen, type, zone) \
for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+#define for_each_evictable_type(type, swappiness) \
+ for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++)
+
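
Assuming LRU_GEN_ANON is 0, LRU_GEN_FILE is 1 and SWAPPINESS_ANON_ONLY is MAX_SWAPPINESS + 1, the new macros can be exercised on their own to see which types each swappiness value makes evictable:

#include <stdio.h>

#define MAX_SWAPPINESS		200
#define SWAPPINESS_ANON_ONLY	(MAX_SWAPPINESS + 1)

#define min_type(swappiness)	(!(swappiness))
#define max_type(swappiness)	((swappiness) < SWAPPINESS_ANON_ONLY)

#define for_each_evictable_type(type, swappiness) \
	for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++)

int main(void)
{
	/* type 0 = anon, type 1 = file */
	static const char *const names[] = { "anon", "file" };
	int swappiness_values[] = { 0, 60, MAX_SWAPPINESS, SWAPPINESS_ANON_ONLY };
	unsigned int i;
	int type;

	for (i = 0; i < sizeof(swappiness_values) / sizeof(swappiness_values[0]); i++) {
		printf("swappiness=%d:", swappiness_values[i]);
		for_each_evictable_type(type, swappiness_values[i])
			printf(" %s", names[type]);
		printf("\n");
	}
	return 0;
}
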
#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
@@ -2656,7 +2763,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
if (!sc->may_swap)
return 0;
- if (!can_demote(pgdat->node_id, sc) &&
+ if (!can_demote(pgdat->node_id, sc, memcg) &&
mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
return 0;
@@ -2670,10 +2777,16 @@ static int get_nr_gens(struct lruvec *lruvec, int type)
static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
{
- /* see the comment on lru_gen_folio */
- return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
- get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
- get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
+ int type;
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ int n = get_nr_gens(lruvec, type);
+
+ if (n < MIN_NR_GENS || n > MAX_NR_GENS)
+ return false;
+ }
+
+ return true;
}
/******************************************************************************
@@ -3074,16 +3187,20 @@ struct ctrl_pos {
static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
struct ctrl_pos *pos)
{
+ int i;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
- pos->refaulted = lrugen->avg_refaulted[type][tier] +
- atomic_long_read(&lrugen->refaulted[hist][type][tier]);
- pos->total = lrugen->avg_total[type][tier] +
- atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- pos->total += lrugen->protected[hist][type][tier - 1];
pos->gain = gain;
+ pos->refaulted = pos->total = 0;
+
+ for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) {
+ pos->refaulted += lrugen->avg_refaulted[type][i] +
+ atomic_long_read(&lrugen->refaulted[hist][type][i]);
+ pos->total += lrugen->avg_total[type][i] +
+ lrugen->protected[hist][type][i] +
+ atomic_long_read(&lrugen->evicted[hist][type][i]);
+ }
}
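
The new loop bounds fold two callers into one: for a real tier the loop visits only that tier, while passing MAX_NR_TIERS (as get_type_to_scan() does further down) sums every tier. A tiny sketch of just the bounds, assuming MAX_NR_TIERS is 4:

#include <stdio.h>

#define MAX_NR_TIERS 4

static int min_int(int a, int b)
{
	return a < b ? a : b;
}

int main(void)
{
	int tier, i;

	/* tier < MAX_NR_TIERS: a single iteration over that tier */
	/* tier == MAX_NR_TIERS: iterate over all tiers */
	for (tier = 0; tier <= MAX_NR_TIERS; tier++) {
		printf("tier=%d:", tier);
		for (i = tier % MAX_NR_TIERS; i <= min_int(tier, MAX_NR_TIERS - 1); i++)
			printf(" %d", i);
		printf("\n");
	}
	return 0;
}
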
static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
@@ -3109,17 +3226,15 @@ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
sum = lrugen->avg_total[type][tier] +
+ lrugen->protected[hist][type][tier] +
atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- sum += lrugen->protected[hist][type][tier - 1];
WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
}
if (clear) {
atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
- if (tier)
- WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
+ WRITE_ONCE(lrugen->protected[hist][type][tier], 0);
}
}
}
@@ -3146,16 +3261,19 @@ static int folio_update_gen(struct folio *folio, int gen)
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+ /* see the comment on LRU_REFS_FLAGS */
+ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ return -1;
+ }
+
do {
/* lru_gen_del_folio() has isolated this page? */
- if (!(old_flags & LRU_GEN_MASK)) {
- /* for shrink_folio_list() */
- new_flags = old_flags | BIT(PG_referenced);
- continue;
- }
+ if (!(old_flags & LRU_GEN_MASK))
+ return -1;
- new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
- new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
+ new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset);
} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
@@ -3179,7 +3297,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
new_gen = (old_gen + 1) % MAX_NR_GENS;
- new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
/* for folio_end_writeback() */
if (reclaiming)
@@ -3254,7 +3372,7 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal
return true;
if (vma_is_anonymous(vma))
- return !walk->can_swap;
+ return !walk->swappiness;
if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
return true;
@@ -3264,7 +3382,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal
return true;
if (shmem_mapping(mapping))
- return !walk->can_swap;
+ return !walk->swappiness;
+
+ if (walk->swappiness > MAX_SWAPPINESS)
+ return true;
/* to exclude special mappings like dax, etc. */
return !mapping->a_ops->read_folio;
@@ -3311,7 +3432,7 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned
if (!pte_present(pte) || is_zero_pfn(pfn))
return -1;
- if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
+ if (WARN_ON_ONCE(pte_special(pte)))
return -1;
if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm))
@@ -3336,9 +3457,6 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
return -1;
- if (WARN_ON_ONCE(pmd_devmap(pmd)))
- return -1;
-
if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm))
return -1;
@@ -3352,19 +3470,17 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
}
static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
- struct pglist_data *pgdat, bool can_swap)
+ struct pglist_data *pgdat)
{
- struct folio *folio;
+ struct folio *folio = pfn_folio(pfn);
- folio = pfn_folio(pfn);
- if (folio_nid(folio) != pgdat->node_id)
+ if (folio_lru_gen(folio) < 0)
return NULL;
- if (folio_memcg(folio) != memcg)
+ if (folio_nid(folio) != pgdat->node_id)
return NULL;
- /* file VMAs can contain anon pages from COW */
- if (!folio_is_file_lru(folio) && !can_swap)
+ if (folio_memcg(folio) != memcg)
return NULL;
return folio;
@@ -3378,29 +3494,55 @@ static bool suitable_to_scan(int total, int young)
return young * n >= total;
}
+static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio,
+ int new_gen, bool dirty)
+{
+ int old_gen;
+
+ if (!folio)
+ return;
+
+ if (dirty && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ if (walk) {
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen)
+ update_batch_size(walk, folio, old_gen, new_gen);
+ } else if (lru_gen_set_refs(folio)) {
+ old_gen = folio_lru_gen(folio);
+ if (old_gen >= 0 && old_gen != new_gen)
+ folio_activate(folio);
+ }
+}
+
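
walk_update_folio() lets the page-table walkers below batch their per-entry work: each loop remembers the last folio seen and only flushes the accumulated dirty/generation update when the folio changes or the loop ends. A minimal sketch of that flush-on-change pattern, with ints standing in for folios:

#include <stdio.h>

/* Stand-in for walk_update_folio(): one update per distinct folio. */
static void flush_update(int folio, int dirty)
{
	if (folio < 0)
		return;
	printf("update folio %d%s\n", folio, dirty ? " (dirty)" : "");
}

int main(void)
{
	/* consecutive PTEs often map the same (large) folio */
	int pte_folio[] = { 7, 7, 7, 9, 9, 4 };
	int pte_dirty[] = { 0, 1, 0, 0, 0, 1 };
	int last = -1, dirty = 0;
	unsigned int i;

	for (i = 0; i < sizeof(pte_folio) / sizeof(pte_folio[0]); i++) {
		if (last != pte_folio[i]) {
			flush_update(last, dirty);	/* flush the previous folio */
			last = pte_folio[i];
			dirty = 0;
		}
		if (pte_dirty[i])
			dirty = 1;
	}
	flush_update(last, dirty);			/* flush the final folio */
	return 0;
}
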
static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
struct mm_walk *args)
{
int i;
+ bool dirty;
pte_t *pte;
spinlock_t *ptl;
unsigned long addr;
int total = 0;
int young = 0;
+ struct folio *last = NULL;
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
DEFINE_MAX_SEQ(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
pmd_t pmdval;
- pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval,
- &ptl);
+ pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl);
if (!pte)
return false;
+
if (!spin_trylock(ptl)) {
pte_unmap(pte);
- return false;
+ return true;
}
if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
@@ -3422,26 +3564,30 @@ restart:
if (pfn == -1)
continue;
- folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
continue;
if (!ptep_clear_young_notify(args->vma, addr, pte + i))
continue;
- young++;
- walk->mm_stats[MM_LEAF_YOUNG]++;
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
+
+ last = folio;
+ dirty = false;
+ }
- if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
+ if (pte_dirty(ptent))
+ dirty = true;
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ young++;
+ walk->mm_stats[MM_LEAF_YOUNG]++;
}
+ walk_update_folio(walk, last, gen, dirty);
+ last = NULL;
+
if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
goto restart;
@@ -3455,13 +3601,15 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
{
int i;
+ bool dirty;
pmd_t *pmd;
spinlock_t *ptl;
+ struct folio *last = NULL;
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
DEFINE_MAX_SEQ(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
VM_WARN_ON_ONCE(pud_leaf(*pud));
@@ -3507,27 +3655,30 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
if (pfn == -1)
goto next;
- folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
goto next;
if (!pmdp_clear_young_notify(vma, addr, pmd + i))
goto next;
- walk->mm_stats[MM_LEAF_YOUNG]++;
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
- if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
+ last = folio;
+ dirty = false;
+ }
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ if (pmd_dirty(pmd[i]))
+ dirty = true;
+
+ walk->mm_stats[MM_LEAF_YOUNG]++;
next:
i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
} while (i <= MIN_LRU_BATCH);
+ walk_update_folio(walk, last, gen, dirty);
+
arch_leave_lazy_mmu_mode();
spin_unlock(ptl);
done:
@@ -3719,22 +3870,30 @@ static void clear_mm_walk(void)
kfree(walk);
}
-static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
+static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness)
{
int zone;
int remaining = MAX_LRU_BATCH;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
- if (type == LRU_GEN_ANON && !can_swap)
+ /* File is not evictable when swappiness is anon only; skip the inversion check */
+ if (type && (swappiness == SWAPPINESS_ANON_ONLY))
+ goto done;
+
+ /* Anon is not evictable when swappiness is zero (file only); skip the inversion check */
+ if (!type && !swappiness)
goto done;
- /* prevent cold/hot inversion if force_scan is true */
+ /* prevent cold/hot inversion if the type is evictable */
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
struct list_head *head = &lrugen->folios[old_gen][type][zone];
while (!list_empty(head)) {
struct folio *folio = lru_to_folio(head);
+ int refs = folio_lru_refs(folio);
+ bool workingset = folio_test_workingset(folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
@@ -3744,6 +3903,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
new_gen = folio_inc_gen(lruvec, folio, false);
list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
+ /* don't count the workingset being lazily promoted */
+ if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) {
+ int tier = lru_tier_from_refs(refs, workingset);
+ int delta = folio_nr_pages(folio);
+
+ WRITE_ONCE(lrugen->protected[hist][type][tier],
+ lrugen->protected[hist][type][tier] + delta);
+ }
+
if (!--remaining)
return false;
}
@@ -3755,17 +3923,18 @@ done:
return true;
}
-static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
+static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
{
int gen, type, zone;
bool success = false;
+ bool seq_inc_flag = false;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
/* find the oldest populated generation */
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
gen = lru_gen_from_seq(min_seq[type]);
@@ -3775,19 +3944,32 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
}
min_seq[type]++;
+ seq_inc_flag = true;
}
next:
;
}
+ /*
+ * If neither the anonymous nor the file min_seq[type] was increased,
+ * return false right away to avoid the unnecessary checks below.
+ */
+ if (!seq_inc_flag)
+ return success;
+
/* see the comment on lru_gen_folio */
- if (can_swap) {
- min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
- min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
+ if (swappiness && swappiness <= MAX_SWAPPINESS) {
+ unsigned long seq = lrugen->max_seq - MIN_NR_GENS;
+
+ if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq)
+ min_seq[LRU_GEN_ANON] = seq;
+ else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq)
+ min_seq[LRU_GEN_FILE] = seq;
}
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
- if (min_seq[type] == lrugen->min_seq[type])
+ for_each_evictable_type(type, swappiness) {
+ if (min_seq[type] <= lrugen->min_seq[type])
continue;
reset_ctrl_pos(lruvec, type, true);
@@ -3798,8 +3980,7 @@ next:
return success;
}
-static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness)
{
bool success;
int prev, next;
@@ -3817,13 +3998,11 @@ restart:
if (!success)
goto unlock;
- for (type = ANON_AND_FILE - 1; type >= 0; type--) {
+ for (type = 0; type < ANON_AND_FILE; type++) {
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
continue;
- VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
-
- if (inc_min_seq(lruvec, type, can_swap))
+ if (inc_min_seq(lruvec, type, swappiness))
continue;
spin_unlock_irq(&lruvec->lru_lock);
@@ -3867,7 +4046,7 @@ unlock:
}
static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+ int swappiness, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -3878,7 +4057,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq));
if (!mm_state)
- return inc_max_seq(lruvec, seq, can_swap, force_scan);
+ return inc_max_seq(lruvec, seq, swappiness);
/* see the comment in iterate_mm_list() */
if (seq <= READ_ONCE(mm_state->seq))
@@ -3903,7 +4082,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
walk->lruvec = lruvec;
walk->seq = seq;
- walk->can_swap = can_swap;
+ walk->swappiness = swappiness;
walk->force_scan = force_scan;
do {
@@ -3913,7 +4092,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
} while (mm);
done:
if (success) {
- success = inc_max_seq(lruvec, seq, can_swap, force_scan);
+ success = inc_max_seq(lruvec, seq, swappiness);
WARN_ON_ONCE(!success);
}
@@ -3954,13 +4133,13 @@ static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
int gen, type, zone;
unsigned long total = 0;
- bool can_swap = get_swappiness(lruvec, sc);
+ int swappiness = get_swappiness(lruvec, sc);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
unsigned long seq;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
@@ -3980,6 +4159,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
{
int gen;
unsigned long birth;
+ int swappiness = get_swappiness(lruvec, sc);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);
@@ -3989,8 +4169,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
if (!lruvec_is_sizable(lruvec, sc))
return false;
- /* see the comment on lru_gen_folio */
- gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness));
birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
return time_is_before_jiffies(birth + min_ttl);
@@ -4049,21 +4228,22 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
int i;
+ bool dirty;
unsigned long start;
unsigned long end;
struct lru_gen_mm_walk *walk;
+ struct folio *last = NULL;
int young = 1;
pte_t *pte = pvmw->pte;
unsigned long addr = pvmw->address;
struct vm_area_struct *vma = pvmw->vma;
struct folio *folio = pfn_folio(pvmw->pfn);
- bool can_swap = !folio_is_file_lru(folio);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
DEFINE_MAX_SEQ(lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
lockdep_assert_held(pvmw->ptl);
VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
@@ -4110,37 +4290,28 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
if (pfn == -1)
continue;
- folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
continue;
if (!ptep_clear_young_notify(vma, addr, pte + i))
continue;
- young++;
-
- if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
- if (walk) {
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
-
- continue;
+ last = folio;
+ dirty = false;
}
- old_gen = folio_lru_gen(folio);
- if (old_gen < 0)
- folio_set_referenced(folio);
- else if (old_gen != new_gen) {
- folio_clear_lru_refs(folio);
- folio_activate(folio);
- }
+ if (pte_dirty(ptent))
+ dirty = true;
+
+ young++;
}
+ walk_update_folio(walk, last, gen, dirty);
+
arch_leave_lazy_mmu_mode();
/* feedback from rmap walkers to page table walkers */
@@ -4298,7 +4469,8 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
int zone = folio_zonenum(folio);
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
- int tier = lru_tier_from_refs(refs);
+ bool workingset = folio_test_workingset(folio);
+ int tier = lru_tier_from_refs(refs, workingset);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
@@ -4320,19 +4492,22 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* protected */
- if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
- int hist = lru_hist_from_seq(lrugen->min_seq[type]);
-
+ if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) {
gen = folio_inc_gen(lruvec, folio, false);
- list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
- WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
- lrugen->protected[hist][type][tier - 1] + delta);
+ /* don't count the workingset being lazily promoted */
+ if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) {
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
+
+ WRITE_ONCE(lrugen->protected[hist][type][tier],
+ lrugen->protected[hist][type][tier] + delta);
+ }
return true;
}
/* ineligible */
- if (!folio_test_lru(folio) || zone > sc->reclaim_idx) {
+ if (zone > sc->reclaim_idx) {
gen = folio_inc_gen(lruvec, folio, false);
list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
@@ -4347,8 +4522,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* waiting for writeback */
- if (folio_test_locked(folio) || writeback ||
- (type == LRU_GEN_FILE && dirty)) {
+ if (writeback || (type == LRU_GEN_FILE && dirty)) {
gen = folio_inc_gen(lruvec, folio, true);
list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
@@ -4377,13 +4551,12 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
return false;
}
- /* see the comment on MAX_NR_TIERS */
+ /* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio))
- folio_clear_lru_refs(folio);
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, 0);
/* for shrink_folio_list() */
folio_clear_reclaim(folio);
- folio_clear_referenced(folio);
success = lru_gen_del_folio(lruvec, folio, true);
VM_WARN_ON_ONCE_FOLIO(!success, folio);
@@ -4391,8 +4564,9 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
return true;
}
-static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
- int type, int tier, struct list_head *list)
+static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int type, int tier,
+ struct list_head *list)
{
int i;
int gen;
@@ -4401,7 +4575,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
int scanned = 0;
int isolated = 0;
int skipped = 0;
- int remaining = MAX_LRU_BATCH;
+ int remaining = min(nr_to_scan, MAX_LRU_BATCH);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -4453,13 +4627,13 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
break;
}
- item = PGSCAN_KSWAPD + reclaimer_offset();
+ item = PGSCAN_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc)) {
__count_vm_events(item, isolated);
__count_vm_events(PGREFILL, sorted);
}
- __count_memcg_events(memcg, item, isolated);
- __count_memcg_events(memcg, PGREFILL, sorted);
+ count_memcg_events(memcg, item, isolated);
+ count_memcg_events(memcg, PGREFILL, sorted);
__count_vm_events(PGSCAN_ANON + type, isolated);
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
scanned, skipped, isolated,
@@ -4479,13 +4653,13 @@ static int get_tier_idx(struct lruvec *lruvec, int type)
struct ctrl_pos sp, pv;
/*
- * To leave a margin for fluctuations, use a larger gain factor (1:2).
+ * To leave a margin for fluctuations, use a larger gain factor (2:3).
* This value is chosen because any other tier would have at least twice
* as many refaults as the first tier.
*/
- read_ctrl_pos(lruvec, type, 0, 1, &sp);
+ read_ctrl_pos(lruvec, type, 0, 2, &sp);
for (tier = 1; tier < MAX_NR_TIERS; tier++) {
- read_ctrl_pos(lruvec, type, tier, 2, &pv);
+ read_ctrl_pos(lruvec, type, tier, 3, &pv);
if (!positive_ctrl_err(&sp, &pv))
break;
}
@@ -4493,82 +4667,50 @@ static int get_tier_idx(struct lruvec *lruvec, int type)
return tier - 1;
}
-static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
+static int get_type_to_scan(struct lruvec *lruvec, int swappiness)
{
- int type, tier;
struct ctrl_pos sp, pv;
- int gain[ANON_AND_FILE] = { swappiness, MAX_SWAPPINESS - swappiness };
+ if (swappiness <= MIN_SWAPPINESS + 1)
+ return LRU_GEN_FILE;
+
+ if (swappiness >= MAX_SWAPPINESS)
+ return LRU_GEN_ANON;
/*
- * Compare the first tier of anon with that of file to determine which
- * type to scan. Also need to compare other tiers of the selected type
- * with the first tier of the other type to determine the last tier (of
- * the selected type) to evict.
+ * Compare the sum of all tiers of anon with that of file to determine
+ * which type to scan.
*/
- read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
- read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
- type = positive_ctrl_err(&sp, &pv);
+ read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp);
+ read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv);
- read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
- for (tier = 1; tier < MAX_NR_TIERS; tier++) {
- read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
- if (!positive_ctrl_err(&sp, &pv))
- break;
- }
-
- *tier_idx = tier - 1;
-
- return type;
+ return positive_ctrl_err(&sp, &pv);
}
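
With the per-tier comparison moved into get_tier_idx(), type selection reduces to: file at the very bottom of the swappiness range, anon at the top, and otherwise whichever type has the lower gain-weighted refault feedback. A simplified sketch of that decision; the refault comparison here is only a rough proxy for positive_ctrl_err(), and MAX_SWAPPINESS is assumed to be 200:

#include <stdio.h>

#define MIN_SWAPPINESS	0
#define MAX_SWAPPINESS	200

enum { LRU_GEN_ANON, LRU_GEN_FILE };

/*
 * Rough stand-in for the read_ctrl_pos()/positive_ctrl_err() feedback:
 * prefer the type whose gain-weighted refault count is lower (the real
 * comparison also factors in the eviction totals).
 */
static int file_is_cheaper(unsigned long anon_refaulted, unsigned long file_refaulted,
			   int swappiness)
{
	return file_refaulted * (unsigned long)swappiness <=
	       anon_refaulted * (unsigned long)(MAX_SWAPPINESS - swappiness);
}

static int type_to_scan(unsigned long anon_refaulted, unsigned long file_refaulted,
			int swappiness)
{
	if (swappiness <= MIN_SWAPPINESS + 1)
		return LRU_GEN_FILE;
	if (swappiness >= MAX_SWAPPINESS)
		return LRU_GEN_ANON;

	return file_is_cheaper(anon_refaulted, file_refaulted, swappiness) ?
	       LRU_GEN_FILE : LRU_GEN_ANON;
}

int main(void)
{
	static const char *const names[] = { "anon", "file" };

	printf("%s %s %s\n",
	       names[type_to_scan(1000, 10, 60)],	/* anon thrashing -> file */
	       names[type_to_scan(10, 1000, 60)],	/* file thrashing -> anon */
	       names[type_to_scan(10, 1000, 1)]);	/* swappiness floor -> file */
	return 0;
}
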
-static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int swappiness,
int *type_scanned, struct list_head *list)
{
int i;
- int type;
- int scanned;
- int tier = -1;
- DEFINE_MIN_SEQ(lruvec);
+ int type = get_type_to_scan(lruvec, swappiness);
- /*
- * Try to make the obvious choice first, and if anon and file are both
- * available from the same generation,
- * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon
- * first.
- * 2. If !__GFP_IO, file first since clean pagecache is more likely to
- * exist than clean swapcache.
- */
- if (!swappiness)
- type = LRU_GEN_FILE;
- else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
- type = LRU_GEN_ANON;
- else if (swappiness == 1)
- type = LRU_GEN_FILE;
- else if (swappiness == MAX_SWAPPINESS)
- type = LRU_GEN_ANON;
- else if (!(sc->gfp_mask & __GFP_IO))
- type = LRU_GEN_FILE;
- else
- type = get_type_to_scan(lruvec, swappiness, &tier);
+ for_each_evictable_type(i, swappiness) {
+ int scanned;
+ int tier = get_tier_idx(lruvec, type);
- for (i = !swappiness; i < ANON_AND_FILE; i++) {
- if (tier < 0)
- tier = get_tier_idx(lruvec, type);
+ *type_scanned = type;
- scanned = scan_folios(lruvec, sc, type, tier, list);
+ scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list);
if (scanned)
- break;
+ return scanned;
type = !type;
- tier = -1;
}
- *type_scanned = type;
-
- return scanned;
+ return 0;
}
-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int swappiness)
{
int type;
int scanned;
@@ -4581,16 +4723,17 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
struct reclaim_stat stat;
struct lru_gen_mm_walk *walk;
bool skip_retry = false;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
spin_lock_irq(&lruvec->lru_lock);
- scanned = isolate_folios(lruvec, sc, swappiness, &type, &list);
+ scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list);
scanned += try_to_inc_min_seq(lruvec, swappiness);
- if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
+ if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq)
scanned = 0;
spin_unlock_irq(&lruvec->lru_lock);
@@ -4598,7 +4741,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
if (list_empty(&list))
return scanned;
retry:
- reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+ reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg);
sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
sc->nr_reclaimed += reclaimed;
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
@@ -4606,31 +4749,24 @@ retry:
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
list_for_each_entry_safe_reverse(folio, next, &list, lru) {
+ DEFINE_MIN_SEQ(lruvec);
+
if (!folio_evictable(folio)) {
list_del(&folio->lru);
folio_putback_lru(folio);
continue;
}
- if (folio_test_reclaim(folio) &&
- (folio_test_dirty(folio) || folio_test_writeback(folio))) {
- /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
- if (folio_test_workingset(folio))
- folio_set_referenced(folio);
- continue;
- }
-
- if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
- folio_mapped(folio) || folio_test_locked(folio) ||
- folio_test_dirty(folio) || folio_test_writeback(folio)) {
- /* don't add rejected folios to the oldest generation */
- set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
- BIT(PG_active));
+ /* retry folios that may have missed folio_rotate_reclaimable() */
+ if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) &&
+ !folio_test_dirty(folio) && !folio_test_writeback(folio)) {
+ list_move(&folio->lru, &clean);
continue;
}
- /* retry folios that may have missed folio_rotate_reclaimable() */
- list_move(&folio->lru, &clean);
+ /* don't add rejected folios to the oldest generation */
+ if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
+ set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
}
spin_lock_irq(&lruvec->lru_lock);
@@ -4643,13 +4779,13 @@ retry:
reset_batch_size(walk);
}
- __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
+ __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
stat.nr_demoted);
- item = PGSTEAL_KSWAPD + reclaimer_offset();
+ item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, reclaimed);
- __count_memcg_events(memcg, item, reclaimed);
+ count_memcg_events(memcg, item, reclaimed);
__count_vm_events(PGSTEAL_ANON + type, reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
@@ -4665,63 +4801,32 @@ retry:
}
static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
- bool can_swap, unsigned long *nr_to_scan)
+ int swappiness, unsigned long *nr_to_scan)
{
int gen, type, zone;
- unsigned long old = 0;
- unsigned long young = 0;
- unsigned long total = 0;
+ unsigned long size = 0;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
- /* whether this lruvec is completely out of cold folios */
- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
- *nr_to_scan = 0;
+ *nr_to_scan = 0;
+ /* have to run aging, since eviction is not possible anymore */
+ if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq)
return true;
- }
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
unsigned long seq;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
- unsigned long size = 0;
-
gen = lru_gen_from_seq(seq);
for (zone = 0; zone < MAX_NR_ZONES; zone++)
size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
-
- total += size;
- if (seq == max_seq)
- young += size;
- else if (seq + MIN_NR_GENS == max_seq)
- old += size;
}
}
- *nr_to_scan = total;
-
- /*
- * The aging tries to be lazy to reduce the overhead, while the eviction
- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
- * ideal number of generations is MIN_NR_GENS+1.
- */
- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
- return false;
-
- /*
- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
- * of the total number of pages for each generation. A reasonable range
- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
- * aging cares about the upper bound of hot pages, while the eviction
- * cares about the lower bound of cold pages.
- */
- if (young * MIN_NR_GENS > total)
- return true;
- if (old * (MIN_NR_GENS + 2) < total)
- return true;
-
- return false;
+ *nr_to_scan = size;
+ /* better to run aging even though eviction is still possible */
+ return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq;
}
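
The rewritten helper reduces the aging decision to comparing the oldest evictable generation against max_seq: aging is mandatory once fewer than MIN_NR_GENS generations are evictable, and merely preferable when exactly MIN_NR_GENS remain. A sketch of the two thresholds, with MIN_NR_GENS taken as 2:

#include <stdio.h>

#define MIN_NR_GENS 2

/*
 * Returns whether aging should run; *nr_evictable is left at zero when
 * eviction is no longer possible, which is the mandatory-aging case.
 */
static int aging_needed(unsigned long evictable_min_seq, unsigned long max_seq,
			unsigned long lru_size, unsigned long *nr_evictable)
{
	*nr_evictable = 0;

	/* have to age: fewer than MIN_NR_GENS generations left to evict from */
	if (evictable_min_seq + MIN_NR_GENS > max_seq)
		return 1;

	*nr_evictable = lru_size;

	/* better to age: exactly the minimum number of generations remain */
	return evictable_min_seq + MIN_NR_GENS == max_seq;
}

int main(void)
{
	unsigned long min_seq, nr;

	for (min_seq = 7; min_seq <= 9; min_seq++) {
		int age = aging_needed(min_seq, 10, 1000, &nr);

		printf("min_seq=%lu: age=%d evictable=%lu\n", min_seq, age, nr);
	}
	return 0;
}
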
/*
@@ -4729,7 +4834,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
* reclaim.
*/
-static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
{
bool success;
unsigned long nr_to_scan;
@@ -4739,18 +4844,20 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
return -1;
- success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan);
+ success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan);
/* try to scrape all its memory if this memcg was deleted */
if (nr_to_scan && !mem_cgroup_online(memcg))
return nr_to_scan;
+ nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan);
+
/* try to get away with not aging at the default priority */
if (!success || sc->priority == DEF_PRIORITY)
return nr_to_scan >> sc->priority;
/* stop scanning this lruvec as it's low on cold folios */
- return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0;
+ return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0;
}
static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
@@ -4797,7 +4904,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
if (nr_to_scan <= 0)
break;
- delta = evict_folios(lruvec, sc, swappiness);
+ delta = evict_folios(nr_to_scan, lruvec, sc, swappiness);
if (!delta)
break;
@@ -5294,8 +5401,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
s = "rep";
n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier]);
}
for (i = 0; i < 3; i++)
@@ -5329,7 +5435,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
static int lru_gen_seq_show(struct seq_file *m, void *v)
{
unsigned long seq;
- bool full = !debugfs_real_fops(m->file)->write;
+ bool full = debugfs_get_aux_num(m->file);
struct lruvec *lruvec = v;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int nid = lruvec_pgdat(lruvec)->node_id;
@@ -5350,7 +5456,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v)
seq_printf(m, " node %5d\n", nid);
if (!full)
- seq = min_seq[LRU_GEN_ANON];
+ seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2);
else if (max_seq >= MAX_NR_GENS)
seq = max_seq - MAX_NR_GENS + 1;
else
@@ -5390,23 +5496,14 @@ static const struct seq_operations lru_gen_seq_ops = {
};
static int run_aging(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+ int swappiness, bool force_scan)
{
DEFINE_MAX_SEQ(lruvec);
- DEFINE_MIN_SEQ(lruvec);
-
- if (seq < max_seq)
- return 0;
if (seq > max_seq)
return -EINVAL;
- if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
- return -ERANGE;
-
- try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan);
-
- return 0;
+ return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 0 : -EEXIST;
}
static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
@@ -5422,13 +5519,14 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co
while (!signal_pending(current)) {
DEFINE_MIN_SEQ(lruvec);
- if (seq < min_seq[!swappiness])
+ if (seq < evictable_min_seq(min_seq, swappiness))
return 0;
if (sc->nr_reclaimed >= nr_to_reclaim)
return 0;
- if (!evict_folios(lruvec, sc, swappiness))
+ if (!evict_folios(nr_to_reclaim - sc->nr_reclaimed, lruvec, sc,
+ swappiness))
return 0;
cond_resched();
@@ -5467,7 +5565,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (swappiness < MIN_SWAPPINESS)
swappiness = get_swappiness(lruvec, sc);
- else if (swappiness > MAX_SWAPPINESS)
+ else if (swappiness > SWAPPINESS_ANON_ONLY)
goto done;
switch (cmd) {
@@ -5524,24 +5622,35 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
while ((cur = strsep(&next, ",;\n"))) {
int n;
int end;
- char cmd;
+ char cmd, swap_string[5];
unsigned int memcg_id;
unsigned int nid;
unsigned long seq;
- unsigned int swappiness = -1;
+ unsigned int swappiness;
unsigned long opt = -1;
cur = skip_spaces(cur);
if (!*cur)
continue;
- n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
- &seq, &end, &swappiness, &end, &opt, &end);
+ n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, swap_string, &end, &opt, &end);
if (n < 4 || cur[end]) {
err = -EINVAL;
break;
}
+ if (n == 4) {
+ swappiness = -1;
+ } else if (!strcmp("max", swap_string)) {
+ /* set by userspace for anonymous memory only */
+ swappiness = SWAPPINESS_ANON_ONLY;
+ } else {
+ err = kstrtouint(swap_string, 0, &swappiness);
+ if (err)
+ break;
+ }
+
err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
if (err)
break;
@@ -5663,8 +5772,10 @@ static int __init init_lru_gen(void)
if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
pr_err("lru_gen: failed to create sysfs group\n");
- debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
- debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+ debugfs_create_file_aux_num("lru_gen", 0644, NULL, NULL, false,
+ &lru_gen_rw_fops);
+ debugfs_create_file_aux_num("lru_gen_full", 0444, NULL, NULL, true,
+ &lru_gen_ro_fops);
return 0;
};
@@ -5801,7 +5912,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+ if (can_age_anon_pages(lruvec, sc) &&
inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -5832,6 +5943,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
unsigned long pages_for_compaction;
unsigned long inactive_lru_pages;
int z;
+ struct zone *zone;
/* If not in reclaim/compaction mode, stop */
if (!in_reclaim_compaction(sc))
@@ -5851,17 +5963,16 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
return false;
/* If compaction would go ahead or the allocation would succeed, stop */
- for (z = 0; z <= sc->reclaim_idx; z++) {
- struct zone *zone = &pgdat->node_zones[z];
- if (!managed_zone(zone))
- continue;
+ for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
+ unsigned long watermark = min_wmark_pages(zone);
/* Allocation can already succeed, nothing to do */
- if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
+ if (zone_watermark_ok(zone, sc->order, watermark,
sc->reclaim_idx, 0))
return false;
- if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
+ if (compaction_suitable(zone, sc->order, watermark,
+ sc->reclaim_idx))
return false;
}
@@ -6088,22 +6199,21 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
sc->reclaim_idx, 0))
return true;
- /* Compaction cannot yet proceed. Do reclaim. */
- if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
- return false;
-
/*
- * Compaction is already possible, but it takes time to run and there
- * are potentially other callers using the pages just freed. So proceed
- * with reclaim to make a buffer of free pages available to give
- * compaction a reasonable chance of completing and allocating the page.
+ * Direct reclaim usually targets the min watermark, but compaction
+ * takes time to run and there are potentially other callers using the
+ * pages just freed. So target a higher buffer to give compaction a
+ * reasonable chance of completing and allocating the pages.
+ *
* Note that we won't actually reclaim the whole buffer in one attempt
* as the target watermark in should_continue_reclaim() is lower. But if
* we are already above the high+gap watermark, don't reclaim at all.
*/
- watermark = high_wmark_pages(zone) + compact_gap(sc->order);
+ watermark = high_wmark_pages(zone);
+ if (compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx))
+ return true;
- return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
+ return false;
}
static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
@@ -6382,11 +6492,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
return true;
- for (i = 0; i <= ZONE_NORMAL; i++) {
- zone = &pgdat->node_zones[i];
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
if (!zone_reclaimable_pages(zone))
continue;
@@ -6625,6 +6731,15 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
return nr_reclaimed;
}
+#else
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+ unsigned long nr_pages,
+ gfp_t gfp_mask,
+ unsigned int reclaim_options,
+ int *swappiness)
+{
+ return 0;
+}
#endif
static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
@@ -6637,10 +6752,10 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
return;
}
- if (!can_age_anon_pages(pgdat, sc))
+ lruvec = mem_cgroup_lruvec(NULL, pgdat);
+ if (!can_age_anon_pages(lruvec, sc))
return;
- lruvec = mem_cgroup_lruvec(NULL, pgdat);
if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
return;
@@ -6691,17 +6806,48 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
* Check watermarks bottom-up as lower zones are more likely to
* meet watermarks.
*/
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
+ enum zone_stat_item item;
+ unsigned long free_pages;
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
mark = promo_wmark_pages(zone);
else
mark = high_wmark_pages(zone);
- if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
+
+ /*
+ * In defrag_mode, watermarks must be met in whole
+ * blocks to avoid polluting allocator fallbacks.
+ *
+ * However, kswapd usually cannot accomplish this on
+ * its own and needs kcompactd support. Once it's
+ * reclaimed a compaction gap, and kswapd_shrink_node
+ * has dropped order, simply ensure there are enough
+ * base pages for compaction, wake kcompactd & sleep.
+ */
+ if (defrag_mode && order)
+ item = NR_FREE_PAGES_BLOCKS;
+ else
+ item = NR_FREE_PAGES;
+
+ /*
+ * When there is a high number of CPUs in the system,
+ * the cumulative error from the vmstat per-cpu cache
+ * can blur the line between the watermarks. In that
+ * case, be safe and get an accurate snapshot.
+ *
+ * TODO: NR_FREE_PAGES_BLOCKS moves in steps of
+ * pageblock_nr_pages, while the vmstat pcp threshold
+ * is limited to 125. On many configurations that
+ * counter won't actually be per-cpu cached. But keep
+ * things simple for now; revisit when somebody cares.
+ */
+ free_pages = zone_page_state(zone, item);
+ if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(zone, item);
+
+ if (__zone_watermark_ok(zone, order, mark, highest_zoneidx,
+ 0, free_pages))
return true;
}
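
The free-page read above follows the usual drift-aware pattern: use the cheap per-cpu-cached counter, and only pay for the exact snapshot when the cached value is already within the drift margin of the watermark. A small sketch of that fallback with made-up numbers:

#include <stdio.h>

/* Cheap, possibly stale per-cpu-cached reading. */
static unsigned long zone_page_state_cached(void)
{
	return 1200;
}

/* Exact but expensive: sums every CPU's pending deltas. */
static unsigned long zone_page_state_snapshot(void)
{
	return 1100;
}

static int watermark_ok(unsigned long mark, unsigned long drift_mark)
{
	unsigned long free_pages = zone_page_state_cached();

	/*
	 * Only take the accurate snapshot when the cached value is close
	 * enough to the watermark that per-cpu error could flip the result.
	 */
	if (drift_mark && free_pages < drift_mark)
		free_pages = zone_page_state_snapshot();

	return free_pages >= mark;
}

int main(void)
{
	/* watermark 1150, drift mark 1500: the cached 1200 is not trusted */
	printf("balanced: %d\n", watermark_ok(1150, 1500));
	return 0;
}
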
@@ -6781,11 +6927,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
/* Reclaim a number of pages proportional to the number of zones */
sc->nr_to_reclaim = 0;
- for (z = 0; z <= sc->reclaim_idx; z++) {
- zone = pgdat->node_zones + z;
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
}
@@ -6816,12 +6958,7 @@ update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
int i;
struct zone *zone;
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
if (active)
set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
else
@@ -6882,11 +7019,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
* stall or direct reclaim until kswapd is finished.
*/
nr_boost_reclaim = 0;
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
nr_boost_reclaim += zone->watermark_boost;
zone_boosts[i] = zone->watermark_boost;
}
@@ -7193,10 +7326,6 @@ static int kswapd(void *p)
unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t *)p;
struct task_struct *tsk = current;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
-
- if (!cpumask_empty(cpumask))
- set_cpus_allowed_ptr(tsk, cpumask);
/*
* Tell the memory management that we're a "memory allocator",
@@ -7365,13 +7494,15 @@ void __meminit kswapd_run(int nid)
pgdat_kswapd_lock(pgdat);
if (!pgdat->kswapd) {
- pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+ pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
pr_err("Failed to start kswapd on node %d,ret=%ld\n",
nid, PTR_ERR(pgdat->kswapd));
BUG_ON(system_state < SYSTEM_RUNNING);
pgdat->kswapd = NULL;
+ } else {
+ wake_up_process(pgdat->kswapd);
}
}
pgdat_kswapd_unlock(pgdat);
@@ -7395,6 +7526,28 @@ void __meminit kswapd_stop(int nid)
pgdat_kswapd_unlock(pgdat);
}
+static const struct ctl_table vmscan_sysctl_table[] = {
+ {
+ .procname = "swappiness",
+ .data = &vm_swappiness,
+ .maxlen = sizeof(vm_swappiness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO_HUNDRED,
+ },
+#ifdef CONFIG_NUMA
+ {
+ .procname = "zone_reclaim_mode",
+ .data = &node_reclaim_mode,
+ .maxlen = sizeof(node_reclaim_mode),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ }
+#endif
+};
+
static int __init kswapd_init(void)
{
int nid;
@@ -7402,6 +7555,7 @@ static int __init kswapd_init(void)
swap_setup();
for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
+ register_sysctl_init("vm", vmscan_sysctl_table);
return 0;
}
@@ -7480,36 +7634,26 @@ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
/*
* Try to free up some pages from this node through reclaim.
*/
-static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
+static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask,
+ unsigned long nr_pages,
+ struct scan_control *sc)
{
- /* Minimum pages needed in order to stay on node */
- const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
unsigned int noreclaim_flag;
- struct scan_control sc = {
- .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = current_gfp_context(gfp_mask),
- .order = order,
- .priority = NODE_RECLAIM_PRIORITY,
- .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
- .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
- .may_swap = 1,
- .reclaim_idx = gfp_zone(gfp_mask),
- };
unsigned long pflags;
- trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
- sc.gfp_mask);
+ trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, sc->order,
+ sc->gfp_mask);
cond_resched();
psi_memstall_enter(&pflags);
delayacct_freepages_start();
- fs_reclaim_acquire(sc.gfp_mask);
+ fs_reclaim_acquire(sc->gfp_mask);
/*
* We need to be able to allocate from the reserves for RECLAIM_UNMAP
*/
noreclaim_flag = memalloc_noreclaim_save();
- set_task_reclaim_state(p, &sc.reclaim_state);
+ set_task_reclaim_state(p, &sc->reclaim_state);
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
@@ -7518,24 +7662,36 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
* priorities until we have enough memory freed.
*/
do {
- shrink_node(pgdat, &sc);
- } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
+ shrink_node(pgdat, sc);
+ } while (sc->nr_reclaimed < nr_pages && --sc->priority >= 0);
}
set_task_reclaim_state(p, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
- fs_reclaim_release(sc.gfp_mask);
- psi_memstall_leave(&pflags);
+ fs_reclaim_release(sc->gfp_mask);
delayacct_freepages_end();
+ psi_memstall_leave(&pflags);
- trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
+ trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed);
- return sc.nr_reclaimed >= nr_pages;
+ return sc->nr_reclaimed;
}
int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
int ret;
+ /* Minimum pages needed in order to stay on node */
+ const unsigned long nr_pages = 1 << order;
+ struct scan_control sc = {
+ .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+ .gfp_mask = current_gfp_context(gfp_mask),
+ .order = order,
+ .priority = NODE_RECLAIM_PRIORITY,
+ .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
+ .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
+ .may_swap = 1,
+ .reclaim_idx = gfp_zone(gfp_mask),
+ };
/*
* Node reclaim reclaims unmapped file backed pages and
@@ -7567,11 +7723,11 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
return NODE_RECLAIM_NOSCAN;
- if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+ if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
return NODE_RECLAIM_NOSCAN;
- ret = __node_reclaim(pgdat, gfp_mask, order);
- clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+ ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages;
+ clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
if (ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS);
@@ -7580,6 +7736,114 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
return ret;
}
+
+enum {
+ MEMORY_RECLAIM_SWAPPINESS = 0,
+ MEMORY_RECLAIM_SWAPPINESS_MAX,
+ MEMORY_RECLAIM_NULL,
+};
+static const match_table_t tokens = {
+ { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
+ { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"},
+ { MEMORY_RECLAIM_NULL, NULL },
+};
+
+int user_proactive_reclaim(char *buf,
+ struct mem_cgroup *memcg, pg_data_t *pgdat)
+{
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
+ unsigned long nr_to_reclaim, nr_reclaimed = 0;
+ int swappiness = -1;
+ char *old_buf, *start;
+ substring_t args[MAX_OPT_ARGS];
+ gfp_t gfp_mask = GFP_KERNEL;
+
+ if (!buf || (!memcg && !pgdat) || (memcg && pgdat))
+ return -EINVAL;
+
+ buf = strstrip(buf);
+
+ old_buf = buf;
+ nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
+ if (buf == old_buf)
+ return -EINVAL;
+
+ buf = strstrip(buf);
+
+ while ((start = strsep(&buf, " ")) != NULL) {
+ if (!strlen(start))
+ continue;
+ switch (match_token(start, tokens, args)) {
+ case MEMORY_RECLAIM_SWAPPINESS:
+ if (match_int(&args[0], &swappiness))
+ return -EINVAL;
+ if (swappiness < MIN_SWAPPINESS ||
+ swappiness > MAX_SWAPPINESS)
+ return -EINVAL;
+ break;
+ case MEMORY_RECLAIM_SWAPPINESS_MAX:
+ swappiness = SWAPPINESS_ANON_ONLY;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ while (nr_reclaimed < nr_to_reclaim) {
+ /* Will converge on zero, but reclaim enforces a minimum */
+ unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
+ unsigned long reclaimed;
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ /*
+ * This is the final attempt, drain percpu lru caches in the
+ * hope of introducing more evictable pages.
+ */
+ if (!nr_retries)
+ lru_add_drain_all();
+
+ if (memcg) {
+ unsigned int reclaim_options;
+
+ reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+ MEMCG_RECLAIM_PROACTIVE;
+ reclaimed = try_to_free_mem_cgroup_pages(memcg,
+ batch_size, gfp_mask,
+ reclaim_options,
+ swappiness == -1 ? NULL : &swappiness);
+ } else {
+ struct scan_control sc = {
+ .gfp_mask = current_gfp_context(gfp_mask),
+ .reclaim_idx = gfp_zone(gfp_mask),
+ .proactive_swappiness = swappiness == -1 ? NULL : &swappiness,
+ .priority = DEF_PRIORITY,
+ .may_writepage = !laptop_mode,
+ .nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX),
+ .may_unmap = 1,
+ .may_swap = 1,
+ .proactive = 1,
+ };
+
+ if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED,
+ &pgdat->flags))
+ return -EBUSY;
+
+ reclaimed = __node_reclaim(pgdat, gfp_mask,
+ batch_size, &sc);
+ clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+ }
+
+ if (!reclaimed && !nr_retries--)
+ return -EAGAIN;
+
+ nr_reclaimed += reclaimed;
+ }
+
+ return 0;
+}
+
#endif
/**
@@ -7627,3 +7891,26 @@ void check_move_unevictable_folios(struct folio_batch *fbatch)
}
}
EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+static ssize_t reclaim_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret, nid = dev->id;
+
+ ret = user_proactive_reclaim((char *)buf, NULL, NODE_DATA(nid));
+ return ret ? -EAGAIN : count;
+}
+
+static DEVICE_ATTR_WO(reclaim);
+int reclaim_register_node(struct node *node)
+{
+ return device_create_file(&node->dev, &dev_attr_reclaim);
+}
+
+void reclaim_unregister_node(struct node *node)
+{
+ return device_remove_file(&node->dev, &dev_attr_reclaim);
+}
+#endif
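
Editorial note (not part of the patch): the hunks above wire user_proactive_reclaim() to a per-node "reclaim" sysfs attribute. A write supplies a byte count (parsed with memparse, so K/M/G suffixes work), optionally followed by "swappiness=<value>" or "swappiness=max", and the kernel then reclaims in shrinking batches (a quarter of the remaining target per pass) until the target is met, a signal arrives, or retries run out. Below is a minimal userspace sketch of exercising that attribute; the node0 path and the swappiness value are illustrative assumptions, and the file only exists on kernels carrying this patch with CONFIG_SYSFS and CONFIG_NUMA enabled.

/* Hypothetical usage sketch: request ~512M of proactive reclaim on node 0. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *req = "512M swappiness=10";	/* size + optional swappiness token */
	int fd = open("/sys/devices/system/node/node0/reclaim", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/*
	 * reclaim_store() folds every failure into -EAGAIN, so a failed write
	 * only tells us the requested amount was not (fully) reclaimed.
	 */
	if (write(fd, req, strlen(req)) < 0)
		perror("write");
	close(fd);
	return 0;
}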
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 16bfe1c694dd..71cd1ceba191 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -7,7 +7,7 @@
*
* zoned VM statistics
* Copyright (C) 2006 Silicon Graphics, Inc.,
- * Christoph Lameter <christoph@lameter.com>
+ * Christoph Lameter <cl@gentwo.org>
* Copyright (C) 2008-2014 Christoph Lameter
*/
#include <linux/fs.h>
@@ -31,8 +31,10 @@
#include "internal.h"
+#ifdef CONFIG_PROC_FS
#ifdef CONFIG_NUMA
-int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
+#define ENABLE_NUMA_STAT 1
+static int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
/* zero numa counters within a zone */
static void zero_zone_numa_counters(struct zone *zone)
@@ -74,7 +76,7 @@ static void invalid_numa_statistics(void)
static DEFINE_MUTEX(vm_numa_stat_lock);
-int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write,
+static int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int ret, oldval;
@@ -102,6 +104,7 @@ out:
return ret;
}
#endif
+#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -1160,313 +1163,339 @@ int fragmentation_index(struct zone *zone, unsigned int order)
#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
#ifdef CONFIG_ZONE_DMA
-#define TEXT_FOR_DMA(xx) xx "_dma",
+#define TEXT_FOR_DMA(xx, yy) [xx##_DMA] = yy "_dma",
#else
-#define TEXT_FOR_DMA(xx)
+#define TEXT_FOR_DMA(xx, yy)
#endif
#ifdef CONFIG_ZONE_DMA32
-#define TEXT_FOR_DMA32(xx) xx "_dma32",
+#define TEXT_FOR_DMA32(xx, yy) [xx##_DMA32] = yy "_dma32",
#else
-#define TEXT_FOR_DMA32(xx)
+#define TEXT_FOR_DMA32(xx, yy)
#endif
#ifdef CONFIG_HIGHMEM
-#define TEXT_FOR_HIGHMEM(xx) xx "_high",
+#define TEXT_FOR_HIGHMEM(xx, yy) [xx##_HIGH] = yy "_high",
#else
-#define TEXT_FOR_HIGHMEM(xx)
+#define TEXT_FOR_HIGHMEM(xx, yy)
#endif
#ifdef CONFIG_ZONE_DEVICE
-#define TEXT_FOR_DEVICE(xx) xx "_device",
+#define TEXT_FOR_DEVICE(xx, yy) [xx##_DEVICE] = yy "_device",
#else
-#define TEXT_FOR_DEVICE(xx)
+#define TEXT_FOR_DEVICE(xx, yy)
#endif
-#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
- TEXT_FOR_HIGHMEM(xx) xx "_movable", \
- TEXT_FOR_DEVICE(xx)
+#define TEXTS_FOR_ZONES(xx, yy) \
+ TEXT_FOR_DMA(xx, yy) \
+ TEXT_FOR_DMA32(xx, yy) \
+ [xx##_NORMAL] = yy "_normal", \
+ TEXT_FOR_HIGHMEM(xx, yy) \
+ [xx##_MOVABLE] = yy "_movable", \
+ TEXT_FOR_DEVICE(xx, yy)
const char * const vmstat_text[] = {
/* enum zone_stat_item counters */
- "nr_free_pages",
- "nr_zone_inactive_anon",
- "nr_zone_active_anon",
- "nr_zone_inactive_file",
- "nr_zone_active_file",
- "nr_zone_unevictable",
- "nr_zone_write_pending",
- "nr_mlock",
- "nr_bounce",
+#define I(x) (x)
+ [I(NR_FREE_PAGES)] = "nr_free_pages",
+ [I(NR_FREE_PAGES_BLOCKS)] = "nr_free_pages_blocks",
+ [I(NR_ZONE_INACTIVE_ANON)] = "nr_zone_inactive_anon",
+ [I(NR_ZONE_ACTIVE_ANON)] = "nr_zone_active_anon",
+ [I(NR_ZONE_INACTIVE_FILE)] = "nr_zone_inactive_file",
+ [I(NR_ZONE_ACTIVE_FILE)] = "nr_zone_active_file",
+ [I(NR_ZONE_UNEVICTABLE)] = "nr_zone_unevictable",
+ [I(NR_ZONE_WRITE_PENDING)] = "nr_zone_write_pending",
+ [I(NR_MLOCK)] = "nr_mlock",
#if IS_ENABLED(CONFIG_ZSMALLOC)
- "nr_zspages",
+ [I(NR_ZSPAGES)] = "nr_zspages",
#endif
- "nr_free_cma",
+ [I(NR_FREE_CMA_PAGES)] = "nr_free_cma",
#ifdef CONFIG_UNACCEPTED_MEMORY
- "nr_unaccepted",
+ [I(NR_UNACCEPTED)] = "nr_unaccepted",
#endif
+#undef I
/* enum numa_stat_item counters */
+#define I(x) (NR_VM_ZONE_STAT_ITEMS + x)
#ifdef CONFIG_NUMA
- "numa_hit",
- "numa_miss",
- "numa_foreign",
- "numa_interleave",
- "numa_local",
- "numa_other",
+ [I(NUMA_HIT)] = "numa_hit",
+ [I(NUMA_MISS)] = "numa_miss",
+ [I(NUMA_FOREIGN)] = "numa_foreign",
+ [I(NUMA_INTERLEAVE_HIT)] = "numa_interleave",
+ [I(NUMA_LOCAL)] = "numa_local",
+ [I(NUMA_OTHER)] = "numa_other",
#endif
+#undef I
/* enum node_stat_item counters */
- "nr_inactive_anon",
- "nr_active_anon",
- "nr_inactive_file",
- "nr_active_file",
- "nr_unevictable",
- "nr_slab_reclaimable",
- "nr_slab_unreclaimable",
- "nr_isolated_anon",
- "nr_isolated_file",
- "workingset_nodes",
- "workingset_refault_anon",
- "workingset_refault_file",
- "workingset_activate_anon",
- "workingset_activate_file",
- "workingset_restore_anon",
- "workingset_restore_file",
- "workingset_nodereclaim",
- "nr_anon_pages",
- "nr_mapped",
- "nr_file_pages",
- "nr_dirty",
- "nr_writeback",
- "nr_writeback_temp",
- "nr_shmem",
- "nr_shmem_hugepages",
- "nr_shmem_pmdmapped",
- "nr_file_hugepages",
- "nr_file_pmdmapped",
- "nr_anon_transparent_hugepages",
- "nr_vmscan_write",
- "nr_vmscan_immediate_reclaim",
- "nr_dirtied",
- "nr_written",
- "nr_throttled_written",
- "nr_kernel_misc_reclaimable",
- "nr_foll_pin_acquired",
- "nr_foll_pin_released",
- "nr_kernel_stack",
+#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + x)
+ [I(NR_INACTIVE_ANON)] = "nr_inactive_anon",
+ [I(NR_ACTIVE_ANON)] = "nr_active_anon",
+ [I(NR_INACTIVE_FILE)] = "nr_inactive_file",
+ [I(NR_ACTIVE_FILE)] = "nr_active_file",
+ [I(NR_UNEVICTABLE)] = "nr_unevictable",
+ [I(NR_SLAB_RECLAIMABLE_B)] = "nr_slab_reclaimable",
+ [I(NR_SLAB_UNRECLAIMABLE_B)] = "nr_slab_unreclaimable",
+ [I(NR_ISOLATED_ANON)] = "nr_isolated_anon",
+ [I(NR_ISOLATED_FILE)] = "nr_isolated_file",
+ [I(WORKINGSET_NODES)] = "workingset_nodes",
+ [I(WORKINGSET_REFAULT_ANON)] = "workingset_refault_anon",
+ [I(WORKINGSET_REFAULT_FILE)] = "workingset_refault_file",
+ [I(WORKINGSET_ACTIVATE_ANON)] = "workingset_activate_anon",
+ [I(WORKINGSET_ACTIVATE_FILE)] = "workingset_activate_file",
+ [I(WORKINGSET_RESTORE_ANON)] = "workingset_restore_anon",
+ [I(WORKINGSET_RESTORE_FILE)] = "workingset_restore_file",
+ [I(WORKINGSET_NODERECLAIM)] = "workingset_nodereclaim",
+ [I(NR_ANON_MAPPED)] = "nr_anon_pages",
+ [I(NR_FILE_MAPPED)] = "nr_mapped",
+ [I(NR_FILE_PAGES)] = "nr_file_pages",
+ [I(NR_FILE_DIRTY)] = "nr_dirty",
+ [I(NR_WRITEBACK)] = "nr_writeback",
+ [I(NR_SHMEM)] = "nr_shmem",
+ [I(NR_SHMEM_THPS)] = "nr_shmem_hugepages",
+ [I(NR_SHMEM_PMDMAPPED)] = "nr_shmem_pmdmapped",
+ [I(NR_FILE_THPS)] = "nr_file_hugepages",
+ [I(NR_FILE_PMDMAPPED)] = "nr_file_pmdmapped",
+ [I(NR_ANON_THPS)] = "nr_anon_transparent_hugepages",
+ [I(NR_VMSCAN_WRITE)] = "nr_vmscan_write",
+ [I(NR_VMSCAN_IMMEDIATE)] = "nr_vmscan_immediate_reclaim",
+ [I(NR_DIRTIED)] = "nr_dirtied",
+ [I(NR_WRITTEN)] = "nr_written",
+ [I(NR_THROTTLED_WRITTEN)] = "nr_throttled_written",
+ [I(NR_KERNEL_MISC_RECLAIMABLE)] = "nr_kernel_misc_reclaimable",
+ [I(NR_FOLL_PIN_ACQUIRED)] = "nr_foll_pin_acquired",
+ [I(NR_FOLL_PIN_RELEASED)] = "nr_foll_pin_released",
+ [I(NR_KERNEL_STACK_KB)] = "nr_kernel_stack",
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
- "nr_shadow_call_stack",
+ [I(NR_KERNEL_SCS_KB)] = "nr_shadow_call_stack",
#endif
- "nr_page_table_pages",
- "nr_sec_page_table_pages",
+ [I(NR_PAGETABLE)] = "nr_page_table_pages",
+ [I(NR_SECONDARY_PAGETABLE)] = "nr_sec_page_table_pages",
#ifdef CONFIG_IOMMU_SUPPORT
- "nr_iommu_pages",
+ [I(NR_IOMMU_PAGES)] = "nr_iommu_pages",
#endif
#ifdef CONFIG_SWAP
- "nr_swapcached",
+ [I(NR_SWAPCACHE)] = "nr_swapcached",
#endif
#ifdef CONFIG_NUMA_BALANCING
- "pgpromote_success",
- "pgpromote_candidate",
+ [I(PGPROMOTE_SUCCESS)] = "pgpromote_success",
+ [I(PGPROMOTE_CANDIDATE)] = "pgpromote_candidate",
#endif
- "pgdemote_kswapd",
- "pgdemote_direct",
- "pgdemote_khugepaged",
+ [I(PGDEMOTE_KSWAPD)] = "pgdemote_kswapd",
+ [I(PGDEMOTE_DIRECT)] = "pgdemote_direct",
+ [I(PGDEMOTE_KHUGEPAGED)] = "pgdemote_khugepaged",
+ [I(PGDEMOTE_PROACTIVE)] = "pgdemote_proactive",
#ifdef CONFIG_HUGETLB_PAGE
- "nr_hugetlb",
+ [I(NR_HUGETLB)] = "nr_hugetlb",
#endif
- /* system-wide enum vm_stat_item counters */
- "nr_dirty_threshold",
- "nr_dirty_background_threshold",
- "nr_memmap_pages",
- "nr_memmap_boot_pages",
+ [I(NR_BALLOON_PAGES)] = "nr_balloon_pages",
+#undef I
-#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
+ /* system-wide enum vm_stat_item counters */
+#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
+ NR_VM_NODE_STAT_ITEMS + x)
+ [I(NR_DIRTY_THRESHOLD)] = "nr_dirty_threshold",
+ [I(NR_DIRTY_BG_THRESHOLD)] = "nr_dirty_background_threshold",
+ [I(NR_MEMMAP_PAGES)] = "nr_memmap_pages",
+ [I(NR_MEMMAP_BOOT_PAGES)] = "nr_memmap_boot_pages",
+#undef I
+
+#if defined(CONFIG_VM_EVENT_COUNTERS)
/* enum vm_event_item counters */
- "pgpgin",
- "pgpgout",
- "pswpin",
- "pswpout",
-
- TEXTS_FOR_ZONES("pgalloc")
- TEXTS_FOR_ZONES("allocstall")
- TEXTS_FOR_ZONES("pgskip")
-
- "pgfree",
- "pgactivate",
- "pgdeactivate",
- "pglazyfree",
-
- "pgfault",
- "pgmajfault",
- "pglazyfreed",
-
- "pgrefill",
- "pgreuse",
- "pgsteal_kswapd",
- "pgsteal_direct",
- "pgsteal_khugepaged",
- "pgscan_kswapd",
- "pgscan_direct",
- "pgscan_khugepaged",
- "pgscan_direct_throttle",
- "pgscan_anon",
- "pgscan_file",
- "pgsteal_anon",
- "pgsteal_file",
+#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
+ NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS + x)
+
+ [I(PGPGIN)] = "pgpgin",
+ [I(PGPGOUT)] = "pgpgout",
+ [I(PSWPIN)] = "pswpin",
+ [I(PSWPOUT)] = "pswpout",
+
+#define OFF (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
+ NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS)
+ TEXTS_FOR_ZONES(OFF+PGALLOC, "pgalloc")
+ TEXTS_FOR_ZONES(OFF+ALLOCSTALL, "allocstall")
+ TEXTS_FOR_ZONES(OFF+PGSCAN_SKIP, "pgskip")
+#undef OFF
+
+ [I(PGFREE)] = "pgfree",
+ [I(PGACTIVATE)] = "pgactivate",
+ [I(PGDEACTIVATE)] = "pgdeactivate",
+ [I(PGLAZYFREE)] = "pglazyfree",
+
+ [I(PGFAULT)] = "pgfault",
+ [I(PGMAJFAULT)] = "pgmajfault",
+ [I(PGLAZYFREED)] = "pglazyfreed",
+
+ [I(PGREFILL)] = "pgrefill",
+ [I(PGREUSE)] = "pgreuse",
+ [I(PGSTEAL_KSWAPD)] = "pgsteal_kswapd",
+ [I(PGSTEAL_DIRECT)] = "pgsteal_direct",
+ [I(PGSTEAL_KHUGEPAGED)] = "pgsteal_khugepaged",
+ [I(PGSTEAL_PROACTIVE)] = "pgsteal_proactive",
+ [I(PGSCAN_KSWAPD)] = "pgscan_kswapd",
+ [I(PGSCAN_DIRECT)] = "pgscan_direct",
+ [I(PGSCAN_KHUGEPAGED)] = "pgscan_khugepaged",
+ [I(PGSCAN_PROACTIVE)] = "pgscan_proactive",
+ [I(PGSCAN_DIRECT_THROTTLE)] = "pgscan_direct_throttle",
+ [I(PGSCAN_ANON)] = "pgscan_anon",
+ [I(PGSCAN_FILE)] = "pgscan_file",
+ [I(PGSTEAL_ANON)] = "pgsteal_anon",
+ [I(PGSTEAL_FILE)] = "pgsteal_file",
#ifdef CONFIG_NUMA
- "zone_reclaim_success",
- "zone_reclaim_failed",
+ [I(PGSCAN_ZONE_RECLAIM_SUCCESS)] = "zone_reclaim_success",
+ [I(PGSCAN_ZONE_RECLAIM_FAILED)] = "zone_reclaim_failed",
#endif
- "pginodesteal",
- "slabs_scanned",
- "kswapd_inodesteal",
- "kswapd_low_wmark_hit_quickly",
- "kswapd_high_wmark_hit_quickly",
- "pageoutrun",
+ [I(PGINODESTEAL)] = "pginodesteal",
+ [I(SLABS_SCANNED)] = "slabs_scanned",
+ [I(KSWAPD_INODESTEAL)] = "kswapd_inodesteal",
+ [I(KSWAPD_LOW_WMARK_HIT_QUICKLY)] = "kswapd_low_wmark_hit_quickly",
+ [I(KSWAPD_HIGH_WMARK_HIT_QUICKLY)] = "kswapd_high_wmark_hit_quickly",
+ [I(PAGEOUTRUN)] = "pageoutrun",
- "pgrotated",
+ [I(PGROTATED)] = "pgrotated",
- "drop_pagecache",
- "drop_slab",
- "oom_kill",
+ [I(DROP_PAGECACHE)] = "drop_pagecache",
+ [I(DROP_SLAB)] = "drop_slab",
+ [I(OOM_KILL)] = "oom_kill",
#ifdef CONFIG_NUMA_BALANCING
- "numa_pte_updates",
- "numa_huge_pte_updates",
- "numa_hint_faults",
- "numa_hint_faults_local",
- "numa_pages_migrated",
+ [I(NUMA_PTE_UPDATES)] = "numa_pte_updates",
+ [I(NUMA_HUGE_PTE_UPDATES)] = "numa_huge_pte_updates",
+ [I(NUMA_HINT_FAULTS)] = "numa_hint_faults",
+ [I(NUMA_HINT_FAULTS_LOCAL)] = "numa_hint_faults_local",
+ [I(NUMA_PAGE_MIGRATE)] = "numa_pages_migrated",
#endif
#ifdef CONFIG_MIGRATION
- "pgmigrate_success",
- "pgmigrate_fail",
- "thp_migration_success",
- "thp_migration_fail",
- "thp_migration_split",
+ [I(PGMIGRATE_SUCCESS)] = "pgmigrate_success",
+ [I(PGMIGRATE_FAIL)] = "pgmigrate_fail",
+ [I(THP_MIGRATION_SUCCESS)] = "thp_migration_success",
+ [I(THP_MIGRATION_FAIL)] = "thp_migration_fail",
+ [I(THP_MIGRATION_SPLIT)] = "thp_migration_split",
#endif
#ifdef CONFIG_COMPACTION
- "compact_migrate_scanned",
- "compact_free_scanned",
- "compact_isolated",
- "compact_stall",
- "compact_fail",
- "compact_success",
- "compact_daemon_wake",
- "compact_daemon_migrate_scanned",
- "compact_daemon_free_scanned",
+ [I(COMPACTMIGRATE_SCANNED)] = "compact_migrate_scanned",
+ [I(COMPACTFREE_SCANNED)] = "compact_free_scanned",
+ [I(COMPACTISOLATED)] = "compact_isolated",
+ [I(COMPACTSTALL)] = "compact_stall",
+ [I(COMPACTFAIL)] = "compact_fail",
+ [I(COMPACTSUCCESS)] = "compact_success",
+ [I(KCOMPACTD_WAKE)] = "compact_daemon_wake",
+ [I(KCOMPACTD_MIGRATE_SCANNED)] = "compact_daemon_migrate_scanned",
+ [I(KCOMPACTD_FREE_SCANNED)] = "compact_daemon_free_scanned",
#endif
#ifdef CONFIG_HUGETLB_PAGE
- "htlb_buddy_alloc_success",
- "htlb_buddy_alloc_fail",
+ [I(HTLB_BUDDY_PGALLOC)] = "htlb_buddy_alloc_success",
+ [I(HTLB_BUDDY_PGALLOC_FAIL)] = "htlb_buddy_alloc_fail",
#endif
#ifdef CONFIG_CMA
- "cma_alloc_success",
- "cma_alloc_fail",
+ [I(CMA_ALLOC_SUCCESS)] = "cma_alloc_success",
+ [I(CMA_ALLOC_FAIL)] = "cma_alloc_fail",
#endif
- "unevictable_pgs_culled",
- "unevictable_pgs_scanned",
- "unevictable_pgs_rescued",
- "unevictable_pgs_mlocked",
- "unevictable_pgs_munlocked",
- "unevictable_pgs_cleared",
- "unevictable_pgs_stranded",
+ [I(UNEVICTABLE_PGCULLED)] = "unevictable_pgs_culled",
+ [I(UNEVICTABLE_PGSCANNED)] = "unevictable_pgs_scanned",
+ [I(UNEVICTABLE_PGRESCUED)] = "unevictable_pgs_rescued",
+ [I(UNEVICTABLE_PGMLOCKED)] = "unevictable_pgs_mlocked",
+ [I(UNEVICTABLE_PGMUNLOCKED)] = "unevictable_pgs_munlocked",
+ [I(UNEVICTABLE_PGCLEARED)] = "unevictable_pgs_cleared",
+ [I(UNEVICTABLE_PGSTRANDED)] = "unevictable_pgs_stranded",
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- "thp_fault_alloc",
- "thp_fault_fallback",
- "thp_fault_fallback_charge",
- "thp_collapse_alloc",
- "thp_collapse_alloc_failed",
- "thp_file_alloc",
- "thp_file_fallback",
- "thp_file_fallback_charge",
- "thp_file_mapped",
- "thp_split_page",
- "thp_split_page_failed",
- "thp_deferred_split_page",
- "thp_underused_split_page",
- "thp_split_pmd",
- "thp_scan_exceed_none_pte",
- "thp_scan_exceed_swap_pte",
- "thp_scan_exceed_share_pte",
+ [I(THP_FAULT_ALLOC)] = "thp_fault_alloc",
+ [I(THP_FAULT_FALLBACK)] = "thp_fault_fallback",
+ [I(THP_FAULT_FALLBACK_CHARGE)] = "thp_fault_fallback_charge",
+ [I(THP_COLLAPSE_ALLOC)] = "thp_collapse_alloc",
+ [I(THP_COLLAPSE_ALLOC_FAILED)] = "thp_collapse_alloc_failed",
+ [I(THP_FILE_ALLOC)] = "thp_file_alloc",
+ [I(THP_FILE_FALLBACK)] = "thp_file_fallback",
+ [I(THP_FILE_FALLBACK_CHARGE)] = "thp_file_fallback_charge",
+ [I(THP_FILE_MAPPED)] = "thp_file_mapped",
+ [I(THP_SPLIT_PAGE)] = "thp_split_page",
+ [I(THP_SPLIT_PAGE_FAILED)] = "thp_split_page_failed",
+ [I(THP_DEFERRED_SPLIT_PAGE)] = "thp_deferred_split_page",
+ [I(THP_UNDERUSED_SPLIT_PAGE)] = "thp_underused_split_page",
+ [I(THP_SPLIT_PMD)] = "thp_split_pmd",
+ [I(THP_SCAN_EXCEED_NONE_PTE)] = "thp_scan_exceed_none_pte",
+ [I(THP_SCAN_EXCEED_SWAP_PTE)] = "thp_scan_exceed_swap_pte",
+ [I(THP_SCAN_EXCEED_SHARED_PTE)] = "thp_scan_exceed_share_pte",
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
- "thp_split_pud",
+ [I(THP_SPLIT_PUD)] = "thp_split_pud",
#endif
- "thp_zero_page_alloc",
- "thp_zero_page_alloc_failed",
- "thp_swpout",
- "thp_swpout_fallback",
+ [I(THP_ZERO_PAGE_ALLOC)] = "thp_zero_page_alloc",
+ [I(THP_ZERO_PAGE_ALLOC_FAILED)] = "thp_zero_page_alloc_failed",
+ [I(THP_SWPOUT)] = "thp_swpout",
+ [I(THP_SWPOUT_FALLBACK)] = "thp_swpout_fallback",
#endif
#ifdef CONFIG_MEMORY_BALLOON
- "balloon_inflate",
- "balloon_deflate",
+ [I(BALLOON_INFLATE)] = "balloon_inflate",
+ [I(BALLOON_DEFLATE)] = "balloon_deflate",
#ifdef CONFIG_BALLOON_COMPACTION
- "balloon_migrate",
+ [I(BALLOON_MIGRATE)] = "balloon_migrate",
#endif
#endif /* CONFIG_MEMORY_BALLOON */
#ifdef CONFIG_DEBUG_TLBFLUSH
- "nr_tlb_remote_flush",
- "nr_tlb_remote_flush_received",
- "nr_tlb_local_flush_all",
- "nr_tlb_local_flush_one",
+ [I(NR_TLB_REMOTE_FLUSH)] = "nr_tlb_remote_flush",
+ [I(NR_TLB_REMOTE_FLUSH_RECEIVED)] = "nr_tlb_remote_flush_received",
+ [I(NR_TLB_LOCAL_FLUSH_ALL)] = "nr_tlb_local_flush_all",
+ [I(NR_TLB_LOCAL_FLUSH_ONE)] = "nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */
#ifdef CONFIG_SWAP
- "swap_ra",
- "swap_ra_hit",
- "swpin_zero",
- "swpout_zero",
+ [I(SWAP_RA)] = "swap_ra",
+ [I(SWAP_RA_HIT)] = "swap_ra_hit",
+ [I(SWPIN_ZERO)] = "swpin_zero",
+ [I(SWPOUT_ZERO)] = "swpout_zero",
#ifdef CONFIG_KSM
- "ksm_swpin_copy",
+ [I(KSM_SWPIN_COPY)] = "ksm_swpin_copy",
#endif
#endif
#ifdef CONFIG_KSM
- "cow_ksm",
+ [I(COW_KSM)] = "cow_ksm",
#endif
#ifdef CONFIG_ZSWAP
- "zswpin",
- "zswpout",
- "zswpwb",
+ [I(ZSWPIN)] = "zswpin",
+ [I(ZSWPOUT)] = "zswpout",
+ [I(ZSWPWB)] = "zswpwb",
#endif
#ifdef CONFIG_X86
- "direct_map_level2_splits",
- "direct_map_level3_splits",
+ [I(DIRECT_MAP_LEVEL2_SPLIT)] = "direct_map_level2_splits",
+ [I(DIRECT_MAP_LEVEL3_SPLIT)] = "direct_map_level3_splits",
+ [I(DIRECT_MAP_LEVEL2_COLLAPSE)] = "direct_map_level2_collapses",
+ [I(DIRECT_MAP_LEVEL3_COLLAPSE)] = "direct_map_level3_collapses",
#endif
#ifdef CONFIG_PER_VMA_LOCK_STATS
- "vma_lock_success",
- "vma_lock_abort",
- "vma_lock_retry",
- "vma_lock_miss",
+ [I(VMA_LOCK_SUCCESS)] = "vma_lock_success",
+ [I(VMA_LOCK_ABORT)] = "vma_lock_abort",
+ [I(VMA_LOCK_RETRY)] = "vma_lock_retry",
+ [I(VMA_LOCK_MISS)] = "vma_lock_miss",
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
- "kstack_1k",
+ [I(KSTACK_1K)] = "kstack_1k",
#if THREAD_SIZE > 1024
- "kstack_2k",
+ [I(KSTACK_2K)] = "kstack_2k",
#endif
#if THREAD_SIZE > 2048
- "kstack_4k",
+ [I(KSTACK_4K)] = "kstack_4k",
#endif
#if THREAD_SIZE > 4096
- "kstack_8k",
+ [I(KSTACK_8K)] = "kstack_8k",
#endif
#if THREAD_SIZE > 8192
- "kstack_16k",
+ [I(KSTACK_16K)] = "kstack_16k",
#endif
#if THREAD_SIZE > 16384
- "kstack_32k",
+ [I(KSTACK_32K)] = "kstack_32k",
#endif
#if THREAD_SIZE > 32768
- "kstack_64k",
+ [I(KSTACK_64K)] = "kstack_64k",
#endif
#if THREAD_SIZE > 65536
- "kstack_rest",
+ [I(KSTACK_REST)] = "kstack_rest",
#endif
#endif
-#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
+#undef I
+#endif /* CONFIG_VM_EVENT_COUNTERS */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
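
Editorial note (not part of the patch): the rework above replaces positional string entries in vmstat_text[] with designated initializers whose indices are computed from the preceding enum blocks, so a counter that is compiled out or reordered can no longer silently shift every later name; the next hunk tightens the BUILD_BUG_ON to an exact-size check for the same reason. A small standalone sketch of the same offset-macro pattern, using made-up enums rather than the real vmstat items:

#include <stdio.h>

/* Illustrative only: two enum blocks share one name table, with the second
 * block's indices offset past the first, mirroring the I(x) helpers above. */
enum zone_items { Z_FREE, Z_ACTIVE, NR_ZONE_ITEMS };
enum node_items { N_DIRTY, N_WRITEBACK, NR_NODE_ITEMS };

#define I(x) (NR_ZONE_ITEMS + (x))	/* node names start after the zone block */
static const char *const item_text[NR_ZONE_ITEMS + NR_NODE_ITEMS] = {
	[Z_FREE]	 = "z_free",
	[Z_ACTIVE]	 = "z_active",
	[I(N_DIRTY)]	 = "n_dirty",
	[I(N_WRITEBACK)] = "n_writeback",
};
#undef I

int main(void)
{
	for (int i = 0; i < NR_ZONE_ITEMS + NR_NODE_ITEMS; i++)
		printf("%d: %s\n", i, item_text[i]);
	return 0;
}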
@@ -1858,7 +1887,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
if (*pos >= NR_VMSTAT_ITEMS)
return NULL;
- BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
+ BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) != NR_VMSTAT_ITEMS);
fold_vm_numa_events();
v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
m->private = v;
@@ -1938,7 +1967,7 @@ static const struct seq_operations vmstat_op = {
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
-int sysctl_stat_interval __read_mostly = HZ;
+static int sysctl_stat_interval __read_mostly = HZ;
static int vmstat_late_init_done;
#ifdef CONFIG_PROC_FS
@@ -1947,7 +1976,7 @@ static void refresh_vm_stats(struct work_struct *work)
refresh_cpu_vm_stats(true);
}
-int vmstat_refresh(const struct ctl_table *table, int write,
+static int vmstat_refresh(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
long val;
@@ -2196,6 +2225,38 @@ static int __init vmstat_late_init(void)
late_initcall(vmstat_late_init);
#endif
+#ifdef CONFIG_PROC_FS
+static const struct ctl_table vmstat_table[] = {
+#ifdef CONFIG_SMP
+ {
+ .procname = "stat_interval",
+ .data = &sysctl_stat_interval,
+ .maxlen = sizeof(sysctl_stat_interval),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "stat_refresh",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0600,
+ .proc_handler = vmstat_refresh,
+ },
+#endif
+#ifdef CONFIG_NUMA
+ {
+ .procname = "numa_stat",
+ .data = &sysctl_vm_numa_stat,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_vm_numa_stat_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
+};
+#endif
+
struct workqueue_struct *mm_percpu_wq;
void __init init_mm_internals(void)
@@ -2227,6 +2288,7 @@ void __init init_mm_internals(void)
proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
+ register_sysctl_init("vm", vmstat_table);
#endif
}
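
Editorial note (not part of the patch): the hunks above move the vmstat knobs out of the global sysctl table and register them from vmstat.c via register_sysctl_init(), which also lets sysctl_stat_interval and the handlers become static. The user-visible files stay where they were, under /proc/sys/vm/. A brief userspace sketch, assuming a kernel with CONFIG_SMP and procfs mounted in the usual place:

#include <stdio.h>

int main(void)
{
	unsigned long interval = 0;
	FILE *f = fopen("/proc/sys/vm/stat_interval", "r");

	if (f && fscanf(f, "%lu", &interval) == 1)
		printf("vmstat refresh interval: %lu s\n", interval);
	if (f)
		fclose(f);

	/* A write forces an immediate stats fold via vmstat_refresh() (root only). */
	f = fopen("/proc/sys/vm/stat_refresh", "w");
	if (f) {
		fputs("1\n", f);
		fclose(f);
	}
	return 0;
}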
diff --git a/mm/workingset.c b/mm/workingset.c
index a4705e196545..6e7f4cb1b9a7 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -239,7 +239,8 @@ static void *lru_gen_eviction(struct folio *folio)
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
- int tier = lru_tier_from_refs(refs);
+ bool workingset = folio_test_workingset(folio);
+ int tier = lru_tier_from_refs(refs, workingset);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
@@ -253,18 +254,18 @@ static void *lru_gen_eviction(struct folio *folio)
hist = lru_hist_from_seq(min_seq);
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
- return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
}
/*
* Tests if the shadow entry is for a folio that was recently evicted.
* Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
*/
-static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
+static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
unsigned long *token, bool *workingset)
{
int memcg_id;
- unsigned long min_seq;
+ unsigned long max_seq;
struct mem_cgroup *memcg;
struct pglist_data *pgdat;
@@ -273,8 +274,10 @@ static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
memcg = mem_cgroup_from_id(memcg_id);
*lruvec = mem_cgroup_lruvec(memcg, pgdat);
- min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
- return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
+ max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
+ max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH;
+
+ return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
}
static void lru_gen_refault(struct folio *folio, void *shadow)
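
Editorial note (not part of the patch): the hunk above changes the MGLRU recency test for shadow entries. Instead of requiring the packed sequence number to equal the current min_seq, an entry now counts as recent when its sequence number lies within MAX_NR_GENS of max_seq. A worked sketch of that window check, ignoring the EVICTION_MASK truncation and assuming MAX_NR_GENS == 4 as in current kernels:

#include <stdio.h>

#define MAX_NR_GENS 4UL	/* assumption: matches the kernel's value */

static int recent(unsigned long max_seq, unsigned long token_seq)
{
	unsigned long d = max_seq > token_seq ? max_seq - token_seq
					      : token_seq - max_seq;	/* abs_diff() */
	return d < MAX_NR_GENS;
}

int main(void)
{
	printf("%d\n", recent(10, 8));	/* evicted 2 generations ago -> recent (1) */
	printf("%d\n", recent(10, 4));	/* evicted 6 generations ago -> stale (0)  */
	return 0;
}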
@@ -290,7 +293,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
rcu_read_lock();
- recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset);
+ recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset);
if (lruvec != folio_lruvec(folio))
goto unlock;
@@ -302,24 +305,20 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
lrugen = &lruvec->lrugen;
hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
- /* see the comment in folio_lru_refs() */
- refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
- tier = lru_tier_from_refs(refs);
+ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
+ tier = lru_tier_from_refs(refs, workingset);
atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
- mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
- /*
- * Count the following two cases as stalls:
- * 1. For pages accessed through page tables, hotter pages pushed out
- * hot pages which refaulted immediately.
- * 2. For pages accessed multiple times through file descriptors,
- * they would have been protected by sort_folio().
- */
- if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) {
- set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset));
+ /* see folio_add_lru() where folio_set_active() will be called */
+ if (lru_gen_in_fault())
+ mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
+
+ if (workingset) {
+ folio_set_workingset(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
- }
+ } else
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
unlock:
rcu_read_unlock();
}
@@ -331,7 +330,7 @@ static void *lru_gen_eviction(struct folio *folio)
return NULL;
}
-static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
+static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
unsigned long *token, bool *workingset)
{
return false;
@@ -428,17 +427,16 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset,
struct pglist_data *pgdat;
unsigned long eviction;
- rcu_read_lock();
-
if (lru_gen_enabled()) {
- bool recent = lru_gen_test_recent(shadow, file,
- &eviction_lruvec, &eviction, workingset);
+ bool recent;
+ rcu_read_lock();
+ recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset);
rcu_read_unlock();
return recent;
}
-
+ rcu_read_lock();
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
eviction <<= bucket_order;
@@ -459,14 +457,12 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset,
* configurations instead.
*/
eviction_memcg = mem_cgroup_from_id(memcgid);
- if (!mem_cgroup_disabled() &&
- (!eviction_memcg || !mem_cgroup_tryget(eviction_memcg))) {
- rcu_read_unlock();
- return false;
- }
-
+ if (!mem_cgroup_tryget(eviction_memcg))
+ eviction_memcg = NULL;
rcu_read_unlock();
+ if (!mem_cgroup_disabled() && !eviction_memcg)
+ return false;
/*
* Flush stats (and potentially sleep) outside the RCU read section.
*
@@ -544,6 +540,8 @@ void workingset_refault(struct folio *folio, void *shadow)
bool workingset;
long nr;
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
if (lru_gen_enabled()) {
lru_gen_refault(folio, shadow);
return;
@@ -558,7 +556,6 @@ void workingset_refault(struct folio *folio, void *shadow)
* is actually experiencing the refault event. Make sure the folio is
* locked to guarantee folio_memcg() stability throughout.
*/
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
nr = folio_nr_pages(folio);
memcg = folio_memcg(folio);
pgdat = folio_pgdat(folio);
@@ -615,7 +612,6 @@ struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
- struct address_space *mapping;
struct page *page = virt_to_page(node);
/*
@@ -626,8 +622,7 @@ void workingset_update_node(struct xa_node *node)
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
- mapping = container_of(node->array, struct address_space, i_pages);
- lockdep_assert_held(&mapping->i_pages.xa_lock);
+ lockdep_assert_held(&node->array->xa_lock);
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
diff --git a/mm/z3fold.c b/mm/z3fold.c
deleted file mode 100644
index 379d24b4fef9..000000000000
--- a/mm/z3fold.c
+++ /dev/null
@@ -1,1447 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * z3fold.c
- *
- * Author: Vitaly Wool <vitaly.wool@konsulko.com>
- * Copyright (C) 2016, Sony Mobile Communications Inc.
- *
- * This implementation is based on zbud written by Seth Jennings.
- *
- * z3fold is an special purpose allocator for storing compressed pages. It
- * can store up to three compressed pages per page which improves the
- * compression ratio of zbud while retaining its main concepts (e. g. always
- * storing an integral number of objects per page) and simplicity.
- * It still has simple and deterministic reclaim properties that make it
- * preferable to a higher density approach (with no requirement on integral
- * number of object per page) when reclaim is used.
- *
- * As in zbud, pages are divided into "chunks". The size of the chunks is
- * fixed at compile time and is determined by NCHUNKS_ORDER below.
- *
- * z3fold doesn't export any API and is meant to be used via zpool API.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/atomic.h>
-#include <linux/sched.h>
-#include <linux/cpumask.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/page-flags.h>
-#include <linux/migrate.h>
-#include <linux/node.h>
-#include <linux/compaction.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/workqueue.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/zpool.h>
-#include <linux/kmemleak.h>
-
-/*
- * NCHUNKS_ORDER determines the internal allocation granularity, effectively
- * adjusting internal fragmentation. It also determines the number of
- * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
- * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
- * in the beginning of an allocated page are occupied by z3fold header, so
- * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
- * which shows the max number of free chunks in z3fold page, also there will
- * be 63, or 62, respectively, freelists per pool.
- */
-#define NCHUNKS_ORDER 6
-
-#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
-#define CHUNK_SIZE (1 << CHUNK_SHIFT)
-#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
-#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
-#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
-#define NCHUNKS (TOTAL_CHUNKS - ZHDR_CHUNKS)
-
-#define BUDDY_MASK (0x3)
-#define BUDDY_SHIFT 2
-#define SLOTS_ALIGN (0x40)
-
-/*****************
- * Structures
-*****************/
-struct z3fold_pool;
-
-enum buddy {
- HEADLESS = 0,
- FIRST,
- MIDDLE,
- LAST,
- BUDDIES_MAX = LAST
-};
-
-struct z3fold_buddy_slots {
- /*
- * we are using BUDDY_MASK in handle_to_buddy etc. so there should
- * be enough slots to hold all possible variants
- */
- unsigned long slot[BUDDY_MASK + 1];
- unsigned long pool; /* back link */
- rwlock_t lock;
-};
-#define HANDLE_FLAG_MASK (0x03)
-
-/*
- * struct z3fold_header - z3fold page metadata occupying first chunks of each
- * z3fold page, except for HEADLESS pages
- * @buddy: links the z3fold page into the relevant list in the
- * pool
- * @page_lock: per-page lock
- * @refcount: reference count for the z3fold page
- * @work: work_struct for page layout optimization
- * @slots: pointer to the structure holding buddy slots
- * @pool: pointer to the containing pool
- * @cpu: CPU which this page "belongs" to
- * @first_chunks: the size of the first buddy in chunks, 0 if free
- * @middle_chunks: the size of the middle buddy in chunks, 0 if free
- * @last_chunks: the size of the last buddy in chunks, 0 if free
- * @first_num: the starting number (for the first handle)
- * @mapped_count: the number of objects currently mapped
- */
-struct z3fold_header {
- struct list_head buddy;
- spinlock_t page_lock;
- struct kref refcount;
- struct work_struct work;
- struct z3fold_buddy_slots *slots;
- struct z3fold_pool *pool;
- short cpu;
- unsigned short first_chunks;
- unsigned short middle_chunks;
- unsigned short last_chunks;
- unsigned short start_middle;
- unsigned short first_num:2;
- unsigned short mapped_count:2;
- unsigned short foreign_handles:2;
-};
-
-/**
- * struct z3fold_pool - stores metadata for each z3fold pool
- * @name: pool name
- * @lock: protects pool unbuddied lists
- * @stale_lock: protects pool stale page list
- * @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2-
- * buddies; the list each z3fold page is added to depends on
- * the size of its free region.
- * @stale: list of pages marked for freeing
- * @pages_nr: number of z3fold pages in the pool.
- * @c_handle: cache for z3fold_buddy_slots allocation
- * @compact_wq: workqueue for page layout background optimization
- * @release_wq: workqueue for safe page release
- * @work: work_struct for safe page release
- *
- * This structure is allocated at pool creation time and maintains metadata
- * pertaining to a particular z3fold pool.
- */
-struct z3fold_pool {
- const char *name;
- spinlock_t lock;
- spinlock_t stale_lock;
- struct list_head __percpu *unbuddied;
- struct list_head stale;
- atomic64_t pages_nr;
- struct kmem_cache *c_handle;
- struct workqueue_struct *compact_wq;
- struct workqueue_struct *release_wq;
- struct work_struct work;
-};
-
-/*
- * Internal z3fold page flags
- */
-enum z3fold_page_flags {
- PAGE_HEADLESS = 0,
- MIDDLE_CHUNK_MAPPED,
- NEEDS_COMPACTING,
- PAGE_STALE,
- PAGE_CLAIMED, /* by either reclaim or free */
- PAGE_MIGRATED, /* page is migrated and soon to be released */
-};
-
-/*
- * handle flags, go under HANDLE_FLAG_MASK
- */
-enum z3fold_handle_flags {
- HANDLES_NOFREE = 0,
-};
-
-/*
- * Forward declarations
- */
-static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool);
-static void compact_page_work(struct work_struct *w);
-
-/*****************
- * Helpers
-*****************/
-
-/* Converts an allocation size in bytes to size in z3fold chunks */
-static int size_to_chunks(size_t size)
-{
- return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
-}
-
-#define for_each_unbuddied_list(_iter, _begin) \
- for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
-
-static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
- gfp_t gfp)
-{
- struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle,
- gfp);
-
- if (slots) {
- /* It will be freed separately in free_handle(). */
- kmemleak_not_leak(slots);
- slots->pool = (unsigned long)pool;
- rwlock_init(&slots->lock);
- }
-
- return slots;
-}
-
-static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
-{
- return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
-}
-
-static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
-{
- return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
-}
-
-/* Lock a z3fold page */
-static inline void z3fold_page_lock(struct z3fold_header *zhdr)
-{
- spin_lock(&zhdr->page_lock);
-}
-
-/* Try to lock a z3fold page */
-static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
-{
- return spin_trylock(&zhdr->page_lock);
-}
-
-/* Unlock a z3fold page */
-static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
-{
- spin_unlock(&zhdr->page_lock);
-}
-
-/* return locked z3fold page if it's not headless */
-static inline struct z3fold_header *get_z3fold_header(unsigned long handle)
-{
- struct z3fold_buddy_slots *slots;
- struct z3fold_header *zhdr;
- int locked = 0;
-
- if (!(handle & (1 << PAGE_HEADLESS))) {
- slots = handle_to_slots(handle);
- do {
- unsigned long addr;
-
- read_lock(&slots->lock);
- addr = *(unsigned long *)handle;
- zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
- locked = z3fold_page_trylock(zhdr);
- read_unlock(&slots->lock);
- if (locked) {
- struct page *page = virt_to_page(zhdr);
-
- if (!test_bit(PAGE_MIGRATED, &page->private))
- break;
- z3fold_page_unlock(zhdr);
- }
- cpu_relax();
- } while (true);
- } else {
- zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
- }
-
- return zhdr;
-}
-
-static inline void put_z3fold_header(struct z3fold_header *zhdr)
-{
- struct page *page = virt_to_page(zhdr);
-
- if (!test_bit(PAGE_HEADLESS, &page->private))
- z3fold_page_unlock(zhdr);
-}
-
-static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
-{
- struct z3fold_buddy_slots *slots;
- int i;
- bool is_free;
-
- if (WARN_ON(*(unsigned long *)handle == 0))
- return;
-
- slots = handle_to_slots(handle);
- write_lock(&slots->lock);
- *(unsigned long *)handle = 0;
-
- if (test_bit(HANDLES_NOFREE, &slots->pool)) {
- write_unlock(&slots->lock);
- return; /* simple case, nothing else to do */
- }
-
- if (zhdr->slots != slots)
- zhdr->foreign_handles--;
-
- is_free = true;
- for (i = 0; i <= BUDDY_MASK; i++) {
- if (slots->slot[i]) {
- is_free = false;
- break;
- }
- }
- write_unlock(&slots->lock);
-
- if (is_free) {
- struct z3fold_pool *pool = slots_to_pool(slots);
-
- if (zhdr->slots == slots)
- zhdr->slots = NULL;
- kmem_cache_free(pool->c_handle, slots);
- }
-}
-
-/* Initializes the z3fold header of a newly allocated z3fold page */
-static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
- struct z3fold_pool *pool, gfp_t gfp)
-{
- struct z3fold_header *zhdr = page_address(page);
- struct z3fold_buddy_slots *slots;
-
- clear_bit(PAGE_HEADLESS, &page->private);
- clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
- clear_bit(NEEDS_COMPACTING, &page->private);
- clear_bit(PAGE_STALE, &page->private);
- clear_bit(PAGE_CLAIMED, &page->private);
- clear_bit(PAGE_MIGRATED, &page->private);
- if (headless)
- return zhdr;
-
- slots = alloc_slots(pool, gfp);
- if (!slots)
- return NULL;
-
- memset(zhdr, 0, sizeof(*zhdr));
- spin_lock_init(&zhdr->page_lock);
- kref_init(&zhdr->refcount);
- zhdr->cpu = -1;
- zhdr->slots = slots;
- zhdr->pool = pool;
- INIT_LIST_HEAD(&zhdr->buddy);
- INIT_WORK(&zhdr->work, compact_page_work);
- return zhdr;
-}
-
-/* Resets the struct page fields and frees the page */
-static void free_z3fold_page(struct page *page, bool headless)
-{
- if (!headless) {
- lock_page(page);
- __ClearPageMovable(page);
- unlock_page(page);
- }
- __free_page(page);
-}
-
-/* Helper function to build the index */
-static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
-{
- return (bud + zhdr->first_num) & BUDDY_MASK;
-}
-
-/*
- * Encodes the handle of a particular buddy within a z3fold page.
- * Zhdr->page_lock should be held as this function accesses first_num
- * if bud != HEADLESS.
- */
-static unsigned long __encode_handle(struct z3fold_header *zhdr,
- struct z3fold_buddy_slots *slots,
- enum buddy bud)
-{
- unsigned long h = (unsigned long)zhdr;
- int idx = 0;
-
- /*
- * For a headless page, its handle is its pointer with the extra
- * PAGE_HEADLESS bit set
- */
- if (bud == HEADLESS)
- return h | (1 << PAGE_HEADLESS);
-
- /* otherwise, return pointer to encoded handle */
- idx = __idx(zhdr, bud);
- h += idx;
- if (bud == LAST)
- h |= (zhdr->last_chunks << BUDDY_SHIFT);
-
- write_lock(&slots->lock);
- slots->slot[idx] = h;
- write_unlock(&slots->lock);
- return (unsigned long)&slots->slot[idx];
-}
-
-static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
-{
- return __encode_handle(zhdr, zhdr->slots, bud);
-}
-
-/* only for LAST bud, returns zero otherwise */
-static unsigned short handle_to_chunks(unsigned long handle)
-{
- struct z3fold_buddy_slots *slots = handle_to_slots(handle);
- unsigned long addr;
-
- read_lock(&slots->lock);
- addr = *(unsigned long *)handle;
- read_unlock(&slots->lock);
- return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
-}
-
-/*
- * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
- * but that doesn't matter. because the masking will result in the
- * correct buddy number.
- */
-static enum buddy handle_to_buddy(unsigned long handle)
-{
- struct z3fold_header *zhdr;
- struct z3fold_buddy_slots *slots = handle_to_slots(handle);
- unsigned long addr;
-
- read_lock(&slots->lock);
- WARN_ON(handle & (1 << PAGE_HEADLESS));
- addr = *(unsigned long *)handle;
- read_unlock(&slots->lock);
- zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
- return (addr - zhdr->first_num) & BUDDY_MASK;
-}
-
-static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
-{
- return zhdr->pool;
-}
-
-static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
-{
- struct page *page = virt_to_page(zhdr);
- struct z3fold_pool *pool = zhdr_to_pool(zhdr);
-
- WARN_ON(!list_empty(&zhdr->buddy));
- set_bit(PAGE_STALE, &page->private);
- clear_bit(NEEDS_COMPACTING, &page->private);
- spin_lock(&pool->lock);
- spin_unlock(&pool->lock);
-
- if (locked)
- z3fold_page_unlock(zhdr);
-
- spin_lock(&pool->stale_lock);
- list_add(&zhdr->buddy, &pool->stale);
- queue_work(pool->release_wq, &pool->work);
- spin_unlock(&pool->stale_lock);
-
- atomic64_dec(&pool->pages_nr);
-}
-
-static void release_z3fold_page_locked(struct kref *ref)
-{
- struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
- refcount);
- WARN_ON(z3fold_page_trylock(zhdr));
- __release_z3fold_page(zhdr, true);
-}
-
-static void release_z3fold_page_locked_list(struct kref *ref)
-{
- struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
- refcount);
- struct z3fold_pool *pool = zhdr_to_pool(zhdr);
-
- spin_lock(&pool->lock);
- list_del_init(&zhdr->buddy);
- spin_unlock(&pool->lock);
-
- WARN_ON(z3fold_page_trylock(zhdr));
- __release_z3fold_page(zhdr, true);
-}
-
-static inline int put_z3fold_locked(struct z3fold_header *zhdr)
-{
- return kref_put(&zhdr->refcount, release_z3fold_page_locked);
-}
-
-static inline int put_z3fold_locked_list(struct z3fold_header *zhdr)
-{
- return kref_put(&zhdr->refcount, release_z3fold_page_locked_list);
-}
-
-static void free_pages_work(struct work_struct *w)
-{
- struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
-
- spin_lock(&pool->stale_lock);
- while (!list_empty(&pool->stale)) {
- struct z3fold_header *zhdr = list_first_entry(&pool->stale,
- struct z3fold_header, buddy);
- struct page *page = virt_to_page(zhdr);
-
- list_del(&zhdr->buddy);
- if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
- continue;
- spin_unlock(&pool->stale_lock);
- cancel_work_sync(&zhdr->work);
- free_z3fold_page(page, false);
- cond_resched();
- spin_lock(&pool->stale_lock);
- }
- spin_unlock(&pool->stale_lock);
-}
-
-/*
- * Returns the number of free chunks in a z3fold page.
- * NB: can't be used with HEADLESS pages.
- */
-static int num_free_chunks(struct z3fold_header *zhdr)
-{
- int nfree;
- /*
- * If there is a middle object, pick up the bigger free space
- * either before or after it. Otherwise just subtract the number
- * of chunks occupied by the first and the last objects.
- */
- if (zhdr->middle_chunks != 0) {
- int nfree_before = zhdr->first_chunks ?
- 0 : zhdr->start_middle - ZHDR_CHUNKS;
- int nfree_after = zhdr->last_chunks ?
- 0 : TOTAL_CHUNKS -
- (zhdr->start_middle + zhdr->middle_chunks);
- nfree = max(nfree_before, nfree_after);
- } else
- nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
- return nfree;
-}
-
-/* Add to the appropriate unbuddied list */
-static inline void add_to_unbuddied(struct z3fold_pool *pool,
- struct z3fold_header *zhdr)
-{
- if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
- zhdr->middle_chunks == 0) {
- struct list_head *unbuddied;
- int freechunks = num_free_chunks(zhdr);
-
- migrate_disable();
- unbuddied = this_cpu_ptr(pool->unbuddied);
- spin_lock(&pool->lock);
- list_add(&zhdr->buddy, &unbuddied[freechunks]);
- spin_unlock(&pool->lock);
- zhdr->cpu = smp_processor_id();
- migrate_enable();
- }
-}
-
-static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks)
-{
- enum buddy bud = HEADLESS;
-
- if (zhdr->middle_chunks) {
- if (!zhdr->first_chunks &&
- chunks <= zhdr->start_middle - ZHDR_CHUNKS)
- bud = FIRST;
- else if (!zhdr->last_chunks)
- bud = LAST;
- } else {
- if (!zhdr->first_chunks)
- bud = FIRST;
- else if (!zhdr->last_chunks)
- bud = LAST;
- else
- bud = MIDDLE;
- }
-
- return bud;
-}
-
-static inline void *mchunk_memmove(struct z3fold_header *zhdr,
- unsigned short dst_chunk)
-{
- void *beg = zhdr;
- return memmove(beg + (dst_chunk << CHUNK_SHIFT),
- beg + (zhdr->start_middle << CHUNK_SHIFT),
- zhdr->middle_chunks << CHUNK_SHIFT);
-}
-
-static inline bool buddy_single(struct z3fold_header *zhdr)
-{
- return !((zhdr->first_chunks && zhdr->middle_chunks) ||
- (zhdr->first_chunks && zhdr->last_chunks) ||
- (zhdr->middle_chunks && zhdr->last_chunks));
-}
-
-static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
-{
- struct z3fold_pool *pool = zhdr_to_pool(zhdr);
- void *p = zhdr;
- unsigned long old_handle = 0;
- size_t sz = 0;
- struct z3fold_header *new_zhdr = NULL;
- int first_idx = __idx(zhdr, FIRST);
- int middle_idx = __idx(zhdr, MIDDLE);
- int last_idx = __idx(zhdr, LAST);
- unsigned short *moved_chunks = NULL;
-
- /*
- * No need to protect slots here -- all the slots are "local" and
- * the page lock is already taken
- */
- if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) {
- p += ZHDR_SIZE_ALIGNED;
- sz = zhdr->first_chunks << CHUNK_SHIFT;
- old_handle = (unsigned long)&zhdr->slots->slot[first_idx];
- moved_chunks = &zhdr->first_chunks;
- } else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) {
- p += zhdr->start_middle << CHUNK_SHIFT;
- sz = zhdr->middle_chunks << CHUNK_SHIFT;
- old_handle = (unsigned long)&zhdr->slots->slot[middle_idx];
- moved_chunks = &zhdr->middle_chunks;
- } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) {
- p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
- sz = zhdr->last_chunks << CHUNK_SHIFT;
- old_handle = (unsigned long)&zhdr->slots->slot[last_idx];
- moved_chunks = &zhdr->last_chunks;
- }
-
- if (sz > 0) {
- enum buddy new_bud = HEADLESS;
- short chunks = size_to_chunks(sz);
- void *q;
-
- new_zhdr = __z3fold_alloc(pool, sz, false);
- if (!new_zhdr)
- return NULL;
-
- if (WARN_ON(new_zhdr == zhdr))
- goto out_fail;
-
- new_bud = get_free_buddy(new_zhdr, chunks);
- q = new_zhdr;
- switch (new_bud) {
- case FIRST:
- new_zhdr->first_chunks = chunks;
- q += ZHDR_SIZE_ALIGNED;
- break;
- case MIDDLE:
- new_zhdr->middle_chunks = chunks;
- new_zhdr->start_middle =
- new_zhdr->first_chunks + ZHDR_CHUNKS;
- q += new_zhdr->start_middle << CHUNK_SHIFT;
- break;
- case LAST:
- new_zhdr->last_chunks = chunks;
- q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT);
- break;
- default:
- goto out_fail;
- }
- new_zhdr->foreign_handles++;
- memcpy(q, p, sz);
- write_lock(&zhdr->slots->lock);
- *(unsigned long *)old_handle = (unsigned long)new_zhdr +
- __idx(new_zhdr, new_bud);
- if (new_bud == LAST)
- *(unsigned long *)old_handle |=
- (new_zhdr->last_chunks << BUDDY_SHIFT);
- write_unlock(&zhdr->slots->lock);
- add_to_unbuddied(pool, new_zhdr);
- z3fold_page_unlock(new_zhdr);
-
- *moved_chunks = 0;
- }
-
- return new_zhdr;
-
-out_fail:
- if (new_zhdr && !put_z3fold_locked(new_zhdr)) {
- add_to_unbuddied(pool, new_zhdr);
- z3fold_page_unlock(new_zhdr);
- }
- return NULL;
-
-}
-
-#define BIG_CHUNK_GAP 3
-/* Has to be called with lock held */
-static int z3fold_compact_page(struct z3fold_header *zhdr)
-{
- struct page *page = virt_to_page(zhdr);
-
- if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
- return 0; /* can't move middle chunk, it's used */
-
- if (unlikely(PageIsolated(page)))
- return 0;
-
- if (zhdr->middle_chunks == 0)
- return 0; /* nothing to compact */
-
- if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
- /* move to the beginning */
- mchunk_memmove(zhdr, ZHDR_CHUNKS);
- zhdr->first_chunks = zhdr->middle_chunks;
- zhdr->middle_chunks = 0;
- zhdr->start_middle = 0;
- zhdr->first_num++;
- return 1;
- }
-
- /*
- * moving data is expensive, so let's only do that if
- * there's substantial gain (at least BIG_CHUNK_GAP chunks)
- */
- if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
- zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
- BIG_CHUNK_GAP) {
- mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS);
- zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
- return 1;
- } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
- TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle
- + zhdr->middle_chunks) >=
- BIG_CHUNK_GAP) {
- unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks -
- zhdr->middle_chunks;
- mchunk_memmove(zhdr, new_start);
- zhdr->start_middle = new_start;
- return 1;
- }
-
- return 0;
-}
-
-static void do_compact_page(struct z3fold_header *zhdr, bool locked)
-{
- struct z3fold_pool *pool = zhdr_to_pool(zhdr);
- struct page *page;
-
- page = virt_to_page(zhdr);
- if (locked)
- WARN_ON(z3fold_page_trylock(zhdr));
- else
- z3fold_page_lock(zhdr);
- if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
- z3fold_page_unlock(zhdr);
- return;
- }
- spin_lock(&pool->lock);
- list_del_init(&zhdr->buddy);
- spin_unlock(&pool->lock);
-
- if (put_z3fold_locked(zhdr))
- return;
-
- if (test_bit(PAGE_STALE, &page->private) ||
- test_and_set_bit(PAGE_CLAIMED, &page->private)) {
- z3fold_page_unlock(zhdr);
- return;
- }
-
- if (!zhdr->foreign_handles && buddy_single(zhdr) &&
- zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
- if (!put_z3fold_locked(zhdr)) {
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
- }
- return;
- }
-
- z3fold_compact_page(zhdr);
- add_to_unbuddied(pool, zhdr);
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
-}
-
-static void compact_page_work(struct work_struct *w)
-{
- struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
- work);
-
- do_compact_page(zhdr, false);
-}
-
-/* returns _locked_ z3fold page header or NULL */
-static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
- size_t size, bool can_sleep)
-{
- struct z3fold_header *zhdr = NULL;
- struct page *page;
- struct list_head *unbuddied;
- int chunks = size_to_chunks(size), i;
-
-lookup:
- migrate_disable();
- /* First, try to find an unbuddied z3fold page. */
- unbuddied = this_cpu_ptr(pool->unbuddied);
- for_each_unbuddied_list(i, chunks) {
- struct list_head *l = &unbuddied[i];
-
- zhdr = list_first_entry_or_null(READ_ONCE(l),
- struct z3fold_header, buddy);
-
- if (!zhdr)
- continue;
-
- /* Re-check under lock. */
- spin_lock(&pool->lock);
- if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
- struct z3fold_header, buddy)) ||
- !z3fold_page_trylock(zhdr)) {
- spin_unlock(&pool->lock);
- zhdr = NULL;
- migrate_enable();
- if (can_sleep)
- cond_resched();
- goto lookup;
- }
- list_del_init(&zhdr->buddy);
- zhdr->cpu = -1;
- spin_unlock(&pool->lock);
-
- page = virt_to_page(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private) ||
- test_bit(PAGE_CLAIMED, &page->private)) {
- z3fold_page_unlock(zhdr);
- zhdr = NULL;
- migrate_enable();
- if (can_sleep)
- cond_resched();
- goto lookup;
- }
-
- /*
- * this page could not be removed from its unbuddied
- * list while pool lock was held, and then we've taken
- * page lock so kref_put could not be called before
- * we got here, so it's safe to just call kref_get()
- */
- kref_get(&zhdr->refcount);
- break;
- }
- migrate_enable();
-
- if (!zhdr) {
- int cpu;
-
- /* look for _exact_ match on other cpus' lists */
- for_each_online_cpu(cpu) {
- struct list_head *l;
-
- unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
- spin_lock(&pool->lock);
- l = &unbuddied[chunks];
-
- zhdr = list_first_entry_or_null(READ_ONCE(l),
- struct z3fold_header, buddy);
-
- if (!zhdr || !z3fold_page_trylock(zhdr)) {
- spin_unlock(&pool->lock);
- zhdr = NULL;
- continue;
- }
- list_del_init(&zhdr->buddy);
- zhdr->cpu = -1;
- spin_unlock(&pool->lock);
-
- page = virt_to_page(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private) ||
- test_bit(PAGE_CLAIMED, &page->private)) {
- z3fold_page_unlock(zhdr);
- zhdr = NULL;
- if (can_sleep)
- cond_resched();
- continue;
- }
- kref_get(&zhdr->refcount);
- break;
- }
- }
-
- if (zhdr && !zhdr->slots) {
- zhdr->slots = alloc_slots(pool, GFP_ATOMIC);
- if (!zhdr->slots)
- goto out_fail;
- }
- return zhdr;
-
-out_fail:
- if (!put_z3fold_locked(zhdr)) {
- add_to_unbuddied(pool, zhdr);
- z3fold_page_unlock(zhdr);
- }
- return NULL;
-}
-
-/*
- * API Functions
- */
-
-/**
- * z3fold_create_pool() - create a new z3fold pool
- * @name: pool name
- * @gfp: gfp flags when allocating the z3fold pool structure
- *
- * Return: pointer to the new z3fold pool or NULL if the metadata allocation
- * failed.
- */
-static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp)
-{
- struct z3fold_pool *pool = NULL;
- int i, cpu;
-
- pool = kzalloc(sizeof(struct z3fold_pool), gfp);
- if (!pool)
- goto out;
- pool->c_handle = kmem_cache_create("z3fold_handle",
- sizeof(struct z3fold_buddy_slots),
- SLOTS_ALIGN, 0, NULL);
- if (!pool->c_handle)
- goto out_c;
- spin_lock_init(&pool->lock);
- spin_lock_init(&pool->stale_lock);
- pool->unbuddied = __alloc_percpu(sizeof(struct list_head) * NCHUNKS,
- __alignof__(struct list_head));
- if (!pool->unbuddied)
- goto out_pool;
- for_each_possible_cpu(cpu) {
- struct list_head *unbuddied =
- per_cpu_ptr(pool->unbuddied, cpu);
- for_each_unbuddied_list(i, 0)
- INIT_LIST_HEAD(&unbuddied[i]);
- }
- INIT_LIST_HEAD(&pool->stale);
- atomic64_set(&pool->pages_nr, 0);
- pool->name = name;
- pool->compact_wq = create_singlethread_workqueue(pool->name);
- if (!pool->compact_wq)
- goto out_unbuddied;
- pool->release_wq = create_singlethread_workqueue(pool->name);
- if (!pool->release_wq)
- goto out_wq;
- INIT_WORK(&pool->work, free_pages_work);
- return pool;
-
-out_wq:
- destroy_workqueue(pool->compact_wq);
-out_unbuddied:
- free_percpu(pool->unbuddied);
-out_pool:
- kmem_cache_destroy(pool->c_handle);
-out_c:
- kfree(pool);
-out:
- return NULL;
-}
-
-/**
- * z3fold_destroy_pool() - destroys an existing z3fold pool
- * @pool: the z3fold pool to be destroyed
- *
- * The pool should be emptied before this function is called.
- */
-static void z3fold_destroy_pool(struct z3fold_pool *pool)
-{
- kmem_cache_destroy(pool->c_handle);
-
- /*
- * We need to destroy pool->compact_wq before pool->release_wq,
- * as any pending work on pool->compact_wq will call
- * queue_work(pool->release_wq, &pool->work).
- *
- * There are still outstanding pages until both workqueues are drained,
- * so we cannot unregister migration until then.
- */
-
- destroy_workqueue(pool->compact_wq);
- destroy_workqueue(pool->release_wq);
- free_percpu(pool->unbuddied);
- kfree(pool);
-}
-
-static const struct movable_operations z3fold_mops;
-
-/**
- * z3fold_alloc() - allocates a region of a given size
- * @pool: z3fold pool from which to allocate
- * @size: size in bytes of the desired allocation
- * @gfp: gfp flags used if the pool needs to grow
- * @handle: handle of the new allocation
- *
- * This function will attempt to find a free region in the pool large enough to
- * satisfy the allocation request. A search of the unbuddied lists is
- * performed first. If no suitable free region is found, then a new page is
- * allocated and added to the pool to satisfy the request.
- *
- * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
- * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
- * a new page.
- */
-static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- int chunks = size_to_chunks(size);
- struct z3fold_header *zhdr = NULL;
- struct page *page = NULL;
- enum buddy bud;
- bool can_sleep = gfpflags_allow_blocking(gfp);
-
- if (!size || (gfp & __GFP_HIGHMEM))
- return -EINVAL;
-
- if (size > PAGE_SIZE)
- return -ENOSPC;
-
- if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
- bud = HEADLESS;
- else {
-retry:
- zhdr = __z3fold_alloc(pool, size, can_sleep);
- if (zhdr) {
- bud = get_free_buddy(zhdr, chunks);
- if (bud == HEADLESS) {
- if (!put_z3fold_locked(zhdr))
- z3fold_page_unlock(zhdr);
- pr_err("No free chunks in unbuddied\n");
- WARN_ON(1);
- goto retry;
- }
- page = virt_to_page(zhdr);
- goto found;
- }
- bud = FIRST;
- }
-
- page = alloc_page(gfp);
- if (!page)
- return -ENOMEM;
-
- zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
- if (!zhdr) {
- __free_page(page);
- return -ENOMEM;
- }
- atomic64_inc(&pool->pages_nr);
-
- if (bud == HEADLESS) {
- set_bit(PAGE_HEADLESS, &page->private);
- goto headless;
- }
- if (can_sleep) {
- lock_page(page);
- __SetPageMovable(page, &z3fold_mops);
- unlock_page(page);
- } else {
- WARN_ON(!trylock_page(page));
- __SetPageMovable(page, &z3fold_mops);
- unlock_page(page);
- }
- z3fold_page_lock(zhdr);
-
-found:
- if (bud == FIRST)
- zhdr->first_chunks = chunks;
- else if (bud == LAST)
- zhdr->last_chunks = chunks;
- else {
- zhdr->middle_chunks = chunks;
- zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
- }
- add_to_unbuddied(pool, zhdr);
-
-headless:
- spin_lock(&pool->lock);
- *handle = encode_handle(zhdr, bud);
- spin_unlock(&pool->lock);
- if (bud != HEADLESS)
- z3fold_page_unlock(zhdr);
-
- return 0;
-}
-
-/**
- * z3fold_free() - frees the allocation associated with the given handle
- * @pool: pool in which the allocation resided
- * @handle: handle associated with the allocation returned by z3fold_alloc()
- *
- * In the case that the z3fold page in which the allocation resides is under
- * reclaim, as indicated by the PAGE_CLAIMED flag being set, this function
- * only sets the first|middle|last_chunks to 0. The page is actually freed
- * once all buddies are evicted (see z3fold_reclaim_page() below).
- */
-static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
-{
- struct z3fold_header *zhdr;
- struct page *page;
- enum buddy bud;
- bool page_claimed;
-
- zhdr = get_z3fold_header(handle);
- page = virt_to_page(zhdr);
- page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
-
- if (test_bit(PAGE_HEADLESS, &page->private)) {
- /* if a headless page is under reclaim, just leave.
- * NB: we use test_and_set_bit for a reason: if the bit
- * has not been set before, we release this page
- * immediately so we don't care about its value any more.
- */
- if (!page_claimed) {
- put_z3fold_header(zhdr);
- free_z3fold_page(page, true);
- atomic64_dec(&pool->pages_nr);
- }
- return;
- }
-
- /* Non-headless case */
- bud = handle_to_buddy(handle);
-
- switch (bud) {
- case FIRST:
- zhdr->first_chunks = 0;
- break;
- case MIDDLE:
- zhdr->middle_chunks = 0;
- break;
- case LAST:
- zhdr->last_chunks = 0;
- break;
- default:
- pr_err("%s: unknown bud %d\n", __func__, bud);
- WARN_ON(1);
- put_z3fold_header(zhdr);
- return;
- }
-
- if (!page_claimed)
- free_handle(handle, zhdr);
- if (put_z3fold_locked_list(zhdr))
- return;
- if (page_claimed) {
- /* the page has not been claimed by us */
- put_z3fold_header(zhdr);
- return;
- }
- if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
- clear_bit(PAGE_CLAIMED, &page->private);
- put_z3fold_header(zhdr);
- return;
- }
- if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
- zhdr->cpu = -1;
- kref_get(&zhdr->refcount);
- clear_bit(PAGE_CLAIMED, &page->private);
- do_compact_page(zhdr, true);
- return;
- }
- kref_get(&zhdr->refcount);
- clear_bit(PAGE_CLAIMED, &page->private);
- queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
- put_z3fold_header(zhdr);
-}
-
-/**
- * z3fold_map() - maps the allocation associated with the given handle
- * @pool: pool in which the allocation resides
- * @handle: handle associated with the allocation to be mapped
- *
- * Extracts the buddy number from handle and constructs the pointer to the
- * correct starting chunk within the page.
- *
- * Returns: a pointer to the mapped allocation
- */
-static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
-{
- struct z3fold_header *zhdr;
- struct page *page;
- void *addr;
- enum buddy buddy;
-
- zhdr = get_z3fold_header(handle);
- addr = zhdr;
- page = virt_to_page(zhdr);
-
- if (test_bit(PAGE_HEADLESS, &page->private))
- goto out;
-
- buddy = handle_to_buddy(handle);
- switch (buddy) {
- case FIRST:
- addr += ZHDR_SIZE_ALIGNED;
- break;
- case MIDDLE:
- addr += zhdr->start_middle << CHUNK_SHIFT;
- set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
- break;
- case LAST:
- addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT);
- break;
- default:
- pr_err("unknown buddy id %d\n", buddy);
- WARN_ON(1);
- addr = NULL;
- break;
- }
-
- if (addr)
- zhdr->mapped_count++;
-out:
- put_z3fold_header(zhdr);
- return addr;
-}
-
-/**
- * z3fold_unmap() - unmaps the allocation associated with the given handle
- * @pool: pool in which the allocation resides
- * @handle: handle associated with the allocation to be unmapped
- */
-static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
-{
- struct z3fold_header *zhdr;
- struct page *page;
- enum buddy buddy;
-
- zhdr = get_z3fold_header(handle);
- page = virt_to_page(zhdr);
-
- if (test_bit(PAGE_HEADLESS, &page->private))
- return;
-
- buddy = handle_to_buddy(handle);
- if (buddy == MIDDLE)
- clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
- zhdr->mapped_count--;
- put_z3fold_header(zhdr);
-}
-
-/**
- * z3fold_get_pool_pages() - gets the z3fold pool size in pages
- * @pool: pool whose size is being queried
- *
- * Returns: size in pages of the given pool.
- */
-static u64 z3fold_get_pool_pages(struct z3fold_pool *pool)
-{
- return atomic64_read(&pool->pages_nr);
-}
-
-static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
-{
- struct z3fold_header *zhdr;
- struct z3fold_pool *pool;
-
- VM_BUG_ON_PAGE(PageIsolated(page), page);
-
- if (test_bit(PAGE_HEADLESS, &page->private))
- return false;
-
- zhdr = page_address(page);
- z3fold_page_lock(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private) ||
- test_bit(PAGE_STALE, &page->private))
- goto out;
-
- if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
- goto out;
-
- if (test_and_set_bit(PAGE_CLAIMED, &page->private))
- goto out;
- pool = zhdr_to_pool(zhdr);
- spin_lock(&pool->lock);
- if (!list_empty(&zhdr->buddy))
- list_del_init(&zhdr->buddy);
- spin_unlock(&pool->lock);
-
- kref_get(&zhdr->refcount);
- z3fold_page_unlock(zhdr);
- return true;
-
-out:
- z3fold_page_unlock(zhdr);
- return false;
-}
-
-static int z3fold_page_migrate(struct page *newpage, struct page *page,
- enum migrate_mode mode)
-{
- struct z3fold_header *zhdr, *new_zhdr;
- struct z3fold_pool *pool;
-
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
- VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
- VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-
- zhdr = page_address(page);
- pool = zhdr_to_pool(zhdr);
-
- if (!z3fold_page_trylock(zhdr))
- return -EAGAIN;
- if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
- return -EBUSY;
- }
- if (work_pending(&zhdr->work)) {
- z3fold_page_unlock(zhdr);
- return -EAGAIN;
- }
- new_zhdr = page_address(newpage);
- memcpy(new_zhdr, zhdr, PAGE_SIZE);
- newpage->private = page->private;
- set_bit(PAGE_MIGRATED, &page->private);
- z3fold_page_unlock(zhdr);
- spin_lock_init(&new_zhdr->page_lock);
- INIT_WORK(&new_zhdr->work, compact_page_work);
- /*
- * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
- * so we only have to reinitialize it.
- */
- INIT_LIST_HEAD(&new_zhdr->buddy);
- __ClearPageMovable(page);
-
- get_page(newpage);
- z3fold_page_lock(new_zhdr);
- if (new_zhdr->first_chunks)
- encode_handle(new_zhdr, FIRST);
- if (new_zhdr->last_chunks)
- encode_handle(new_zhdr, LAST);
- if (new_zhdr->middle_chunks)
- encode_handle(new_zhdr, MIDDLE);
- set_bit(NEEDS_COMPACTING, &newpage->private);
- new_zhdr->cpu = smp_processor_id();
- __SetPageMovable(newpage, &z3fold_mops);
- z3fold_page_unlock(new_zhdr);
-
- queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
-
- /* PAGE_CLAIMED and PAGE_MIGRATED are cleared now. */
- page->private = 0;
- put_page(page);
- return 0;
-}
-
-static void z3fold_page_putback(struct page *page)
-{
- struct z3fold_header *zhdr;
- struct z3fold_pool *pool;
-
- zhdr = page_address(page);
- pool = zhdr_to_pool(zhdr);
-
- z3fold_page_lock(zhdr);
- if (!list_empty(&zhdr->buddy))
- list_del_init(&zhdr->buddy);
- INIT_LIST_HEAD(&page->lru);
- if (put_z3fold_locked(zhdr))
- return;
- if (list_empty(&zhdr->buddy))
- add_to_unbuddied(pool, zhdr);
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
-}
-
-static const struct movable_operations z3fold_mops = {
- .isolate_page = z3fold_page_isolate,
- .migrate_page = z3fold_page_migrate,
- .putback_page = z3fold_page_putback,
-};
-
-/*****************
- * zpool
- ****************/
-
-static void *z3fold_zpool_create(const char *name, gfp_t gfp)
-{
- return z3fold_create_pool(name, gfp);
-}
-
-static void z3fold_zpool_destroy(void *pool)
-{
- z3fold_destroy_pool(pool);
-}
-
-static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- return z3fold_alloc(pool, size, gfp, handle);
-}
-static void z3fold_zpool_free(void *pool, unsigned long handle)
-{
- z3fold_free(pool, handle);
-}
-
-static void *z3fold_zpool_map(void *pool, unsigned long handle,
- enum zpool_mapmode mm)
-{
- return z3fold_map(pool, handle);
-}
-static void z3fold_zpool_unmap(void *pool, unsigned long handle)
-{
- z3fold_unmap(pool, handle);
-}
-
-static u64 z3fold_zpool_total_pages(void *pool)
-{
- return z3fold_get_pool_pages(pool);
-}
-
-static struct zpool_driver z3fold_zpool_driver = {
- .type = "z3fold",
- .sleep_mapped = true,
- .owner = THIS_MODULE,
- .create = z3fold_zpool_create,
- .destroy = z3fold_zpool_destroy,
- .malloc = z3fold_zpool_malloc,
- .free = z3fold_zpool_free,
- .map = z3fold_zpool_map,
- .unmap = z3fold_zpool_unmap,
- .total_pages = z3fold_zpool_total_pages,
-};
-
-MODULE_ALIAS("zpool-z3fold");
-
-static int __init init_z3fold(void)
-{
- /*
- * Make sure the z3fold header is not larger than the page size and
- * that there is space remaining for its buddies.
- */
- BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE);
- zpool_register_driver(&z3fold_zpool_driver);
-
- return 0;
-}
-
-static void __exit exit_z3fold(void)
-{
- zpool_unregister_driver(&z3fold_zpool_driver);
-}
-
-module_init(init_z3fold);
-module_exit(exit_z3fold);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
-MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");
diff --git a/mm/zbud.c b/mm/zbud.c
deleted file mode 100644
index e9836fff9438..000000000000
--- a/mm/zbud.c
+++ /dev/null
@@ -1,455 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * zbud.c
- *
- * Copyright (C) 2013, Seth Jennings, IBM
- *
- * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
- *
- * zbud is a special purpose allocator for storing compressed pages. Contrary
- * to what its name may suggest, zbud is not a buddy allocator, but rather an
- * allocator that "buddies" two compressed pages together in a single memory
- * page.
- *
- * While this design limits storage density, it has simple and deterministic
- * reclaim properties that make it preferable to a higher density approach when
- * reclaim will be used.
- *
- * zbud works by storing compressed pages, or "zpages", together in pairs in a
- * single memory page called a "zbud page". The first buddy is "left
- * justified" at the beginning of the zbud page, and the last buddy is "right
- * justified" at the end of the zbud page. The benefit is that if either
- * buddy is freed, the freed buddy space, coalesced with whatever slack space
- * that existed between the buddies, results in the largest possible free region
- * within the zbud page.
- *
- * zbud also provides an attractive lower bound on density. The ratio of zpages
- * to zbud pages can not be less than 1. This ensures that zbud can never "do
- * harm" by using more pages to store zpages than the uncompressed zpages would
- * have used on their own.
- *
- * zbud pages are divided into "chunks". The size of the chunks is fixed at
- * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages
- * into chunks allows organizing unbuddied zbud pages into a manageable number
- * of unbuddied lists according to the number of free chunks available in the
- * zbud page.
- *
- * The zbud API differs from that of conventional allocators in that the
- * allocation function, zbud_alloc(), returns an opaque handle to the user,
- * not a dereferenceable pointer. The user must map the handle using
- * zbud_map() in order to get a usable pointer by which to access the
- * allocation data and unmap the handle with zbud_unmap() when operations
- * on the allocation data are complete.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/atomic.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/preempt.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/zpool.h>
-
-/*****************
- * Structures
-*****************/
-/*
- * NCHUNKS_ORDER determines the internal allocation granularity, effectively
- * adjusting internal fragmentation. It also determines the number of
- * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
- * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk
- * of each allocated page is occupied by the zbud header, NCHUNKS works out to
- * 63, which is the maximum number of free chunks in a zbud page; there are
- * likewise 63 freelists per pool.
- */
-#define NCHUNKS_ORDER 6
-
-#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
-#define CHUNK_SIZE (1 << CHUNK_SHIFT)
-#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
-#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
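To make the chunk arithmetic above concrete, the short self-contained sketch below reproduces the constants and the size_to_chunks() rounding for an assumed 4 KiB page (PAGE_SHIFT == 12); with NCHUNKS_ORDER == 6 this yields 64-byte chunks and 63 usable chunks per zbud page, matching the comment above.

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT 12                           /* assumption: 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

#define NCHUNKS_ORDER 6
#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)                /* 64-byte chunks */
#define CHUNK_SIZE  (1UL << CHUNK_SHIFT)
#define ZHDR_SIZE_ALIGNED CHUNK_SIZE                            /* header takes one chunk */
#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)

/* same rounding as zbud's size_to_chunks(): round the request up to whole chunks */
static unsigned long size_to_chunks(unsigned long size)
{
        return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}

int main(void)
{
        printf("chunk size %lu bytes, %lu free chunks per zbud page\n",
               CHUNK_SIZE, NCHUNKS);            /* 64 bytes, 63 chunks */
        assert(size_to_chunks(1) == 1);
        assert(size_to_chunks(CHUNK_SIZE) == 1);
        assert(size_to_chunks(CHUNK_SIZE + 1) == 2);
        return 0;
}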
-
-struct zbud_pool;
-
-/**
- * struct zbud_pool - stores metadata for each zbud pool
- * @lock: protects all pool fields and first|last_chunk fields of any
- * zbud page in the pool
- * @unbuddied: array of lists tracking zbud pages that only contain one buddy;
- * the lists each zbud page is added to depends on the size of
- * its free region.
- * @buddied: list tracking the zbud pages that contain two buddies;
- * these zbud pages are full
- * @pages_nr: number of zbud pages in the pool.
- *
- * This structure is allocated at pool creation time and maintains metadata
- * pertaining to a particular zbud pool.
- */
-struct zbud_pool {
- spinlock_t lock;
- union {
- /*
- * Reuse unbuddied[0] as buddied on the ground that
- * unbuddied[0] is unused.
- */
- struct list_head buddied;
- struct list_head unbuddied[NCHUNKS];
- };
- u64 pages_nr;
-};
-
-/*
- * struct zbud_header - zbud page metadata occupying the first chunk of each
- * zbud page.
- * @buddy: links the zbud page into the unbuddied/buddied lists in the pool
- * @first_chunks: the size of the first buddy in chunks, 0 if free
- * @last_chunks: the size of the last buddy in chunks, 0 if free
- */
-struct zbud_header {
- struct list_head buddy;
- unsigned int first_chunks;
- unsigned int last_chunks;
-};
-
-/*****************
- * Helpers
-*****************/
-/* Just to make the code easier to read */
-enum buddy {
- FIRST,
- LAST
-};
-
-/* Converts an allocation size in bytes to size in zbud chunks */
-static int size_to_chunks(size_t size)
-{
- return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
-}
-
-#define for_each_unbuddied_list(_iter, _begin) \
- for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
-
-/* Initializes the zbud header of a newly allocated zbud page */
-static struct zbud_header *init_zbud_page(struct page *page)
-{
- struct zbud_header *zhdr = page_address(page);
- zhdr->first_chunks = 0;
- zhdr->last_chunks = 0;
- INIT_LIST_HEAD(&zhdr->buddy);
- return zhdr;
-}
-
-/* Resets the struct page fields and frees the page */
-static void free_zbud_page(struct zbud_header *zhdr)
-{
- __free_page(virt_to_page(zhdr));
-}
-
-/*
- * Encodes the handle of a particular buddy within a zbud page
- * Pool lock should be held as this function accesses first|last_chunks
- */
-static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
-{
- unsigned long handle;
-
- /*
- * For now, the encoded handle is actually just the pointer to the data
- * but this might not always be the case. A little information hiding.
- * Add CHUNK_SIZE to the handle if it is the first allocation to jump
- * over the zbud header in the first chunk.
- */
- handle = (unsigned long)zhdr;
- if (bud == FIRST)
- /* skip over zbud header */
- handle += ZHDR_SIZE_ALIGNED;
- else /* bud == LAST */
- handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
- return handle;
-}
-
-/* Returns the zbud page where a given handle is stored */
-static struct zbud_header *handle_to_zbud_header(unsigned long handle)
-{
- return (struct zbud_header *)(handle & PAGE_MASK);
-}
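The encode_handle()/handle_to_zbud_header() pair above depends only on the zbud page being page aligned, with the FIRST buddy placed just past the one-chunk header and the LAST buddy right-justified at the end of the page. A minimal userspace sketch of the same round trip follows; an ordinary page-aligned buffer stands in for the zbud page, and the constants repeat the 4 KiB-page assumption from the previous example.

#include <assert.h>
#include <stdlib.h>

#define PAGE_SHIFT 12                           /* assumption: 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))
#define CHUNK_SHIFT (PAGE_SHIFT - 6)
#define ZHDR_SIZE_ALIGNED (1UL << CHUNK_SHIFT)  /* one 64-byte chunk for the header */

enum buddy { FIRST, LAST };

/* same scheme as zbud: FIRST skips the header, LAST is right-justified */
static unsigned long encode_handle(void *zhdr, enum buddy bud, unsigned long last_chunks)
{
        unsigned long handle = (unsigned long)zhdr;

        if (bud == FIRST)
                handle += ZHDR_SIZE_ALIGNED;
        else
                handle += PAGE_SIZE - (last_chunks << CHUNK_SHIFT);
        return handle;
}

static void *handle_to_zbud_header(unsigned long handle)
{
        return (void *)(handle & PAGE_MASK);    /* strip the in-page offset */
}

int main(void)
{
        void *page = aligned_alloc(PAGE_SIZE, PAGE_SIZE);       /* stand-in for a zbud page */
        unsigned long first, last;

        assert(page);
        first = encode_handle(page, FIRST, 0);
        last = encode_handle(page, LAST, 3);
        assert(handle_to_zbud_header(first) == page);
        assert(handle_to_zbud_header(last) == page);
        /* the "first buddy" test used by zbud_free(): handle minus header is page aligned */
        assert(((first - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK) == 0);
        assert(((last - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK) != 0);
        free(page);
        return 0;
}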
-
-/* Returns the number of free chunks in a zbud page */
-static int num_free_chunks(struct zbud_header *zhdr)
-{
- /*
- * Rather than branch for different situations, just use the fact that
- * free buddies have a length of zero to simplify everything.
- */
- return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
-}
-
-/*****************
- * API Functions
-*****************/
-/**
- * zbud_create_pool() - create a new zbud pool
- * @gfp: gfp flags when allocating the zbud pool structure
- *
- * Return: pointer to the new zbud pool or NULL if the metadata allocation
- * failed.
- */
-static struct zbud_pool *zbud_create_pool(gfp_t gfp)
-{
- struct zbud_pool *pool;
- int i;
-
- pool = kzalloc(sizeof(struct zbud_pool), gfp);
- if (!pool)
- return NULL;
- spin_lock_init(&pool->lock);
- for_each_unbuddied_list(i, 0)
- INIT_LIST_HEAD(&pool->unbuddied[i]);
- INIT_LIST_HEAD(&pool->buddied);
- pool->pages_nr = 0;
- return pool;
-}
-
-/**
- * zbud_destroy_pool() - destroys an existing zbud pool
- * @pool: the zbud pool to be destroyed
- *
- * The pool should be emptied before this function is called.
- */
-static void zbud_destroy_pool(struct zbud_pool *pool)
-{
- kfree(pool);
-}
-
-/**
- * zbud_alloc() - allocates a region of a given size
- * @pool: zbud pool from which to allocate
- * @size: size in bytes of the desired allocation
- * @gfp: gfp flags used if the pool needs to grow
- * @handle: handle of the new allocation
- *
- * This function will attempt to find a free region in the pool large enough to
- * satisfy the allocation request. A search of the unbuddied lists is
- * performed first. If no suitable free region is found, then a new page is
- * allocated and added to the pool to satisfy the request.
- *
- * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
- * as zbud pool pages.
- *
- * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
- * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
- * a new page.
- */
-static int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- int chunks, i, freechunks;
- struct zbud_header *zhdr = NULL;
- enum buddy bud;
- struct page *page;
-
- if (!size || (gfp & __GFP_HIGHMEM))
- return -EINVAL;
- if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
- return -ENOSPC;
- chunks = size_to_chunks(size);
- spin_lock(&pool->lock);
-
- /* First, try to find an unbuddied zbud page. */
- for_each_unbuddied_list(i, chunks) {
- if (!list_empty(&pool->unbuddied[i])) {
- zhdr = list_first_entry(&pool->unbuddied[i],
- struct zbud_header, buddy);
- list_del(&zhdr->buddy);
- if (zhdr->first_chunks == 0)
- bud = FIRST;
- else
- bud = LAST;
- goto found;
- }
- }
-
- /* Couldn't find unbuddied zbud page, create new one */
- spin_unlock(&pool->lock);
- page = alloc_page(gfp);
- if (!page)
- return -ENOMEM;
- spin_lock(&pool->lock);
- pool->pages_nr++;
- zhdr = init_zbud_page(page);
- bud = FIRST;
-
-found:
- if (bud == FIRST)
- zhdr->first_chunks = chunks;
- else
- zhdr->last_chunks = chunks;
-
- if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) {
- /* Add to unbuddied list */
- freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
- } else {
- /* Add to buddied list */
- list_add(&zhdr->buddy, &pool->buddied);
- }
-
- *handle = encode_handle(zhdr, bud);
- spin_unlock(&pool->lock);
-
- return 0;
-}
-
-/**
- * zbud_free() - frees the allocation associated with the given handle
- * @pool: pool in which the allocation resided
- * @handle: handle associated with the allocation returned by zbud_alloc()
- */
-static void zbud_free(struct zbud_pool *pool, unsigned long handle)
-{
- struct zbud_header *zhdr;
- int freechunks;
-
- spin_lock(&pool->lock);
- zhdr = handle_to_zbud_header(handle);
-
- /* If first buddy, handle will be page aligned */
- if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
- zhdr->last_chunks = 0;
- else
- zhdr->first_chunks = 0;
-
- /* Remove from existing buddy list */
- list_del(&zhdr->buddy);
-
- if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
- /* zbud page is empty, free */
- free_zbud_page(zhdr);
- pool->pages_nr--;
- } else {
- /* Add to unbuddied list */
- freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
- }
-
- spin_unlock(&pool->lock);
-}
-
-/**
- * zbud_map() - maps the allocation associated with the given handle
- * @pool: pool in which the allocation resides
- * @handle: handle associated with the allocation to be mapped
- *
- * While trivial for zbud, the mapping functions for others allocators
- * implementing this allocation API could have more complex information encoded
- * in the handle and could create temporary mappings to make the data
- * accessible to the user.
- *
- * Returns: a pointer to the mapped allocation
- */
-static void *zbud_map(struct zbud_pool *pool, unsigned long handle)
-{
- return (void *)(handle);
-}
-
-/**
- * zbud_unmap() - unmaps the allocation associated with the given handle
- * @pool: pool in which the allocation resides
- * @handle: handle associated with the allocation to be unmapped
- */
-static void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
-{
-}
-
-/**
- * zbud_get_pool_pages() - gets the zbud pool size in pages
- * @pool: pool whose size is being queried
- *
- * Returns: size in pages of the given pool. The pool lock need not be
- * taken to access pages_nr.
- */
-static u64 zbud_get_pool_pages(struct zbud_pool *pool)
-{
- return pool->pages_nr;
-}
-
-/*****************
- * zpool
- ****************/
-
-static void *zbud_zpool_create(const char *name, gfp_t gfp)
-{
- return zbud_create_pool(gfp);
-}
-
-static void zbud_zpool_destroy(void *pool)
-{
- zbud_destroy_pool(pool);
-}
-
-static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- return zbud_alloc(pool, size, gfp, handle);
-}
-static void zbud_zpool_free(void *pool, unsigned long handle)
-{
- zbud_free(pool, handle);
-}
-
-static void *zbud_zpool_map(void *pool, unsigned long handle,
- enum zpool_mapmode mm)
-{
- return zbud_map(pool, handle);
-}
-static void zbud_zpool_unmap(void *pool, unsigned long handle)
-{
- zbud_unmap(pool, handle);
-}
-
-static u64 zbud_zpool_total_pages(void *pool)
-{
- return zbud_get_pool_pages(pool);
-}
-
-static struct zpool_driver zbud_zpool_driver = {
- .type = "zbud",
- .sleep_mapped = true,
- .owner = THIS_MODULE,
- .create = zbud_zpool_create,
- .destroy = zbud_zpool_destroy,
- .malloc = zbud_zpool_malloc,
- .free = zbud_zpool_free,
- .map = zbud_zpool_map,
- .unmap = zbud_zpool_unmap,
- .total_pages = zbud_zpool_total_pages,
-};
-
-MODULE_ALIAS("zpool-zbud");
-
-static int __init init_zbud(void)
-{
- /* Make sure the zbud header will fit in one chunk */
- BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
- pr_info("loaded\n");
-
- zpool_register_driver(&zbud_zpool_driver);
-
- return 0;
-}
-
-static void __exit exit_zbud(void)
-{
- zpool_unregister_driver(&zbud_zpool_driver);
- pr_info("unloaded\n");
-}
-
-module_init(init_zbud);
-module_exit(exit_zbud);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
-MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");
diff --git a/mm/zpdesc.h b/mm/zpdesc.h
new file mode 100644
index 000000000000..25bf5ea0beb8
--- /dev/null
+++ b/mm/zpdesc.h
@@ -0,0 +1,174 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* zpdesc.h: zswap.zpool memory descriptor
+ *
+ * Written by Alex Shi <alexs@kernel.org>
+ * Hyeonggon Yoo <42.hyeyoo@gmail.com>
+ */
+#ifndef __MM_ZPDESC_H__
+#define __MM_ZPDESC_H__
+
+#include <linux/migrate.h>
+#include <linux/pagemap.h>
+
+/*
+ * struct zpdesc - Memory descriptor for zpool memory.
+ * @flags: Page flags, mostly unused by zsmalloc.
+ * @lru: Indirectly used by page migration.
+ * @movable_ops: Used by page migration.
+ * @next: Next zpdesc in a zspage in zsmalloc zpool.
+ * @handle: For huge zspage in zsmalloc zpool.
+ * @zspage: Points to the zspage this zpdesc is a part of.
+ * @first_obj_offset: First object offset in zsmalloc zpool.
+ * @_refcount: The number of references to this zpdesc.
+ *
+ * This struct overlays struct page for now. Do not modify without a good
+ * understanding of the issues. In particular, do not expand into the overlap
+ * with memcg_data.
+ *
+ * Page flags used:
+ * * PG_private identifies the first component page.
+ * * PG_locked is used by page migration code.
+ */
+struct zpdesc {
+ unsigned long flags;
+ struct list_head lru;
+ unsigned long movable_ops;
+ union {
+ struct zpdesc *next;
+ unsigned long handle;
+ };
+ struct zspage *zspage;
+ /*
+ * Only the lower 24 bits are available for offset, limiting a page
+ * to 16 MiB. The upper 8 bits are reserved for PGTY_zsmalloc.
+ *
+ * Do not access this field directly.
+ * Instead, use {get,set}_first_obj_offset() helpers.
+ */
+ unsigned int first_obj_offset;
+ atomic_t _refcount;
+};
+#define ZPDESC_MATCH(pg, zp) \
+ static_assert(offsetof(struct page, pg) == offsetof(struct zpdesc, zp))
+
+ZPDESC_MATCH(flags, flags);
+ZPDESC_MATCH(lru, lru);
+ZPDESC_MATCH(mapping, movable_ops);
+ZPDESC_MATCH(__folio_index, next);
+ZPDESC_MATCH(__folio_index, handle);
+ZPDESC_MATCH(private, zspage);
+ZPDESC_MATCH(page_type, first_obj_offset);
+ZPDESC_MATCH(_refcount, _refcount);
+#undef ZPDESC_MATCH
+static_assert(sizeof(struct zpdesc) <= sizeof(struct page));
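The ZPDESC_MATCH() assertions above are what make it safe for struct zpdesc to alias struct page: every reused field must sit at exactly the same offset in both layouts, and the overlay must not be larger than what it overlays. The stripped-down sketch below illustrates the same compile-time technique with two hypothetical toy structs (base_desc and overlay_desc are made-up names, not mm types).

#include <assert.h>             /* static_assert */
#include <stddef.h>             /* offsetof */

/* toy "base" descriptor, standing in for struct page in this illustration */
struct base_desc {
        unsigned long flags;
        void *mapping;
        unsigned long index;
};

/* toy overlay that reinterprets the same storage, standing in for struct zpdesc */
struct overlay_desc {
        unsigned long flags;
        unsigned long movable_ops;      /* overlays ->mapping */
        struct overlay_desc *next;      /* overlays ->index */
};

#define OVERLAY_MATCH(b, o) \
        static_assert(offsetof(struct base_desc, b) == offsetof(struct overlay_desc, o), \
                      "overlay field offset mismatch")

OVERLAY_MATCH(flags, flags);
OVERLAY_MATCH(mapping, movable_ops);
OVERLAY_MATCH(index, next);
#undef OVERLAY_MATCH
static_assert(sizeof(struct overlay_desc) <= sizeof(struct base_desc),
              "the overlay must not outgrow the base descriptor");

int main(void)
{
        return 0;       /* nothing to do at run time; the checks are compile-time */
}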
+
+/**
+ * zpdesc_page - The first struct page allocated for a zpdesc
+ * @zp: The zpdesc.
+ *
+ * A convenience wrapper for converting zpdesc to the first struct page of the
+ * underlying folio, to communicate with code not yet converted to folio or
+ * struct zpdesc.
+ */
+#define zpdesc_page(zp) (_Generic((zp), \
+ const struct zpdesc *: (const struct page *)(zp), \
+ struct zpdesc *: (struct page *)(zp)))
+
+/**
+ * zpdesc_folio - The folio allocated for a zpdesc
+ * @zp: The zpdesc.
+ *
+ * Zpdescs are descriptors for zpool memory. The zpool memory itself is
+ * allocated as folios that contain the zpool objects, and zpdesc uses specific
+ * fields in the first struct page of the folio - those fields are now accessed
+ * by struct zpdesc.
+ *
+ * It is occasionally necessary to convert back to a folio in order to
+ * communicate with the rest of the mm. Please use this helper function
+ * instead of casting yourself, as the implementation may change in the future.
+ */
+#define zpdesc_folio(zp) (_Generic((zp), \
+ const struct zpdesc *: (const struct folio *)(zp), \
+ struct zpdesc *: (struct folio *)(zp)))
+/**
+ * page_zpdesc - Converts from first struct page to zpdesc.
+ * @p: The first (either head of compound or single) page of zpdesc.
+ *
+ * A temporary wrapper to convert struct page to struct zpdesc in situations
+ * where we know the page is the compound head, or single order-0 page.
+ *
+ * Long-term ideally everything would work with struct zpdesc directly or go
+ * through folio to struct zpdesc.
+ *
+ * Return: The zpdesc which contains this page
+ */
+#define page_zpdesc(p) (_Generic((p), \
+ const struct page *: (const struct zpdesc *)(p), \
+ struct page *: (struct zpdesc *)(p)))
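The _Generic() selections in zpdesc_page(), zpdesc_folio() and page_zpdesc() above are there so the conversion preserves const-ness: a const pointer in gives a const pointer out, which a bare cast would not enforce. A small self-contained sketch of the same pattern, using hypothetical outer/inner toy types, is shown below.

#include <assert.h>

struct outer { int x; };        /* hypothetical container type */
struct inner { int x; };        /* hypothetical view of the same storage */

/* const-preserving conversion, same shape as zpdesc_page()/page_zpdesc() */
#define outer_to_inner(p) (_Generic((p),                        \
        const struct outer *: (const struct inner *)(p),        \
        struct outer *: (struct inner *)(p)))

int main(void)
{
        struct outer o = { .x = 42 };
        const struct outer *co = &o;
        struct inner *i = outer_to_inner(&o);           /* non-const in, non-const out */
        const struct inner *ci = outer_to_inner(co);    /* const in, const out */

        /* the conversion changes only the pointer's type, never its value */
        assert((void *)i == (void *)&o);
        assert((const void *)ci == (const void *)co);
        return 0;
}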
+
+static inline void zpdesc_lock(struct zpdesc *zpdesc)
+{
+ folio_lock(zpdesc_folio(zpdesc));
+}
+
+static inline bool zpdesc_trylock(struct zpdesc *zpdesc)
+{
+ return folio_trylock(zpdesc_folio(zpdesc));
+}
+
+static inline void zpdesc_unlock(struct zpdesc *zpdesc)
+{
+ folio_unlock(zpdesc_folio(zpdesc));
+}
+
+static inline void zpdesc_wait_locked(struct zpdesc *zpdesc)
+{
+ folio_wait_locked(zpdesc_folio(zpdesc));
+}
+
+static inline void zpdesc_get(struct zpdesc *zpdesc)
+{
+ folio_get(zpdesc_folio(zpdesc));
+}
+
+static inline void zpdesc_put(struct zpdesc *zpdesc)
+{
+ folio_put(zpdesc_folio(zpdesc));
+}
+
+static inline void *kmap_local_zpdesc(struct zpdesc *zpdesc)
+{
+ return kmap_local_page(zpdesc_page(zpdesc));
+}
+
+static inline unsigned long zpdesc_pfn(struct zpdesc *zpdesc)
+{
+ return page_to_pfn(zpdesc_page(zpdesc));
+}
+
+static inline struct zpdesc *pfn_zpdesc(unsigned long pfn)
+{
+ return page_zpdesc(pfn_to_page(pfn));
+}
+
+static inline void __zpdesc_set_movable(struct zpdesc *zpdesc)
+{
+ SetPageMovableOps(zpdesc_page(zpdesc));
+}
+
+static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc)
+{
+ __SetPageZsmalloc(zpdesc_page(zpdesc));
+}
+
+static inline struct zone *zpdesc_zone(struct zpdesc *zpdesc)
+{
+ return page_zone(zpdesc_page(zpdesc));
+}
+
+static inline bool zpdesc_is_locked(struct zpdesc *zpdesc)
+{
+ return folio_test_locked(zpdesc_folio(zpdesc));
+}
+#endif
diff --git a/mm/zpool.c b/mm/zpool.c
index b9fda1fa857d..0a71d03369f1 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -95,7 +95,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
/**
* zpool_has_pool() - Check if the pool driver is available
- * @type: The type of the zpool to check (e.g. zbud, zsmalloc)
+ * @type: The type of the zpool to check (e.g. zsmalloc)
*
* This checks if the @type pool driver is available. This will try to load
* the requested module, if needed, but there is no guarantee the module will
@@ -130,7 +130,7 @@ EXPORT_SYMBOL(zpool_has_pool);
/**
* zpool_create_pool() - Create a new zpool
- * @type: The type of the zpool to create (e.g. zbud, zsmalloc)
+ * @type: The type of the zpool to create (e.g. zsmalloc)
* @name: The name of the zpool (e.g. zram0, zswap)
* @gfp: The GFP flags to use when allocating the pool.
*
@@ -221,41 +221,27 @@ const char *zpool_get_type(struct zpool *zpool)
}
/**
- * zpool_malloc_support_movable() - Check if the zpool supports
- * allocating movable memory
- * @zpool: The zpool to check
- *
- * This returns if the zpool supports allocating movable memory.
- *
- * Implementations must guarantee this to be thread-safe.
- *
- * Returns: true if the zpool supports allocating movable memory, false if not
- */
-bool zpool_malloc_support_movable(struct zpool *zpool)
-{
- return zpool->driver->malloc_support_movable;
-}
-
-/**
* zpool_malloc() - Allocate memory
* @zpool: The zpool to allocate from.
* @size: The amount of memory to allocate.
* @gfp: The GFP flags to use when allocating memory.
* @handle: Pointer to the handle to set
+ * @nid: The preferred node id.
*
* This allocates the requested amount of memory from the pool.
* The gfp flags will be used when allocating memory, if the
* implementation supports it. The provided @handle will be
- * set to the allocated object handle.
+ * set to the allocated object handle. The allocation will
+ * prefer the NUMA node specified by @nid.
*
* Implementations must guarantee this to be thread-safe.
*
* Returns: 0 on success, negative value on error.
*/
int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp,
- unsigned long *handle)
+ unsigned long *handle, const int nid)
{
- return zpool->driver->malloc(zpool->pool, size, gfp, handle);
+ return zpool->driver->malloc(zpool->pool, size, gfp, handle, nid);
}
/**
@@ -278,46 +264,51 @@ void zpool_free(struct zpool *zpool, unsigned long handle)
}
/**
- * zpool_map_handle() - Map a previously allocated handle into memory
+ * zpool_obj_read_begin() - Start reading from a previously allocated handle.
* @zpool: The zpool that the handle was allocated from
- * @handle: The handle to map
- * @mapmode: How the memory should be mapped
+ * @handle: The handle to read from
+ * @local_copy: A local buffer to use if needed.
*
- * This maps a previously allocated handle into memory. The @mapmode
- * param indicates to the implementation how the memory will be
- * used, i.e. read-only, write-only, read-write. If the
- * implementation does not support it, the memory will be treated
- * as read-write.
+ * This starts a read operation of a previously allocated handle. The passed
+ * @local_copy buffer may be used if needed, by copying the memory into it.
+ * zpool_obj_read_end() MUST be called after the read is completed to undo any
+ * actions taken (e.g. release locks).
*
- * This may hold locks, disable interrupts, and/or preemption,
- * and the zpool_unmap_handle() must be called to undo those
- * actions. The code that uses the mapped handle should complete
- * its operations on the mapped handle memory quickly and unmap
- * as soon as possible. As the implementation may use per-cpu
- * data, multiple handles should not be mapped concurrently on
- * any cpu.
+ * Returns: A pointer to the handle memory to be read. If @local_copy is used,
+ * the returned pointer is @local_copy.
+ */
+void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle,
+ void *local_copy)
+{
+ return zpool->driver->obj_read_begin(zpool->pool, handle, local_copy);
+}
+
+/**
+ * zpool_obj_read_end() - Finish reading from a previously allocated handle.
+ * @zpool: The zpool that the handle was allocated from
+ * @handle: The handle to read from
+ * @handle_mem: The pointer returned by zpool_obj_read_begin()
*
- * Returns: A pointer to the handle's mapped memory area.
+ * Finishes a read operation previously started by zpool_obj_read_begin().
*/
-void *zpool_map_handle(struct zpool *zpool, unsigned long handle,
- enum zpool_mapmode mapmode)
+void zpool_obj_read_end(struct zpool *zpool, unsigned long handle,
+ void *handle_mem)
{
- return zpool->driver->map(zpool->pool, handle, mapmode);
+ zpool->driver->obj_read_end(zpool->pool, handle, handle_mem);
}
/**
- * zpool_unmap_handle() - Unmap a previously mapped handle
+ * zpool_obj_write() - Write to a previously allocated handle.
* @zpool: The zpool that the handle was allocated from
- * @handle: The handle to unmap
+ * @handle: The handle to read from
+ * @handle_mem: The memory to copy from into the handle.
+ * @mem_len: The length of memory to be written.
*
- * This unmaps a previously mapped handle. Any locks or other
- * actions that the implementation took in zpool_map_handle()
- * will be undone here. The memory area returned from
- * zpool_map_handle() should no longer be used after this.
*/
-void zpool_unmap_handle(struct zpool *zpool, unsigned long handle)
+void zpool_obj_write(struct zpool *zpool, unsigned long handle,
+ void *handle_mem, size_t mem_len)
{
- zpool->driver->unmap(zpool->pool, handle);
+ zpool->driver->obj_write(zpool->pool, handle, handle_mem, mem_len);
}
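Taken together, obj_read_begin()/obj_read_end()/obj_write() replace the old map/unmap pair with explicit copy-based semantics. The hedged sketch below shows the calling pattern the kernel-doc above describes (begin, use the returned pointer, then end); read_compressed() and write_compressed() are hypothetical helper names, the arguments are illustrative, and the snippet assumes a kernel build context rather than being standalone.

#include <linux/string.h>
#include <linux/zpool.h>

static void read_compressed(struct zpool *zpool, unsigned long handle,
                            void *bounce_buf, size_t len, void *dst)
{
        /* may return either the in-pool object or the caller's bounce buffer */
        void *src = zpool_obj_read_begin(zpool, handle, bounce_buf);

        memcpy(dst, src, len);                  /* or hand it to a decompressor */
        zpool_obj_read_end(zpool, handle, src); /* must pair with _begin() */
}

static void write_compressed(struct zpool *zpool, unsigned long handle,
                             void *src, size_t len)
{
        /* a one-shot copy into the object backing the handle */
        zpool_obj_write(zpool, handle, src, len);
}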
/**
@@ -333,23 +324,5 @@ u64 zpool_get_total_pages(struct zpool *zpool)
return zpool->driver->total_pages(zpool->pool);
}
-/**
- * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped.
- * @zpool: The zpool to test
- *
- * Some allocators enter non-preemptible context in ->map() callback (e.g.
- * disable pagefaults) and exit that context in ->unmap(), which limits what
- * we can do with the mapped object. For instance, we cannot wait for
- * asynchronous crypto API to decompress such an object or take mutexes
- * since those will call into the scheduler. This function tells us whether
- * we use such an allocator.
- *
- * Returns: true if zpool can sleep; false otherwise.
- */
-bool zpool_can_sleep_mapped(struct zpool *zpool)
-{
- return zpool->driver->sleep_mapped;
-}
-
MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
MODULE_DESCRIPTION("Common API for compressed memory storage");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 64b66a4d3e6e..153783d49d34 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -13,30 +13,12 @@
* Released under the terms of GNU General Public License Version 2.0
*/
-/*
- * Following is how we use various fields and flags of underlying
- * struct page(s) to form a zspage.
- *
- * Usage of struct page fields:
- * page->private: points to zspage
- * page->index: links together all component pages of a zspage
- * For the huge page, this is always 0, so we use this field
- * to store handle.
- * page->page_type: PGTY_zsmalloc, lower 24 bits locate the first object
- * offset in a subpage of a zspage
- *
- * Usage of struct page flags:
- * PG_private: identifies the first component page
- * PG_owner_priv_1: identifies the huge component page
- *
- */
-
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
/*
* lock ordering:
* page_lock
- * pool->migrate_lock
+ * pool->lock
* class->lock
* zspage->lock
*/
@@ -44,17 +26,10 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
-#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
-#include <linux/pgtable.h>
-#include <asm/tlbflush.h>
-#include <linux/cpumask.h>
-#include <linux/cpu.h>
-#include <linux/vmalloc.h>
-#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/sprintf.h>
#include <linux/shrinker.h>
@@ -62,11 +37,9 @@
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
-#include <linux/migrate.h>
-#include <linux/wait.h>
-#include <linux/pagemap.h>
#include <linux/fs.h>
-#include <linux/local_lock.h>
+#include <linux/workqueue.h>
+#include "zpdesc.h"
#define ZSPAGE_MAGIC 0x58
@@ -240,11 +213,50 @@ struct zs_pool {
#ifdef CONFIG_COMPACTION
struct work_struct free_work;
#endif
- /* protect page/zspage migration */
- rwlock_t migrate_lock;
+ /* protect zspage migration/compaction */
+ rwlock_t lock;
atomic_t compaction_in_progress;
};
+static inline void zpdesc_set_first(struct zpdesc *zpdesc)
+{
+ SetPagePrivate(zpdesc_page(zpdesc));
+}
+
+static inline void zpdesc_inc_zone_page_state(struct zpdesc *zpdesc)
+{
+ inc_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES);
+}
+
+static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc)
+{
+ dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES);
+}
+
+static inline struct zpdesc *alloc_zpdesc(gfp_t gfp, const int nid)
+{
+ struct page *page = alloc_pages_node(nid, gfp, 0);
+
+ return page_zpdesc(page);
+}
+
+static inline void free_zpdesc(struct zpdesc *zpdesc)
+{
+ struct page *page = zpdesc_page(zpdesc);
+
+ /* PageZsmalloc is sticky until the page is freed to the buddy. */
+ __free_page(page);
+}
+
+#define ZS_PAGE_UNLOCKED 0
+#define ZS_PAGE_WRLOCKED -1
+
+struct zspage_lock {
+ spinlock_t lock;
+ int cnt;
+ struct lockdep_map dep_map;
+};
+
struct zspage {
struct {
unsigned int huge:HUGE_BITS;
@@ -254,18 +266,89 @@ struct zspage {
};
unsigned int inuse;
unsigned int freeobj;
- struct page *first_page;
+ struct zpdesc *first_zpdesc;
struct list_head list; /* fullness list */
struct zs_pool *pool;
- rwlock_t lock;
+ struct zspage_lock zsl;
};
-struct mapping_area {
- local_lock_t lock;
- char *vm_buf; /* copy buffer for objects that span pages */
- char *vm_addr; /* address of kmap_local_page()'ed pages */
- enum zs_mapmode vm_mm; /* mapping mode */
-};
+static void zspage_lock_init(struct zspage *zspage)
+{
+ static struct lock_class_key __key;
+ struct zspage_lock *zsl = &zspage->zsl;
+
+ lockdep_init_map(&zsl->dep_map, "zspage->lock", &__key, 0);
+ spin_lock_init(&zsl->lock);
+ zsl->cnt = ZS_PAGE_UNLOCKED;
+}
+
+/*
+ * The zspage lock can be held from atomic contexts, but it needs to remain
+ * preemptible when held for reading because it can remain held outside of
+ * those atomic contexts; otherwise we would unnecessarily lose preemptibility.
+ *
+ * To achieve this, the following rules are enforced on readers and writers:
+ *
+ * - Writers are blocked by both writers and readers, while readers are only
+ * blocked by writers (i.e. normal rwlock semantics).
+ *
+ * - Writers are always atomic (to allow readers to spin waiting for them).
+ *
+ * - Writers always use trylock (as the lock may be held by sleeping readers).
+ *
+ * - Readers may spin on the lock (as they can only wait for atomic writers).
+ *
+ * - Readers may sleep while holding the lock (as writes only use trylock).
+ */
+static void zspage_read_lock(struct zspage *zspage)
+{
+ struct zspage_lock *zsl = &zspage->zsl;
+
+ rwsem_acquire_read(&zsl->dep_map, 0, 0, _RET_IP_);
+
+ spin_lock(&zsl->lock);
+ zsl->cnt++;
+ spin_unlock(&zsl->lock);
+
+ lock_acquired(&zsl->dep_map, _RET_IP_);
+}
+
+static void zspage_read_unlock(struct zspage *zspage)
+{
+ struct zspage_lock *zsl = &zspage->zsl;
+
+ rwsem_release(&zsl->dep_map, _RET_IP_);
+
+ spin_lock(&zsl->lock);
+ zsl->cnt--;
+ spin_unlock(&zsl->lock);
+}
+
+static __must_check bool zspage_write_trylock(struct zspage *zspage)
+{
+ struct zspage_lock *zsl = &zspage->zsl;
+
+ spin_lock(&zsl->lock);
+ if (zsl->cnt == ZS_PAGE_UNLOCKED) {
+ zsl->cnt = ZS_PAGE_WRLOCKED;
+ rwsem_acquire(&zsl->dep_map, 0, 1, _RET_IP_);
+ lock_acquired(&zsl->dep_map, _RET_IP_);
+ return true;
+ }
+
+ spin_unlock(&zsl->lock);
+ return false;
+}
+
+static void zspage_write_unlock(struct zspage *zspage)
+{
+ struct zspage_lock *zsl = &zspage->zsl;
+
+ rwsem_release(&zsl->dep_map, _RET_IP_);
+
+ zsl->cnt = ZS_PAGE_UNLOCKED;
+ spin_unlock(&zsl->lock);
+}
/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
static void SetZsHugePage(struct zspage *zspage)
@@ -278,12 +361,6 @@ static bool ZsHugePage(struct zspage *zspage)
return zspage->huge;
}
-static void migrate_lock_init(struct zspage *zspage);
-static void migrate_read_lock(struct zspage *zspage);
-static void migrate_read_unlock(struct zspage *zspage);
-static void migrate_write_lock(struct zspage *zspage);
-static void migrate_write_unlock(struct zspage *zspage);
-
#ifdef CONFIG_COMPACTION
static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
@@ -376,9 +453,9 @@ static void zs_zpool_destroy(void *pool)
}
static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
+ unsigned long *handle, const int nid)
{
- *handle = zs_malloc(pool, size, gfp);
+ *handle = zs_malloc(pool, size, gfp, nid);
if (IS_ERR_VALUE(*handle))
return PTR_ERR((void *)*handle);
@@ -389,29 +466,22 @@ static void zs_zpool_free(void *pool, unsigned long handle)
zs_free(pool, handle);
}
-static void *zs_zpool_map(void *pool, unsigned long handle,
- enum zpool_mapmode mm)
+static void *zs_zpool_obj_read_begin(void *pool, unsigned long handle,
+ void *local_copy)
{
- enum zs_mapmode zs_mm;
-
- switch (mm) {
- case ZPOOL_MM_RO:
- zs_mm = ZS_MM_RO;
- break;
- case ZPOOL_MM_WO:
- zs_mm = ZS_MM_WO;
- break;
- case ZPOOL_MM_RW:
- default:
- zs_mm = ZS_MM_RW;
- break;
- }
+ return zs_obj_read_begin(pool, handle, local_copy);
+}
- return zs_map_object(pool, handle, zs_mm);
+static void zs_zpool_obj_read_end(void *pool, unsigned long handle,
+ void *handle_mem)
+{
+ zs_obj_read_end(pool, handle, handle_mem);
}
-static void zs_zpool_unmap(void *pool, unsigned long handle)
+
+static void zs_zpool_obj_write(void *pool, unsigned long handle,
+ void *handle_mem, size_t mem_len)
{
- zs_unmap_object(pool, handle);
+ zs_obj_write(pool, handle, handle_mem, mem_len);
}
static u64 zs_zpool_total_pages(void *pool)
@@ -424,25 +494,20 @@ static struct zpool_driver zs_zpool_driver = {
.owner = THIS_MODULE,
.create = zs_zpool_create,
.destroy = zs_zpool_destroy,
- .malloc_support_movable = true,
.malloc = zs_zpool_malloc,
.free = zs_zpool_free,
- .map = zs_zpool_map,
- .unmap = zs_zpool_unmap,
+ .obj_read_begin = zs_zpool_obj_read_begin,
+ .obj_read_end = zs_zpool_obj_read_end,
+ .obj_write = zs_zpool_obj_write,
.total_pages = zs_zpool_total_pages,
};
MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */
-/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
-static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = {
- .lock = INIT_LOCAL_LOCK(lock),
-};
-
-static __maybe_unused int is_first_page(struct page *page)
+static inline bool __maybe_unused is_first_zpdesc(struct zpdesc *zpdesc)
{
- return PagePrivate(page);
+ return PagePrivate(zpdesc_page(zpdesc));
}
/* Protected by class->lock */
@@ -451,36 +516,35 @@ static inline int get_zspage_inuse(struct zspage *zspage)
return zspage->inuse;
}
-
static inline void mod_zspage_inuse(struct zspage *zspage, int val)
{
zspage->inuse += val;
}
-static inline struct page *get_first_page(struct zspage *zspage)
+static struct zpdesc *get_first_zpdesc(struct zspage *zspage)
{
- struct page *first_page = zspage->first_page;
+ struct zpdesc *first_zpdesc = zspage->first_zpdesc;
- VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
- return first_page;
+ VM_BUG_ON_PAGE(!is_first_zpdesc(first_zpdesc), zpdesc_page(first_zpdesc));
+ return first_zpdesc;
}
#define FIRST_OBJ_PAGE_TYPE_MASK 0xffffff
-static inline unsigned int get_first_obj_offset(struct page *page)
+static inline unsigned int get_first_obj_offset(struct zpdesc *zpdesc)
{
- VM_WARN_ON_ONCE(!PageZsmalloc(page));
- return page->page_type & FIRST_OBJ_PAGE_TYPE_MASK;
+ VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc)));
+ return zpdesc->first_obj_offset & FIRST_OBJ_PAGE_TYPE_MASK;
}
-static inline void set_first_obj_offset(struct page *page, unsigned int offset)
+static inline void set_first_obj_offset(struct zpdesc *zpdesc, unsigned int offset)
{
/* With 24 bits available, we can support offsets into 16 MiB pages. */
BUILD_BUG_ON(PAGE_SIZE > SZ_16M);
- VM_WARN_ON_ONCE(!PageZsmalloc(page));
+ VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc)));
VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK);
- page->page_type &= ~FIRST_OBJ_PAGE_TYPE_MASK;
- page->page_type |= offset & FIRST_OBJ_PAGE_TYPE_MASK;
+ zpdesc->first_obj_offset &= ~FIRST_OBJ_PAGE_TYPE_MASK;
+ zpdesc->first_obj_offset |= offset & FIRST_OBJ_PAGE_TYPE_MASK;
}
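get_first_obj_offset()/set_first_obj_offset() above keep the first-object offset in the low 24 bits of the field that overlays page->page_type, leaving the upper byte for the PGTY_zsmalloc tag. The self-contained sketch below demonstrates the same masking; the tag value used is an arbitrary stand-in, not the kernel's PGTY constant.

#include <assert.h>
#include <stdio.h>

#define FIRST_OBJ_PAGE_TYPE_MASK 0xffffffu      /* low 24 bits carry the offset */
#define TOY_PGTY (0xf0u << 24)                  /* arbitrary stand-in for the page-type tag byte */

static unsigned int field = TOY_PGTY;           /* plays the role of the overlaid page_type word */

static void set_first_obj_offset(unsigned int offset)
{
        field &= ~FIRST_OBJ_PAGE_TYPE_MASK;             /* clear the old offset ... */
        field |= offset & FIRST_OBJ_PAGE_TYPE_MASK;     /* ... without touching the tag byte */
}

static unsigned int get_first_obj_offset(void)
{
        return field & FIRST_OBJ_PAGE_TYPE_MASK;
}

int main(void)
{
        set_first_obj_offset(4096 - 48);        /* any offset below 16 MiB fits in 24 bits */
        assert(get_first_obj_offset() == 4096 - 48);
        assert((field >> 24) == 0xf0u);         /* the upper 8 bits survive */
        printf("offset %u, tag 0x%x\n", get_first_obj_offset(), field >> 24);
        return 0;
}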
static inline unsigned int get_freeobj(struct zspage *zspage)
@@ -733,52 +797,52 @@ out:
return newfg;
}
-static struct zspage *get_zspage(struct page *page)
+static struct zspage *get_zspage(struct zpdesc *zpdesc)
{
- struct zspage *zspage = (struct zspage *)page_private(page);
+ struct zspage *zspage = zpdesc->zspage;
BUG_ON(zspage->magic != ZSPAGE_MAGIC);
return zspage;
}
-static struct page *get_next_page(struct page *page)
+static struct zpdesc *get_next_zpdesc(struct zpdesc *zpdesc)
{
- struct zspage *zspage = get_zspage(page);
+ struct zspage *zspage = get_zspage(zpdesc);
if (unlikely(ZsHugePage(zspage)))
return NULL;
- return (struct page *)page->index;
+ return zpdesc->next;
}
/**
- * obj_to_location - get (<page>, <obj_idx>) from encoded object value
+ * obj_to_location - get (<zpdesc>, <obj_idx>) from encoded object value
* @obj: the encoded object value
- * @page: page object resides in zspage
+ * @zpdesc: zpdesc object resides in zspage
* @obj_idx: object index
*/
-static void obj_to_location(unsigned long obj, struct page **page,
+static void obj_to_location(unsigned long obj, struct zpdesc **zpdesc,
unsigned int *obj_idx)
{
- *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+ *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS);
*obj_idx = (obj & OBJ_INDEX_MASK);
}
-static void obj_to_page(unsigned long obj, struct page **page)
+static void obj_to_zpdesc(unsigned long obj, struct zpdesc **zpdesc)
{
- *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+ *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS);
}
/**
- * location_to_obj - get obj value encoded from (<page>, <obj_idx>)
- * @page: page object resides in zspage
+ * location_to_obj - get obj value encoded from (<zpdesc>, <obj_idx>)
+ * @zpdesc: zpdesc object resides in zspage
* @obj_idx: object index
*/
-static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
+static unsigned long location_to_obj(struct zpdesc *zpdesc, unsigned int obj_idx)
{
unsigned long obj;
- obj = page_to_pfn(page) << OBJ_INDEX_BITS;
+ obj = zpdesc_pfn(zpdesc) << OBJ_INDEX_BITS;
obj |= obj_idx & OBJ_INDEX_MASK;
return obj;
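obj_to_location() and location_to_obj() above are inverses: an object value packs the zpdesc's PFN above OBJ_INDEX_BITS and the object index in the low bits. The minimal pack/unpack sketch below uses an assumed 12-bit index width purely for illustration; zsmalloc computes the real OBJ_INDEX_BITS from its own configuration.

#include <assert.h>

/* illustrative width only; zsmalloc derives the real OBJ_INDEX_BITS itself */
#define OBJ_INDEX_BITS  12
#define OBJ_INDEX_MASK  ((1UL << OBJ_INDEX_BITS) - 1)

static unsigned long location_to_obj(unsigned long pfn, unsigned int obj_idx)
{
        return (pfn << OBJ_INDEX_BITS) | (obj_idx & OBJ_INDEX_MASK);
}

static void obj_to_location(unsigned long obj, unsigned long *pfn,
                            unsigned int *obj_idx)
{
        *pfn = obj >> OBJ_INDEX_BITS;
        *obj_idx = obj & OBJ_INDEX_MASK;
}

int main(void)
{
        unsigned long pfn;
        unsigned int idx;

        obj_to_location(location_to_obj(0x12345, 7), &pfn, &idx);
        assert(pfn == 0x12345 && idx == 7);
        return 0;
}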
@@ -789,15 +853,15 @@ static unsigned long handle_to_obj(unsigned long handle)
return *(unsigned long *)handle;
}
-static inline bool obj_allocated(struct page *page, void *obj,
+static inline bool obj_allocated(struct zpdesc *zpdesc, void *obj,
unsigned long *phandle)
{
unsigned long handle;
- struct zspage *zspage = get_zspage(page);
+ struct zspage *zspage = get_zspage(zpdesc);
if (unlikely(ZsHugePage(zspage))) {
- VM_BUG_ON_PAGE(!is_first_page(page), page);
- handle = page->index;
+ VM_BUG_ON_PAGE(!is_first_zpdesc(zpdesc), zpdesc_page(zpdesc));
+ handle = zpdesc->handle;
} else
handle = *(unsigned long *)obj;
@@ -809,22 +873,23 @@ static inline bool obj_allocated(struct page *page, void *obj,
return true;
}
-static void reset_page(struct page *page)
+static void reset_zpdesc(struct zpdesc *zpdesc)
{
- __ClearPageMovable(page);
+ struct page *page = zpdesc_page(zpdesc);
+
ClearPagePrivate(page);
- set_page_private(page, 0);
- page->index = 0;
- __ClearPageZsmalloc(page);
+ zpdesc->zspage = NULL;
+ zpdesc->next = NULL;
+ /* PageZsmalloc is sticky until the page is freed to the buddy. */
}
static int trylock_zspage(struct zspage *zspage)
{
- struct page *cursor, *fail;
+ struct zpdesc *cursor, *fail;
- for (cursor = get_first_page(zspage); cursor != NULL; cursor =
- get_next_page(cursor)) {
- if (!trylock_page(cursor)) {
+ for (cursor = get_first_zpdesc(zspage); cursor != NULL; cursor =
+ get_next_zpdesc(cursor)) {
+ if (!zpdesc_trylock(cursor)) {
fail = cursor;
goto unlock;
}
@@ -832,9 +897,9 @@ static int trylock_zspage(struct zspage *zspage)
return 1;
unlock:
- for (cursor = get_first_page(zspage); cursor != fail; cursor =
- get_next_page(cursor))
- unlock_page(cursor);
+ for (cursor = get_first_zpdesc(zspage); cursor != fail; cursor =
+ get_next_zpdesc(cursor))
+ zpdesc_unlock(cursor);
return 0;
}
@@ -842,23 +907,23 @@ unlock:
static void __free_zspage(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
- struct page *page, *next;
+ struct zpdesc *zpdesc, *next;
assert_spin_locked(&class->lock);
VM_BUG_ON(get_zspage_inuse(zspage));
VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0);
- next = page = get_first_page(zspage);
+ next = zpdesc = get_first_zpdesc(zspage);
do {
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- next = get_next_page(page);
- reset_page(page);
- unlock_page(page);
- dec_zone_page_state(page, NR_ZSPAGES);
- put_page(page);
- page = next;
- } while (page != NULL);
+ VM_BUG_ON_PAGE(!zpdesc_is_locked(zpdesc), zpdesc_page(zpdesc));
+ next = get_next_zpdesc(zpdesc);
+ reset_zpdesc(zpdesc);
+ zpdesc_unlock(zpdesc);
+ zpdesc_dec_zone_page_state(zpdesc);
+ zpdesc_put(zpdesc);
+ zpdesc = next;
+ } while (zpdesc != NULL);
cache_free_zspage(pool, zspage);
@@ -891,16 +956,16 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
{
unsigned int freeobj = 1;
unsigned long off = 0;
- struct page *page = get_first_page(zspage);
+ struct zpdesc *zpdesc = get_first_zpdesc(zspage);
- while (page) {
- struct page *next_page;
+ while (zpdesc) {
+ struct zpdesc *next_zpdesc;
struct link_free *link;
void *vaddr;
- set_first_obj_offset(page, off);
+ set_first_obj_offset(zpdesc, off);
- vaddr = kmap_local_page(page);
+ vaddr = kmap_local_zpdesc(zpdesc);
link = (struct link_free *)vaddr + off / sizeof(*link);
while ((off += class->size) < PAGE_SIZE) {
@@ -913,8 +978,8 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
* page, which must point to the first object on the next
* page (if present)
*/
- next_page = get_next_page(page);
- if (next_page) {
+ next_zpdesc = get_next_zpdesc(zpdesc);
+ if (next_zpdesc) {
link->next = freeobj++ << OBJ_TAG_BITS;
} else {
/*
@@ -924,7 +989,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
link->next = -1UL << OBJ_TAG_BITS;
}
kunmap_local(vaddr);
- page = next_page;
+ zpdesc = next_zpdesc;
off %= PAGE_SIZE;
}
@@ -932,35 +997,35 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
}
static void create_page_chain(struct size_class *class, struct zspage *zspage,
- struct page *pages[])
+ struct zpdesc *zpdescs[])
{
int i;
- struct page *page;
- struct page *prev_page = NULL;
- int nr_pages = class->pages_per_zspage;
+ struct zpdesc *zpdesc;
+ struct zpdesc *prev_zpdesc = NULL;
+ int nr_zpdescs = class->pages_per_zspage;
/*
* Allocate individual pages and link them together as:
- * 1. all pages are linked together using page->index
- * 2. each sub-page point to zspage using page->private
+ * 1. all pages are linked together using zpdesc->next
+ * 2. each sub-page points to its zspage using zpdesc->zspage
*
- * we set PG_private to identify the first page (i.e. no other sub-page
+ * we set PG_private to identify the first zpdesc (i.e. no other zpdesc
* has this flag set).
*/
- for (i = 0; i < nr_pages; i++) {
- page = pages[i];
- set_page_private(page, (unsigned long)zspage);
- page->index = 0;
+ for (i = 0; i < nr_zpdescs; i++) {
+ zpdesc = zpdescs[i];
+ zpdesc->zspage = zspage;
+ zpdesc->next = NULL;
if (i == 0) {
- zspage->first_page = page;
- SetPagePrivate(page);
+ zspage->first_zpdesc = zpdesc;
+ zpdesc_set_first(zpdesc);
if (unlikely(class->objs_per_zspage == 1 &&
class->pages_per_zspage == 1))
SetZsHugePage(zspage);
} else {
- prev_page->index = (unsigned long)page;
+ prev_zpdesc->next = zpdesc;
}
- prev_page = page;
+ prev_zpdesc = zpdesc;
}
}
@@ -968,42 +1033,44 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage,
* Allocate a zspage for the given size class
*/
static struct zspage *alloc_zspage(struct zs_pool *pool,
- struct size_class *class,
- gfp_t gfp)
+ struct size_class *class,
+ gfp_t gfp, const int nid)
{
int i;
- struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE];
+ struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE];
struct zspage *zspage = cache_alloc_zspage(pool, gfp);
if (!zspage)
return NULL;
+ if (!IS_ENABLED(CONFIG_COMPACTION))
+ gfp &= ~__GFP_MOVABLE;
+
zspage->magic = ZSPAGE_MAGIC;
- migrate_lock_init(zspage);
+ zspage->pool = pool;
+ zspage->class = class->index;
+ zspage_lock_init(zspage);
for (i = 0; i < class->pages_per_zspage; i++) {
- struct page *page;
+ struct zpdesc *zpdesc;
- page = alloc_page(gfp);
- if (!page) {
+ zpdesc = alloc_zpdesc(gfp, nid);
+ if (!zpdesc) {
while (--i >= 0) {
- dec_zone_page_state(pages[i], NR_ZSPAGES);
- __ClearPageZsmalloc(pages[i]);
- __free_page(pages[i]);
+ zpdesc_dec_zone_page_state(zpdescs[i]);
+ free_zpdesc(zpdescs[i]);
}
cache_free_zspage(pool, zspage);
return NULL;
}
- __SetPageZsmalloc(page);
+ __zpdesc_set_zsmalloc(zpdesc);
- inc_zone_page_state(page, NR_ZSPAGES);
- pages[i] = page;
+ zpdesc_inc_zone_page_state(zpdesc);
+ zpdescs[i] = zpdesc;
}
- create_page_chain(class, zspage, pages);
+ create_page_chain(class, zspage, zpdescs);
init_zspage(class, zspage);
- zspage->pool = pool;
- zspage->class = class->index;
return zspage;
}
@@ -1023,93 +1090,6 @@ static struct zspage *find_get_zspage(struct size_class *class)
return zspage;
}
-static inline int __zs_cpu_up(struct mapping_area *area)
-{
- /*
- * Make sure we don't leak memory if a cpu UP notification
- * and zs_init() race and both call zs_cpu_up() on the same cpu
- */
- if (area->vm_buf)
- return 0;
- area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
- if (!area->vm_buf)
- return -ENOMEM;
- return 0;
-}
-
-static inline void __zs_cpu_down(struct mapping_area *area)
-{
- kfree(area->vm_buf);
- area->vm_buf = NULL;
-}
-
-static void *__zs_map_object(struct mapping_area *area,
- struct page *pages[2], int off, int size)
-{
- size_t sizes[2];
- char *buf = area->vm_buf;
-
- /* disable page faults to match kmap_local_page() return conditions */
- pagefault_disable();
-
- /* no read fastpath */
- if (area->vm_mm == ZS_MM_WO)
- goto out;
-
- sizes[0] = PAGE_SIZE - off;
- sizes[1] = size - sizes[0];
-
- /* copy object to per-cpu buffer */
- memcpy_from_page(buf, pages[0], off, sizes[0]);
- memcpy_from_page(buf + sizes[0], pages[1], 0, sizes[1]);
-out:
- return area->vm_buf;
-}
-
-static void __zs_unmap_object(struct mapping_area *area,
- struct page *pages[2], int off, int size)
-{
- size_t sizes[2];
- char *buf;
-
- /* no write fastpath */
- if (area->vm_mm == ZS_MM_RO)
- goto out;
-
- buf = area->vm_buf;
- buf = buf + ZS_HANDLE_SIZE;
- size -= ZS_HANDLE_SIZE;
- off += ZS_HANDLE_SIZE;
-
- sizes[0] = PAGE_SIZE - off;
- sizes[1] = size - sizes[0];
-
- /* copy per-cpu buffer to object */
- memcpy_to_page(pages[0], off, buf, sizes[0]);
- memcpy_to_page(pages[1], 0, buf + sizes[0], sizes[1]);
-
-out:
- /* enable page faults to match kunmap_local() return conditions */
- pagefault_enable();
-}
-
-static int zs_cpu_prepare(unsigned int cpu)
-{
- struct mapping_area *area;
-
- area = &per_cpu(zs_map_area, cpu);
- return __zs_cpu_up(area);
-}
-
-static int zs_cpu_dead(unsigned int cpu)
-{
- struct mapping_area *area;
-
- area = &per_cpu(zs_map_area, cpu);
- __zs_cpu_down(area);
- return 0;
-}
-
static bool can_merge(struct size_class *prev, int pages_per_zspage,
int objs_per_zspage)
{
@@ -1157,116 +1137,130 @@ unsigned long zs_get_total_pages(struct zs_pool *pool)
}
EXPORT_SYMBOL_GPL(zs_get_total_pages);
-/**
- * zs_map_object - get address of allocated object from handle.
- * @pool: pool from which the object was allocated
- * @handle: handle returned from zs_malloc
- * @mm: mapping mode to use
- *
- * Before using an object allocated from zs_malloc, it must be mapped using
- * this function. When done with the object, it must be unmapped using
- * zs_unmap_object.
- *
- * Only one object can be mapped per cpu at a time. There is no protection
- * against nested mappings.
- *
- * This function returns with preemption and page faults disabled.
- */
-void *zs_map_object(struct zs_pool *pool, unsigned long handle,
- enum zs_mapmode mm)
+void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle,
+ void *local_copy)
{
struct zspage *zspage;
- struct page *page;
+ struct zpdesc *zpdesc;
unsigned long obj, off;
unsigned int obj_idx;
-
struct size_class *class;
- struct mapping_area *area;
- struct page *pages[2];
- void *ret;
-
- /*
- * Because we use per-cpu mapping areas shared among the
- * pools/users, we can't allow mapping in interrupt context
- * because it can corrupt another users mappings.
- */
- BUG_ON(in_interrupt());
+ void *addr;
- /* It guarantees it can get zspage from handle safely */
- read_lock(&pool->migrate_lock);
+ /* Guarantee we can get zspage from handle safely */
+ read_lock(&pool->lock);
obj = handle_to_obj(handle);
- obj_to_location(obj, &page, &obj_idx);
- zspage = get_zspage(page);
+ obj_to_location(obj, &zpdesc, &obj_idx);
+ zspage = get_zspage(zpdesc);
- /*
- * migration cannot move any zpages in this zspage. Here, class->lock
- * is too heavy since callers would take some time until they calls
- * zs_unmap_object API so delegate the locking from class to zspage
- * which is smaller granularity.
- */
- migrate_read_lock(zspage);
- read_unlock(&pool->migrate_lock);
+ /* Make sure migration doesn't move any pages in this zspage */
+ zspage_read_lock(zspage);
+ read_unlock(&pool->lock);
class = zspage_class(pool, zspage);
off = offset_in_page(class->size * obj_idx);
- local_lock(&zs_map_area.lock);
- area = this_cpu_ptr(&zs_map_area);
- area->vm_mm = mm;
if (off + class->size <= PAGE_SIZE) {
/* this object is contained entirely within a page */
- area->vm_addr = kmap_local_page(page);
- ret = area->vm_addr + off;
- goto out;
+ addr = kmap_local_zpdesc(zpdesc);
+ addr += off;
+ } else {
+ size_t sizes[2];
+
+ /* this object spans two pages */
+ sizes[0] = PAGE_SIZE - off;
+ sizes[1] = class->size - sizes[0];
+ addr = local_copy;
+
+ memcpy_from_page(addr, zpdesc_page(zpdesc),
+ off, sizes[0]);
+ zpdesc = get_next_zpdesc(zpdesc);
+ memcpy_from_page(addr + sizes[0],
+ zpdesc_page(zpdesc),
+ 0, sizes[1]);
}
- /* this object spans two pages */
- pages[0] = page;
- pages[1] = get_next_page(page);
- BUG_ON(!pages[1]);
-
- ret = __zs_map_object(area, pages, off, class->size);
-out:
- if (likely(!ZsHugePage(zspage)))
- ret += ZS_HANDLE_SIZE;
+ if (!ZsHugePage(zspage))
+ addr += ZS_HANDLE_SIZE;
- return ret;
+ return addr;
}
-EXPORT_SYMBOL_GPL(zs_map_object);
+EXPORT_SYMBOL_GPL(zs_obj_read_begin);
-void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
+void zs_obj_read_end(struct zs_pool *pool, unsigned long handle,
+ void *handle_mem)
{
struct zspage *zspage;
- struct page *page;
+ struct zpdesc *zpdesc;
unsigned long obj, off;
unsigned int obj_idx;
-
struct size_class *class;
- struct mapping_area *area;
obj = handle_to_obj(handle);
- obj_to_location(obj, &page, &obj_idx);
- zspage = get_zspage(page);
+ obj_to_location(obj, &zpdesc, &obj_idx);
+ zspage = get_zspage(zpdesc);
class = zspage_class(pool, zspage);
off = offset_in_page(class->size * obj_idx);
- area = this_cpu_ptr(&zs_map_area);
- if (off + class->size <= PAGE_SIZE)
- kunmap_local(area->vm_addr);
- else {
- struct page *pages[2];
+ if (off + class->size <= PAGE_SIZE) {
+ if (!ZsHugePage(zspage))
+ off += ZS_HANDLE_SIZE;
+ handle_mem -= off;
+ kunmap_local(handle_mem);
+ }
+
+ zspage_read_unlock(zspage);
+}
+EXPORT_SYMBOL_GPL(zs_obj_read_end);
+
+void zs_obj_write(struct zs_pool *pool, unsigned long handle,
+ void *handle_mem, size_t mem_len)
+{
+ struct zspage *zspage;
+ struct zpdesc *zpdesc;
+ unsigned long obj, off;
+ unsigned int obj_idx;
+ struct size_class *class;
+
+ /* Guarantee we can get zspage from handle safely */
+ read_lock(&pool->lock);
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &zpdesc, &obj_idx);
+ zspage = get_zspage(zpdesc);
- pages[0] = page;
- pages[1] = get_next_page(page);
- BUG_ON(!pages[1]);
+ /* Make sure migration doesn't move any pages in this zspage */
+ zspage_read_lock(zspage);
+ read_unlock(&pool->lock);
- __zs_unmap_object(area, pages, off, class->size);
+ class = zspage_class(pool, zspage);
+ off = offset_in_page(class->size * obj_idx);
+
+ if (!ZsHugePage(zspage))
+ off += ZS_HANDLE_SIZE;
+
+ if (off + mem_len <= PAGE_SIZE) {
+ /* this object is contained entirely within a page */
+ void *dst = kmap_local_zpdesc(zpdesc);
+
+ memcpy(dst + off, handle_mem, mem_len);
+ kunmap_local(dst);
+ } else {
+ /* this object spans two pages */
+ size_t sizes[2];
+
+ sizes[0] = PAGE_SIZE - off;
+ sizes[1] = mem_len - sizes[0];
+
+ memcpy_to_page(zpdesc_page(zpdesc), off,
+ handle_mem, sizes[0]);
+ zpdesc = get_next_zpdesc(zpdesc);
+ memcpy_to_page(zpdesc_page(zpdesc), 0,
+ handle_mem + sizes[0], sizes[1]);
}
- local_unlock(&zs_map_area.lock);
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
}
-EXPORT_SYMBOL_GPL(zs_unmap_object);
+EXPORT_SYMBOL_GPL(zs_obj_write);
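
A hedged caller-side sketch of the new object read/write API introduced above (the function and variable names here are illustrative, not part of the patch). Unlike the old zs_map_object()/zs_unmap_object() pair, the reader passes in a local buffer that zs_obj_read_begin() only falls back to when the object spans two pages; the returned pointer stays valid until zs_obj_read_end(). "len" is assumed to be the length the caller recorded at store time, and "local_copy" must be at least that large.

	static void example_read(struct zs_pool *pool, unsigned long handle,
				 void *dst, size_t len, void *local_copy)
	{
		void *src;

		src = zs_obj_read_begin(pool, handle, local_copy);
		memcpy(dst, src, len);
		zs_obj_read_end(pool, handle, src);
	}

	static void example_write(struct zs_pool *pool, unsigned long handle,
				  void *src, size_t len)
	{
		/* The whole object is copied in one call; no unmap step. */
		zs_obj_write(pool, handle, src, len);
	}
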
/**
* zs_huge_class_size() - Returns the size (in bytes) of the first huge
@@ -1290,12 +1284,12 @@ EXPORT_SYMBOL_GPL(zs_huge_class_size);
static unsigned long obj_malloc(struct zs_pool *pool,
struct zspage *zspage, unsigned long handle)
{
- int i, nr_page, offset;
+ int i, nr_zpdesc, offset;
unsigned long obj;
struct link_free *link;
struct size_class *class;
- struct page *m_page;
+ struct zpdesc *m_zpdesc;
unsigned long m_offset;
void *vaddr;
@@ -1303,27 +1297,26 @@ static unsigned long obj_malloc(struct zs_pool *pool,
obj = get_freeobj(zspage);
offset = obj * class->size;
- nr_page = offset >> PAGE_SHIFT;
+ nr_zpdesc = offset >> PAGE_SHIFT;
m_offset = offset_in_page(offset);
- m_page = get_first_page(zspage);
+ m_zpdesc = get_first_zpdesc(zspage);
- for (i = 0; i < nr_page; i++)
- m_page = get_next_page(m_page);
+ for (i = 0; i < nr_zpdesc; i++)
+ m_zpdesc = get_next_zpdesc(m_zpdesc);
- vaddr = kmap_local_page(m_page);
+ vaddr = kmap_local_zpdesc(m_zpdesc);
link = (struct link_free *)vaddr + m_offset / sizeof(*link);
set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
if (likely(!ZsHugePage(zspage)))
/* record handle in the header of allocated chunk */
link->handle = handle | OBJ_ALLOCATED_TAG;
else
- /* record handle to page->index */
- zspage->first_page->index = handle | OBJ_ALLOCATED_TAG;
+ zspage->first_zpdesc->handle = handle | OBJ_ALLOCATED_TAG;
kunmap_local(vaddr);
mod_zspage_inuse(zspage, 1);
- obj = location_to_obj(m_page, obj);
+ obj = location_to_obj(m_zpdesc, obj);
record_obj(handle, obj);
return obj;
@@ -1335,12 +1328,14 @@ static unsigned long obj_malloc(struct zs_pool *pool,
* @pool: pool to allocate from
* @size: size of block to allocate
* @gfp: gfp flags when allocating object
+ * @nid: The preferred node id to allocate a new zspage on (if needed)
*
* On success, handle to the allocated object is returned,
* otherwise an ERR_PTR().
* Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
*/
-unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
+ const int nid)
{
unsigned long handle;
struct size_class *class;
@@ -1375,7 +1370,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
spin_unlock(&class->lock);
- zspage = alloc_zspage(pool, class, gfp);
+ zspage = alloc_zspage(pool, class, gfp, nid);
if (!zspage) {
cache_free_handle(pool, handle);
return (unsigned long)ERR_PTR(-ENOMEM);
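
Based on the updated signature, a minimal caller sketch for the new node hint (all names are illustrative, not from the patch): NUMA_NO_NODE expresses no preference, while something like page_to_nid() keeps the zspage near the data being compressed, as the zswap changes further below do at the zpool level. Failure follows the ERR_PTR-in-unsigned-long convention visible just above.

	static unsigned long example_alloc(struct zs_pool *pool,
					   struct page *page, size_t len)
	{
		unsigned long handle;

		/* Prefer allocating the backing zspage on the page's node. */
		handle = zs_malloc(pool, len, GFP_NOWAIT | __GFP_NORETRY,
				   page_to_nid(page));
		if (IS_ERR_VALUE(handle))
			return 0;	/* treat as allocation failure */

		return handle;
	}
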
@@ -1402,23 +1397,24 @@ static void obj_free(int class_size, unsigned long obj)
{
struct link_free *link;
struct zspage *zspage;
- struct page *f_page;
+ struct zpdesc *f_zpdesc;
unsigned long f_offset;
unsigned int f_objidx;
void *vaddr;
- obj_to_location(obj, &f_page, &f_objidx);
+
+ obj_to_location(obj, &f_zpdesc, &f_objidx);
f_offset = offset_in_page(class_size * f_objidx);
- zspage = get_zspage(f_page);
+ zspage = get_zspage(f_zpdesc);
- vaddr = kmap_local_page(f_page);
+ vaddr = kmap_local_zpdesc(f_zpdesc);
link = (struct link_free *)(vaddr + f_offset);
/* Insert this object in containing zspage's freelist */
if (likely(!ZsHugePage(zspage)))
link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
else
- f_page->index = 0;
+ f_zpdesc->handle = 0;
set_freeobj(zspage, f_objidx);
kunmap_local(vaddr);
@@ -1428,7 +1424,7 @@ static void obj_free(int class_size, unsigned long obj)
void zs_free(struct zs_pool *pool, unsigned long handle)
{
struct zspage *zspage;
- struct page *f_page;
+ struct zpdesc *f_zpdesc;
unsigned long obj;
struct size_class *class;
int fullness;
@@ -1437,16 +1433,16 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
return;
/*
- * The pool->migrate_lock protects the race with zpage's migration
+ * The pool->lock protects the race with zpage's migration
* so it's safe to get the page from handle.
*/
- read_lock(&pool->migrate_lock);
+ read_lock(&pool->lock);
obj = handle_to_obj(handle);
- obj_to_page(obj, &f_page);
- zspage = get_zspage(f_page);
+ obj_to_zpdesc(obj, &f_zpdesc);
+ zspage = get_zspage(f_zpdesc);
class = zspage_class(pool, zspage);
spin_lock(&class->lock);
- read_unlock(&pool->migrate_lock);
+ read_unlock(&pool->lock);
class_stat_sub(class, ZS_OBJS_INUSE, 1);
obj_free(class->size, obj);
@@ -1463,7 +1459,7 @@ EXPORT_SYMBOL_GPL(zs_free);
static void zs_object_copy(struct size_class *class, unsigned long dst,
unsigned long src)
{
- struct page *s_page, *d_page;
+ struct zpdesc *s_zpdesc, *d_zpdesc;
unsigned int s_objidx, d_objidx;
unsigned long s_off, d_off;
void *s_addr, *d_addr;
@@ -1472,8 +1468,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
s_size = d_size = class->size;
- obj_to_location(src, &s_page, &s_objidx);
- obj_to_location(dst, &d_page, &d_objidx);
+ obj_to_location(src, &s_zpdesc, &s_objidx);
+ obj_to_location(dst, &d_zpdesc, &d_objidx);
s_off = offset_in_page(class->size * s_objidx);
d_off = offset_in_page(class->size * d_objidx);
@@ -1484,8 +1480,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
if (d_off + class->size > PAGE_SIZE)
d_size = PAGE_SIZE - d_off;
- s_addr = kmap_local_page(s_page);
- d_addr = kmap_local_page(d_page);
+ s_addr = kmap_local_zpdesc(s_zpdesc);
+ d_addr = kmap_local_zpdesc(d_zpdesc);
while (1) {
size = min(s_size, d_size);
@@ -1510,17 +1506,17 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
if (s_off >= PAGE_SIZE) {
kunmap_local(d_addr);
kunmap_local(s_addr);
- s_page = get_next_page(s_page);
- s_addr = kmap_local_page(s_page);
- d_addr = kmap_local_page(d_page);
+ s_zpdesc = get_next_zpdesc(s_zpdesc);
+ s_addr = kmap_local_zpdesc(s_zpdesc);
+ d_addr = kmap_local_zpdesc(d_zpdesc);
s_size = class->size - written;
s_off = 0;
}
if (d_off >= PAGE_SIZE) {
kunmap_local(d_addr);
- d_page = get_next_page(d_page);
- d_addr = kmap_local_page(d_page);
+ d_zpdesc = get_next_zpdesc(d_zpdesc);
+ d_addr = kmap_local_zpdesc(d_zpdesc);
d_size = class->size - written;
d_off = 0;
}
@@ -1535,18 +1531,18 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
* return handle.
*/
static unsigned long find_alloced_obj(struct size_class *class,
- struct page *page, int *obj_idx)
+ struct zpdesc *zpdesc, int *obj_idx)
{
unsigned int offset;
int index = *obj_idx;
unsigned long handle = 0;
- void *addr = kmap_local_page(page);
+ void *addr = kmap_local_zpdesc(zpdesc);
- offset = get_first_obj_offset(page);
+ offset = get_first_obj_offset(zpdesc);
offset += class->size * index;
while (offset < PAGE_SIZE) {
- if (obj_allocated(page, addr + offset, &handle))
+ if (obj_allocated(zpdesc, addr + offset, &handle))
break;
offset += class->size;
@@ -1566,14 +1562,14 @@ static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage,
unsigned long used_obj, free_obj;
unsigned long handle;
int obj_idx = 0;
- struct page *s_page = get_first_page(src_zspage);
+ struct zpdesc *s_zpdesc = get_first_zpdesc(src_zspage);
struct size_class *class = pool->size_class[src_zspage->class];
while (1) {
- handle = find_alloced_obj(class, s_page, &obj_idx);
+ handle = find_alloced_obj(class, s_zpdesc, &obj_idx);
if (!handle) {
- s_page = get_next_page(s_page);
- if (!s_page)
+ s_zpdesc = get_next_zpdesc(s_zpdesc);
+ if (!s_zpdesc)
break;
obj_idx = 0;
continue;
@@ -1653,104 +1649,78 @@ static int putback_zspage(struct size_class *class, struct zspage *zspage)
*/
static void lock_zspage(struct zspage *zspage)
{
- struct page *curr_page, *page;
+ struct zpdesc *curr_zpdesc, *zpdesc;
/*
* Pages we haven't locked yet can be migrated off the list while we're
* trying to lock them, so we need to be careful and only attempt to
- * lock each page under migrate_read_lock(). Otherwise, the page we lock
+ * lock each page under zspage_read_lock(). Otherwise, the page we lock
* may no longer belong to the zspage. This means that we may wait for
* the wrong page to unlock, so we must take a reference to the page
- * prior to waiting for it to unlock outside migrate_read_lock().
+ * prior to waiting for it to unlock outside zspage_read_lock().
*/
while (1) {
- migrate_read_lock(zspage);
- page = get_first_page(zspage);
- if (trylock_page(page))
+ zspage_read_lock(zspage);
+ zpdesc = get_first_zpdesc(zspage);
+ if (zpdesc_trylock(zpdesc))
break;
- get_page(page);
- migrate_read_unlock(zspage);
- wait_on_page_locked(page);
- put_page(page);
+ zpdesc_get(zpdesc);
+ zspage_read_unlock(zspage);
+ zpdesc_wait_locked(zpdesc);
+ zpdesc_put(zpdesc);
}
- curr_page = page;
- while ((page = get_next_page(curr_page))) {
- if (trylock_page(page)) {
- curr_page = page;
+ curr_zpdesc = zpdesc;
+ while ((zpdesc = get_next_zpdesc(curr_zpdesc))) {
+ if (zpdesc_trylock(zpdesc)) {
+ curr_zpdesc = zpdesc;
} else {
- get_page(page);
- migrate_read_unlock(zspage);
- wait_on_page_locked(page);
- put_page(page);
- migrate_read_lock(zspage);
+ zpdesc_get(zpdesc);
+ zspage_read_unlock(zspage);
+ zpdesc_wait_locked(zpdesc);
+ zpdesc_put(zpdesc);
+ zspage_read_lock(zspage);
}
}
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
}
#endif /* CONFIG_COMPACTION */
-static void migrate_lock_init(struct zspage *zspage)
-{
- rwlock_init(&zspage->lock);
-}
-
-static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock)
-{
- read_lock(&zspage->lock);
-}
-
-static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
-{
- read_unlock(&zspage->lock);
-}
-
-static void migrate_write_lock(struct zspage *zspage)
-{
- write_lock(&zspage->lock);
-}
-
-static void migrate_write_unlock(struct zspage *zspage)
-{
- write_unlock(&zspage->lock);
-}
-
#ifdef CONFIG_COMPACTION
-static const struct movable_operations zsmalloc_mops;
-
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
- struct page *newpage, struct page *oldpage)
+ struct zpdesc *newzpdesc, struct zpdesc *oldzpdesc)
{
- struct page *page;
- struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, };
+ struct zpdesc *zpdesc;
+ struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, };
+ unsigned int first_obj_offset;
int idx = 0;
- page = get_first_page(zspage);
+ zpdesc = get_first_zpdesc(zspage);
do {
- if (page == oldpage)
- pages[idx] = newpage;
+ if (zpdesc == oldzpdesc)
+ zpdescs[idx] = newzpdesc;
else
- pages[idx] = page;
+ zpdescs[idx] = zpdesc;
idx++;
- } while ((page = get_next_page(page)) != NULL);
+ } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL);
- create_page_chain(class, zspage, pages);
- set_first_obj_offset(newpage, get_first_obj_offset(oldpage));
+ create_page_chain(class, zspage, zpdescs);
+ first_obj_offset = get_first_obj_offset(oldzpdesc);
+ set_first_obj_offset(newzpdesc, first_obj_offset);
if (unlikely(ZsHugePage(zspage)))
- newpage->index = oldpage->index;
- __SetPageMovable(newpage, &zsmalloc_mops);
+ newzpdesc->handle = oldzpdesc->handle;
+ __zpdesc_set_movable(newzpdesc);
}
static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
{
/*
- * Page is locked so zspage couldn't be destroyed. For detail, look at
- * lock_zspage in free_zspage.
+ * Page is locked so zspage can't be destroyed concurrently
+ * (see free_zspage()). But if the page was already destroyed
+ * (see reset_zpdesc()), refuse isolation here.
*/
- VM_BUG_ON_PAGE(PageIsolated(page), page);
-
- return true;
+ return page_zpdesc(page)->zspage;
}
static int zs_page_migrate(struct page *newpage, struct page *page,
@@ -1759,86 +1729,98 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
struct zs_pool *pool;
struct size_class *class;
struct zspage *zspage;
- struct page *dummy;
+ struct zpdesc *dummy;
+ struct zpdesc *newzpdesc = page_zpdesc(newpage);
+ struct zpdesc *zpdesc = page_zpdesc(page);
void *s_addr, *d_addr, *addr;
unsigned int offset;
unsigned long handle;
unsigned long old_obj, new_obj;
unsigned int obj_idx;
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
-
- /* We're committed, tell the world that this is a Zsmalloc page. */
- __SetPageZsmalloc(newpage);
+ /*
+	 * TODO: nothing prevents a zspage from getting destroyed while
+	 * it is isolated for migration, as the page lock is temporarily
+	 * dropped after zs_page_isolate() has succeeded; we should rework
+	 * that and defer destroying such pages until they are un-isolated
+	 * (putback) instead.
+ */
+ if (!zpdesc->zspage)
+ return 0;
/* The page is locked, so this pointer must remain valid */
- zspage = get_zspage(page);
+ zspage = get_zspage(zpdesc);
pool = zspage->pool;
/*
* The pool migrate_lock protects the race between zpage migration
* and zs_free.
*/
- write_lock(&pool->migrate_lock);
+ write_lock(&pool->lock);
class = zspage_class(pool, zspage);
/*
* the class lock protects zpage alloc/free in the zspage.
*/
spin_lock(&class->lock);
- /* the migrate_write_lock protects zpage access via zs_map_object */
- migrate_write_lock(zspage);
+ /* the zspage write_lock protects zpage access via zs_obj_read/write() */
+ if (!zspage_write_trylock(zspage)) {
+ spin_unlock(&class->lock);
+ write_unlock(&pool->lock);
+ return -EINVAL;
+ }
+
+ /* We're committed, tell the world that this is a Zsmalloc page. */
+ __zpdesc_set_zsmalloc(newzpdesc);
- offset = get_first_obj_offset(page);
- s_addr = kmap_local_page(page);
+ offset = get_first_obj_offset(zpdesc);
+ s_addr = kmap_local_zpdesc(zpdesc);
/*
* Here, any user cannot access all objects in the zspage so let's move.
*/
- d_addr = kmap_local_page(newpage);
+ d_addr = kmap_local_zpdesc(newzpdesc);
copy_page(d_addr, s_addr);
kunmap_local(d_addr);
for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE;
addr += class->size) {
- if (obj_allocated(page, addr, &handle)) {
+ if (obj_allocated(zpdesc, addr, &handle)) {
old_obj = handle_to_obj(handle);
obj_to_location(old_obj, &dummy, &obj_idx);
- new_obj = (unsigned long)location_to_obj(newpage,
- obj_idx);
+ new_obj = (unsigned long)location_to_obj(newzpdesc, obj_idx);
record_obj(handle, new_obj);
}
}
kunmap_local(s_addr);
- replace_sub_page(class, zspage, newpage, page);
+ replace_sub_page(class, zspage, newzpdesc, zpdesc);
/*
* Since we complete the data copy and set up new zspage structure,
* it's okay to release migration_lock.
*/
- write_unlock(&pool->migrate_lock);
+ write_unlock(&pool->lock);
spin_unlock(&class->lock);
- migrate_write_unlock(zspage);
+ zspage_write_unlock(zspage);
- get_page(newpage);
- if (page_zone(newpage) != page_zone(page)) {
- dec_zone_page_state(page, NR_ZSPAGES);
- inc_zone_page_state(newpage, NR_ZSPAGES);
+ zpdesc_get(newzpdesc);
+ if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) {
+ zpdesc_dec_zone_page_state(zpdesc);
+ zpdesc_inc_zone_page_state(newzpdesc);
}
- reset_page(page);
- put_page(page);
+ reset_zpdesc(zpdesc);
+ zpdesc_put(zpdesc);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
static void zs_page_putback(struct page *page)
{
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
}
-static const struct movable_operations zsmalloc_mops = {
+const struct movable_operations zsmalloc_mops = {
.isolate_page = zs_page_isolate,
.migrate_page = zs_page_migrate,
.putback_page = zs_page_putback,
@@ -1897,13 +1879,13 @@ static void init_deferred_free(struct zs_pool *pool)
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
{
- struct page *page = get_first_page(zspage);
+ struct zpdesc *zpdesc = get_first_zpdesc(zspage);
do {
- WARN_ON(!trylock_page(page));
- __SetPageMovable(page, &zsmalloc_mops);
- unlock_page(page);
- } while ((page = get_next_page(page)) != NULL);
+ WARN_ON(!zpdesc_trylock(zpdesc));
+ __zpdesc_set_movable(zpdesc);
+ zpdesc_unlock(zpdesc);
+ } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL);
}
#else
static inline void zs_flush_migration(struct zs_pool *pool) { }
@@ -1940,7 +1922,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
* protect the race between zpage migration and zs_free
* as well as zpage allocation/free
*/
- write_lock(&pool->migrate_lock);
+ write_lock(&pool->lock);
spin_lock(&class->lock);
while (zs_can_compact(class)) {
int fg;
@@ -1955,9 +1937,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
if (!src_zspage)
break;
- migrate_write_lock(src_zspage);
+ if (!zspage_write_trylock(src_zspage))
+ break;
+
migrate_zspage(pool, src_zspage, dst_zspage);
- migrate_write_unlock(src_zspage);
+ zspage_write_unlock(src_zspage);
fg = putback_zspage(class, src_zspage);
if (fg == ZS_INUSE_RATIO_0) {
@@ -1967,14 +1951,14 @@ static unsigned long __zs_compact(struct zs_pool *pool,
src_zspage = NULL;
if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100
- || rwlock_is_contended(&pool->migrate_lock)) {
+ || rwlock_is_contended(&pool->lock)) {
putback_zspage(class, dst_zspage);
dst_zspage = NULL;
spin_unlock(&class->lock);
- write_unlock(&pool->migrate_lock);
+ write_unlock(&pool->lock);
cond_resched();
- write_lock(&pool->migrate_lock);
+ write_lock(&pool->lock);
spin_lock(&class->lock);
}
}
@@ -1986,7 +1970,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
putback_zspage(class, dst_zspage);
spin_unlock(&class->lock);
- write_unlock(&pool->migrate_lock);
+ write_unlock(&pool->lock);
return pages_freed;
}
@@ -1998,10 +1982,10 @@ unsigned long zs_compact(struct zs_pool *pool)
unsigned long pages_freed = 0;
/*
- * Pool compaction is performed under pool->migrate_lock so it is basically
+ * Pool compaction is performed under pool->lock so it is basically
* single-threaded. Having more than one thread in __zs_compact()
- * will increase pool->migrate_lock contention, which will impact other
- * zsmalloc operations that need pool->migrate_lock.
+ * will increase pool->lock contention, which will impact other
+ * zsmalloc operations that need pool->lock.
*/
if (atomic_xchg(&pool->compaction_in_progress, 1))
return 0;
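
As a side note, the compaction_in_progress guard described in the comment above is a plain single-flight pattern built on atomic_xchg(): the first caller sees the old value 0 and proceeds, everyone else bails out. A generic sketch with illustrative names:

	static atomic_t work_busy = ATOMIC_INIT(0);

	static unsigned long run_exclusive_work(void)
	{
		unsigned long done = 0;

		/* Old value 1 means another caller is already inside. */
		if (atomic_xchg(&work_busy, 1))
			return 0;

		/* ... the actual work, e.g. walking the size classes ... */

		atomic_set(&work_busy, 0);
		return done;
	}
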
@@ -2123,7 +2107,7 @@ struct zs_pool *zs_create_pool(const char *name)
return NULL;
init_deferred_free(pool);
- rwlock_init(&pool->migrate_lock);
+ rwlock_init(&pool->lock);
atomic_set(&pool->compaction_in_progress, 0);
pool->name = kstrdup(name, GFP_KERNEL);
@@ -2262,23 +2246,18 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool);
static int __init zs_init(void)
{
- int ret;
-
- ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare",
- zs_cpu_prepare, zs_cpu_dead);
- if (ret)
- goto out;
+ int rc __maybe_unused;
#ifdef CONFIG_ZPOOL
zpool_register_driver(&zs_zpool_driver);
#endif
-
+#ifdef CONFIG_COMPACTION
+ rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc);
+ if (rc)
+ return rc;
+#endif
zs_stat_init();
-
return 0;
-
-out:
- return ret;
}
static void __exit zs_exit(void)
@@ -2286,8 +2265,9 @@ static void __exit zs_exit(void)
#ifdef CONFIG_ZPOOL
zpool_unregister_driver(&zs_zpool_driver);
#endif
- cpuhp_remove_state(CPUHP_MM_ZS_PREPARE);
-
+#ifdef CONFIG_COMPACTION
+ set_movable_ops(NULL, PGTY_zsmalloc);
+#endif
zs_stat_exit();
}
diff --git a/mm/zswap.c b/mm/zswap.c
index 6e0c0fca5830..3c0fd8a13718 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -43,7 +43,7 @@
* statistics
**********************************/
/* The number of compressed pages currently stored in zswap */
-atomic_long_t zswap_stored_pages = ATOMIC_INIT(0);
+atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0);
/*
* The statistics below are not protected from concurrent access for
@@ -62,6 +62,8 @@ static u64 zswap_reject_reclaim_fail;
static u64 zswap_reject_compress_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
+/* Load or writeback failed due to decompression failure */
+static u64 zswap_decompress_fail;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
@@ -881,18 +883,32 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ struct acomp_req *req;
+ struct crypto_acomp *acomp;
+ u8 *buffer;
+
+ if (IS_ERR_OR_NULL(acomp_ctx))
+ return 0;
mutex_lock(&acomp_ctx->mutex);
- if (!IS_ERR_OR_NULL(acomp_ctx)) {
- if (!IS_ERR_OR_NULL(acomp_ctx->req))
- acomp_request_free(acomp_ctx->req);
- acomp_ctx->req = NULL;
- if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
- crypto_free_acomp(acomp_ctx->acomp);
- kfree(acomp_ctx->buffer);
- }
+ req = acomp_ctx->req;
+ acomp = acomp_ctx->acomp;
+ buffer = acomp_ctx->buffer;
+ acomp_ctx->req = NULL;
+ acomp_ctx->acomp = NULL;
+ acomp_ctx->buffer = NULL;
mutex_unlock(&acomp_ctx->mutex);
+ /*
+ * Do the actual freeing after releasing the mutex to avoid subtle
+ * locking dependencies causing deadlocks.
+ */
+ if (!IS_ERR_OR_NULL(req))
+ acomp_request_free(req);
+ if (!IS_ERR_OR_NULL(acomp))
+ crypto_free_acomp(acomp);
+ kfree(buffer);
+
return 0;
}
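
The rework above follows a common shape: snapshot and clear the pointers while holding the mutex, then free outside of it so the freeing paths cannot nest under the lock. A generic sketch of that pattern, with hypothetical names:

	struct ctx {
		struct mutex lock;
		void *buf;
	};

	static void ctx_teardown(struct ctx *c)
	{
		void *buf;

		mutex_lock(&c->lock);
		buf = c->buf;		/* detach under the lock */
		c->buf = NULL;
		mutex_unlock(&c->lock);

		kfree(buf);		/* free with no locks held */
	}
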
@@ -930,7 +946,6 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
unsigned int dlen = PAGE_SIZE;
unsigned long handle;
struct zpool *zpool;
- char *buf;
gfp_t gfp;
u8 *dst;
@@ -965,17 +980,12 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
goto unlock;
zpool = pool->zpool;
- gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
- if (zpool_malloc_support_movable(zpool))
- gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
- alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle);
+ gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
+ alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle, page_to_nid(page));
if (alloc_ret)
goto unlock;
- buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
- memcpy(buf, dst, dlen);
- zpool_unmap_handle(zpool, handle);
-
+ zpool_obj_write(zpool, handle, dst, dlen);
entry->handle = handle;
entry->length = dlen;
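
Putting the zpool-side changes together, a hedged sketch of the store path that this hunk and the read path further below now form (error handling trimmed; the function name, "nid" and "len" are illustrative):

	static int example_store(struct zpool *zpool, void *src, size_t len,
				 int nid, unsigned long *handlep)
	{
		gfp_t gfp = GFP_NOWAIT | __GFP_NORETRY |
			    __GFP_HIGHMEM | __GFP_MOVABLE;
		int err;

		err = zpool_malloc(zpool, len, gfp, handlep, nid);
		if (err)
			return err;

		zpool_obj_write(zpool, *handlep, src, len);
		return 0;
	}
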
@@ -991,41 +1001,49 @@ unlock:
return comp_ret == 0 && alloc_ret == 0;
}
-static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
+static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
{
struct zpool *zpool = entry->pool->zpool;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
- u8 *src;
+ int decomp_ret, dlen;
+ u8 *src, *obj;
acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
- src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
+ obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);
+
/*
- * If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer
- * to do crypto_acomp_decompress() which might sleep. In such cases, we must
- * resort to copying the buffer to a temporary one.
- * Meanwhile, zpool_map_handle() might return a non-linearly mapped buffer,
- * such as a kmap address of high memory or even ever a vmap address.
- * However, sg_init_one is only equipped to handle linearly mapped low memory.
- * In such cases, we also must copy the buffer to a temporary and lowmem one.
+ * zpool_obj_read_begin() might return a kmap address of highmem when
+ * acomp_ctx->buffer is not used. However, sg_init_one() does not
+ * handle highmem addresses, so copy the object to acomp_ctx->buffer.
*/
- if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) ||
- !virt_addr_valid(src)) {
- memcpy(acomp_ctx->buffer, src, entry->length);
+ if (virt_addr_valid(obj)) {
+ src = obj;
+ } else {
+ WARN_ON_ONCE(obj == acomp_ctx->buffer);
+ memcpy(acomp_ctx->buffer, obj, entry->length);
src = acomp_ctx->buffer;
- zpool_unmap_handle(zpool, entry->handle);
}
sg_init_one(&input, src, entry->length);
sg_init_table(&output, 1);
sg_set_folio(&output, folio, PAGE_SIZE, 0);
acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
- BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
- BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
+ decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+ dlen = acomp_ctx->req->dlen;
- if (src != acomp_ctx->buffer)
- zpool_unmap_handle(zpool, entry->handle);
+ zpool_obj_read_end(zpool, entry->handle, obj);
acomp_ctx_put_unlock(acomp_ctx);
+
+ if (!decomp_ret && dlen == PAGE_SIZE)
+ return true;
+
+ zswap_decompress_fail++;
+ pr_alert_ratelimited("Decompression error from zswap (%d:%lu %s %u->%d)\n",
+ swp_type(entry->swpentry),
+ swp_offset(entry->swpentry),
+ entry->pool->tfm_name, entry->length, dlen);
+ return false;
}
/*********************************
@@ -1051,14 +1069,18 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
struct folio *folio;
struct mempolicy *mpol;
bool folio_was_allocated;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- };
+ struct swap_info_struct *si;
+ int ret = 0;
/* try to allocate swap cache folio */
+ si = get_swap_device(swpentry);
+ if (!si)
+ return -EEXIST;
+
mpol = get_task_policy(current);
folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
- NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
+ NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
+ put_swap_device(si);
if (!folio)
return -ENOMEM;
@@ -1070,8 +1092,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
* and freed when invalidated by the concurrent shrinker anyway.
*/
if (!folio_was_allocated) {
- folio_put(folio);
- return -EEXIST;
+ ret = -EEXIST;
+ goto out;
}
/*
@@ -1084,14 +1106,17 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
* be dereferenced.
*/
tree = swap_zswap_tree(swpentry);
- if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) {
- delete_from_swap_cache(folio);
- folio_unlock(folio);
- folio_put(folio);
- return -ENOMEM;
+ if (entry != xa_load(tree, offset)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (!zswap_decompress(entry, folio)) {
+ ret = -EIO;
+ goto out;
}
- zswap_decompress(entry, folio);
+ xa_erase(tree, offset);
count_vm_event(ZSWPWB);
if (entry->objcg)
@@ -1106,10 +1131,15 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
folio_set_reclaim(folio);
/* start writeback */
- __swap_writepage(folio, &wbc);
- folio_put(folio);
+ __swap_writepage(folio, NULL);
- return 0;
+out:
+ if (ret && ret != -EEXIST) {
+ delete_from_swap_cache(folio);
+ folio_unlock(folio);
+ }
+ folio_put(folio);
+ return ret;
}
/*********************************
@@ -1192,7 +1222,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
/*
* It's safe to drop the lock here because we return either
- * LRU_REMOVED_RETRY or LRU_RETRY.
+ * LRU_REMOVED_RETRY, LRU_RETRY or LRU_STOP.
*/
spin_unlock(&l->lock);
@@ -1609,7 +1639,27 @@ check_old:
return ret;
}
-bool zswap_load(struct folio *folio)
+/**
+ * zswap_load() - load a folio from zswap
+ * @folio: folio to load
+ *
+ * Return: 0 on success, with the folio unlocked and marked up-to-date, or one
+ * of the following error codes:
+ *
+ * -EIO: if the swapped out content was in zswap, but could not be loaded
+ * into the page due to a decompression failure. The folio is unlocked, but
+ * NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page()
+ * will SIGBUS).
+ *
+ * -EINVAL: if the swapped out content was in zswap, but the page belongs
+ * to a large folio, which is not supported by zswap. The folio is unlocked,
+ * but NOT marked up-to-date, so that an IO error is emitted (e.g.
+ * do_swap_page() will SIGBUS).
+ *
+ * -ENOENT: if the swapped out content was not in zswap. The folio remains
+ * locked on return.
+ */
+int zswap_load(struct folio *folio)
{
swp_entry_t swp = folio->swap;
pgoff_t offset = swp_offset(swp);
@@ -1620,18 +1670,32 @@ bool zswap_load(struct folio *folio)
VM_WARN_ON_ONCE(!folio_test_locked(folio));
if (zswap_never_enabled())
- return false;
+ return -ENOENT;
/*
* Large folios should not be swapped in while zswap is being used, as
* they are not properly handled. Zswap does not properly load large
* folios, and a large folio may only be partially in zswap.
- *
- * Return true without marking the folio uptodate so that an IO error is
- * emitted (e.g. do_swap_page() will sigbus).
*/
- if (WARN_ON_ONCE(folio_test_large(folio)))
- return true;
+ if (WARN_ON_ONCE(folio_test_large(folio))) {
+ folio_unlock(folio);
+ return -EINVAL;
+ }
+
+ entry = xa_load(tree, offset);
+ if (!entry)
+ return -ENOENT;
+
+ if (!zswap_decompress(entry, folio)) {
+ folio_unlock(folio);
+ return -EIO;
+ }
+
+ folio_mark_uptodate(folio);
+
+ count_vm_event(ZSWPIN);
+ if (entry->objcg)
+ count_objcg_events(entry->objcg, ZSWPIN, 1);
/*
* When reading into the swapcache, invalidate our entry. The
@@ -1645,27 +1709,14 @@ bool zswap_load(struct folio *folio)
* files, which reads into a private page and may free it if
* the fault fails. We remain the primary owner of the entry.)
*/
- if (swapcache)
- entry = xa_erase(tree, offset);
- else
- entry = xa_load(tree, offset);
-
- if (!entry)
- return false;
-
- zswap_decompress(entry, folio);
-
- count_vm_event(ZSWPIN);
- if (entry->objcg)
- count_objcg_events(entry->objcg, ZSWPIN, 1);
-
if (swapcache) {
- zswap_entry_free(entry);
folio_mark_dirty(folio);
+ xa_erase(tree, offset);
+ zswap_entry_free(entry);
}
- folio_mark_uptodate(folio);
- return true;
+ folio_unlock(folio);
+ return 0;
}
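
A hedged caller-side sketch of the contract documented above (this is not the actual swap read code; read_from_backing_device() is a hypothetical stand-in): 0 means the folio was filled, unlocked and marked up-to-date; -ENOENT means the entry is not in zswap and the folio is still locked, so the caller falls back to the swap device; any other error leaves the folio unlocked and not up-to-date so the fault path sees an I/O error.

	static void example_swap_read(struct folio *folio)
	{
		int err = zswap_load(folio);

		switch (err) {
		case 0:		/* served from zswap, nothing more to do */
			return;
		case -ENOENT:	/* not in zswap; folio is still locked */
			read_from_backing_device(folio);	/* hypothetical */
			return;
		default:	/* -EIO/-EINVAL: folio unlocked, !uptodate */
			return;
		}
	}
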
void zswap_invalidate(swp_entry_t swp)
@@ -1760,6 +1811,8 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, &zswap_reject_compress_fail);
debugfs_create_u64("reject_compress_poor", 0444,
zswap_debugfs_root, &zswap_reject_compress_poor);
+ debugfs_create_u64("decompress_fail", 0444,
+ zswap_debugfs_root, &zswap_decompress_fail);
debugfs_create_u64("written_back_pages", 0444,
zswap_debugfs_root, &zswap_written_back_pages);
debugfs_create_file("pool_total_size", 0444,