Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 193
-rw-r--r--  mm/Kconfig.debug | 11
-rw-r--r--  mm/Makefile | 14
-rw-r--r--  mm/backing-dev.c | 2
-rw-r--r--  mm/balloon_compaction.c | 2
-rw-r--r--  mm/bootmem_info.c | 4
-rw-r--r--  mm/cma.c | 761
-rw-r--r--  mm/cma.h | 51
-rw-r--r--  mm/cma_debug.c | 61
-rw-r--r--  mm/cma_sysfs.c | 20
-rw-r--r--  mm/compaction.c | 185
-rw-r--r--  mm/damon/Kconfig | 33
-rw-r--r--  mm/damon/Makefile | 1
-rw-r--r--  mm/damon/core.c | 614
-rw-r--r--  mm/damon/dbgfs.c | 1148
-rw-r--r--  mm/damon/modules-common.c | 2
-rw-r--r--  mm/damon/modules-common.h | 2
-rw-r--r--  mm/damon/ops-common.c | 27
-rw-r--r--  mm/damon/ops-common.h | 2
-rw-r--r--  mm/damon/paddr.c | 162
-rw-r--r--  mm/damon/reclaim.c | 2
-rw-r--r--  mm/damon/sysfs-common.c | 2
-rw-r--r--  mm/damon/sysfs-common.h | 18
-rw-r--r--  mm/damon/sysfs-schemes.c | 496
-rw-r--r--  mm/damon/sysfs.c | 541
-rw-r--r--  mm/damon/tests/.kunitconfig | 7
-rw-r--r--  mm/damon/tests/core-kunit.h | 90
-rw-r--r--  mm/damon/tests/dbgfs-kunit.h | 173
-rw-r--r--  mm/damon/tests/vaddr-kunit.h | 2
-rw-r--r--  mm/damon/vaddr.c | 5
-rw-r--r--  mm/debug.c | 108
-rw-r--r--  mm/debug_page_alloc.c | 2
-rw-r--r--  mm/debug_vm_pgtable.c | 18
-rw-r--r--  mm/dmapool.c | 15
-rw-r--r--  mm/early_ioremap.c | 8
-rw-r--r--  mm/execmem.c | 41
-rw-r--r--  mm/filemap.c | 288
-rw-r--r--  mm/folio-compat.c | 14
-rw-r--r--  mm/gup.c | 289
-rw-r--r--  mm/hmm.c | 264
-rw-r--r--  mm/huge_memory.c | 1119
-rw-r--r--  mm/hugetlb.c | 1404
-rw-r--r--  mm/hugetlb_cgroup.c | 49
-rw-r--r--  mm/hugetlb_cma.c | 278
-rw-r--r--  mm/hugetlb_cma.h | 57
-rw-r--r--  mm/hugetlb_vmemmap.c | 207
-rw-r--r--  mm/hugetlb_vmemmap.h | 23
-rw-r--r--  mm/init-mm.c | 3
-rw-r--r--  mm/internal.h | 217
-rw-r--r--  mm/io-mapping.c | 9
-rw-r--r--  mm/ioremap.c | 4
-rw-r--r--  mm/kasan/Makefile | 3
-rw-r--r--  mm/kasan/generic.c | 18
-rw-r--r--  mm/kasan/hw_tags.c | 5
-rw-r--r--  mm/kasan/kasan.h | 18
-rw-r--r--  mm/kasan/kasan_test_c.c | 32
-rw-r--r--  mm/kasan/report.c | 40
-rw-r--r--  mm/kasan/shadow.c | 92
-rw-r--r--  mm/kasan/sw_tags.c | 3
-rw-r--r--  mm/kfence/core.c | 2
-rw-r--r--  mm/kfence/kfence_test.c | 3
-rw-r--r--  mm/kfence/report.c | 3
-rw-r--r--  mm/khugepaged.c | 125
-rw-r--r--  mm/kmemleak.c | 85
-rw-r--r--  mm/kmsan/core.c | 12
-rw-r--r--  mm/kmsan/hooks.c | 7
-rw-r--r--  mm/kmsan/init.c | 3
-rw-r--r--  mm/kmsan/instrumentation.c | 4
-rw-r--r--  mm/kmsan/kmsan.h | 1
-rw-r--r--  mm/kmsan/kmsan_test.c | 1
-rw-r--r--  mm/kmsan/report.c | 6
-rw-r--r--  mm/kmsan/shadow.c | 15
-rw-r--r--  mm/ksm.c | 28
-rw-r--r--  mm/list_lru.c | 17
-rw-r--r--  mm/maccess.c | 2
-rw-r--r--  mm/madvise.c | 343
-rw-r--r--  mm/memblock.c | 432
-rw-r--r--  mm/memcontrol-v1.c | 133
-rw-r--r--  mm/memcontrol-v1.h | 52
-rw-r--r--  mm/memcontrol.c | 1095
-rw-r--r--  mm/memfd.c | 140
-rw-r--r--  mm/memory-failure.c | 92
-rw-r--r--  mm/memory.c | 1092
-rw-r--r--  mm/memory_hotplug.c | 91
-rw-r--r--  mm/mempolicy.c | 685
-rw-r--r--  mm/memremap.c | 68
-rw-r--r--  mm/migrate.c | 279
-rw-r--r--  mm/migrate_device.c | 147
-rw-r--r--  mm/mincore.c | 24
-rw-r--r--  mm/mlock.c | 2
-rw-r--r--  mm/mm_init.c | 256
-rw-r--r--  mm/mmap.c | 884
-rw-r--r--  mm/mmap_lock.c | 323
-rw-r--r--  mm/mmu_gather.c | 38
-rw-r--r--  mm/mmu_notifier.c | 2
-rw-r--r--  mm/mprotect.c | 18
-rw-r--r--  mm/mremap.c | 1480
-rw-r--r--  mm/mseal.c | 6
-rw-r--r--  mm/nommu.c | 150
-rw-r--r--  mm/numa.c | 12
-rw-r--r--  mm/numa_emulation.c | 45
-rw-r--r--  mm/numa_memblks.c | 24
-rw-r--r--  mm/oom_kill.c | 12
-rw-r--r--  mm/page-writeback.c | 148
-rw-r--r--  mm/page_alloc.c | 1290
-rw-r--r--  mm/page_counter.c | 8
-rw-r--r--  mm/page_ext.c | 13
-rw-r--r--  mm/page_frag_cache.c | 6
-rw-r--r--  mm/page_idle.c | 19
-rw-r--r--  mm/page_io.c | 10
-rw-r--r--  mm/page_isolation.c | 31
-rw-r--r--  mm/page_owner.c | 94
-rw-r--r--  mm/page_table_check.c | 78
-rw-r--r--  mm/page_vma_mapped.c | 16
-rw-r--r--  mm/percpu.c | 82
-rw-r--r--  mm/pgtable-generic.c | 2
-rw-r--r--  mm/pt_reclaim.c | 71
-rw-r--r--  mm/ptdump.c | 62
-rw-r--r--  mm/readahead.c | 72
-rw-r--r--  mm/rmap.c | 969
-rw-r--r--  mm/rodata_test.c | 7
-rw-r--r--  mm/secretmem.c | 24
-rw-r--r--  mm/shmem.c | 610
-rw-r--r--  mm/show_mem.c | 26
-rw-r--r--  mm/shrinker_debug.c | 24
-rw-r--r--  mm/slab.h | 38
-rw-r--r--  mm/slab_common.c | 920
-rw-r--r--  mm/slub.c | 472
-rw-r--r--  mm/sparse-vmemmap.c | 173
-rw-r--r--  mm/sparse.c | 97
-rw-r--r--  mm/swap.c | 100
-rw-r--r--  mm/swap.h | 52
-rw-r--r--  mm/swap_cgroup.c | 239
-rw-r--r--  mm/swap_slots.c | 353
-rw-r--r--  mm/swap_state.c | 101
-rw-r--r--  mm/swapfile.c | 1719
-rw-r--r--  mm/truncate.c | 120
-rw-r--r--  mm/usercopy.c | 18
-rw-r--r--  mm/userfaultfd.c | 248
-rw-r--r--  mm/util.c | 296
-rw-r--r--  mm/vma.c | 1204
-rw-r--r--  mm/vma.h | 227
-rw-r--r--  mm/vma_exec.c | 161
-rw-r--r--  mm/vma_init.c | 151
-rw-r--r--  mm/vma_internal.h | 1
-rw-r--r--  mm/vmalloc.c | 322
-rw-r--r--  mm/vmscan.c | 976
-rw-r--r--  mm/vmstat.c | 71
-rw-r--r--  mm/workingset.c | 69
-rw-r--r--  mm/z3fold.c | 1447
-rw-r--r--  mm/zbud.c | 455
-rw-r--r--  mm/zpdesc.h | 185
-rw-r--r--  mm/zpool.c | 105
-rw-r--r--  mm/zsmalloc.c | 928
-rw-r--r--  mm/zswap.c | 321
155 files changed, 19220 insertions, 13509 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 84000b016808..781be3240e21 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -129,7 +129,6 @@ choice
prompt "Default allocator"
depends on ZSWAP
default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if MMU
- default ZSWAP_ZPOOL_DEFAULT_ZBUD
help
Selects the default allocator for the compressed cache for
swap pages.
@@ -140,21 +139,6 @@ choice
The selection made here can be overridden by using the kernel
command line 'zswap.zpool=' option.
-config ZSWAP_ZPOOL_DEFAULT_ZBUD
- bool "zbud"
- select ZBUD
- help
- Use the zbud allocator as the default allocator.
-
-config ZSWAP_ZPOOL_DEFAULT_Z3FOLD_DEPRECATED
- bool "z3foldi (DEPRECATED)"
- select Z3FOLD_DEPRECATED
- help
- Use the z3fold allocator as the default allocator.
-
- Deprecated and scheduled for removal in a few cycles,
- see CONFIG_Z3FOLD_DEPRECATED.
-
config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
bool "zsmalloc"
select ZSMALLOC
@@ -165,40 +149,9 @@ endchoice
config ZSWAP_ZPOOL_DEFAULT
string
depends on ZSWAP
- default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD
- default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD_DEPRECATED
default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
default ""
-config ZBUD
- tristate "2:1 compression allocator (zbud)"
- depends on ZSWAP
- help
- A special purpose allocator for storing compressed pages.
- It is designed to store up to two compressed pages per physical
- page. While this design limits storage density, it has simple and
- deterministic reclaim properties that make it preferable to a higher
- density approach when reclaim will be used.
-
-config Z3FOLD_DEPRECATED
- tristate "3:1 compression allocator (z3fold) (DEPRECATED)"
- depends on ZSWAP
- help
- Deprecated and scheduled for removal in a few cycles. If you have
- a good reason for using Z3FOLD over ZSMALLOC, please contact
- linux-mm@kvack.org and the zswap maintainers.
-
- A special purpose allocator for storing compressed pages.
- It is designed to store up to three compressed pages per physical
- page. It is a ZBUD derivative so the simplicity and determinism are
- still there.
-
-config Z3FOLD
- tristate
- default y if Z3FOLD_DEPRECATED=y
- default m if Z3FOLD_DEPRECATED=m
- depends on Z3FOLD_DEPRECATED
-
config ZSMALLOC
tristate
prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM)
@@ -242,9 +195,13 @@ menu "Slab allocator options"
config SLUB
def_bool y
+config KVFREE_RCU_BATCHED
+ def_bool y
+ depends on !SLUB_TINY && !TINY_RCU
+
config SLUB_TINY
bool "Configure for minimal memory footprint"
- depends on EXPERT
+ depends on EXPERT && !COMPILE_TEST
select SLAB_MERGE_DEFAULT
help
Configures the slab allocator in a way to achieve minimal memory
@@ -489,6 +446,9 @@ config SPARSEMEM_VMEMMAP
SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
pfn_to_page and page_to_pfn operations. This is the most
efficient option when sufficient kernel resources are available.
+
+config SPARSEMEM_VMEMMAP_PREINIT
+ bool
#
# Select this config option from the architecture Kconfig, if it is preferred
# to enable the feature of HugeTLB/dev_dax vmemmap optimization.
@@ -499,6 +459,9 @@ config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
bool
+config ARCH_WANT_HUGETLB_VMEMMAP_PREINIT
+ bool
+
config HAVE_MEMBLOCK_PHYS_MAP
bool
@@ -506,6 +469,10 @@ config HAVE_GUP_FAST
depends on MMU
bool
+# Enable memblock support for scratch memory which is needed for kexec handover
+config MEMBLOCK_KHO_SCRATCH
+ bool
+
# Don't discard allocated memory used to track "memory" and "reserved" memblocks
# after early boot, so it can still be used to test for validity of memory.
# Also, memblocks are updated with memory hot(un)plug.
@@ -550,20 +517,63 @@ menuconfig MEMORY_HOTPLUG
if MEMORY_HOTPLUG
-config MEMORY_HOTPLUG_DEFAULT_ONLINE
- bool "Online the newly added memory blocks by default"
- depends on MEMORY_HOTPLUG
+choice
+ prompt "Memory Hotplug Default Online Type"
+ default MHP_DEFAULT_ONLINE_TYPE_OFFLINE
help
+ Default online type for hotplugged memory.
+
This option sets the default policy setting for memory hotplug
onlining policy (/sys/devices/system/memory/auto_online_blocks) which
determines what happens to newly added memory regions. Policy setting
can always be changed at runtime.
+
+ The default is 'offline'.
+
+ Select offline to defer onlining to drivers and user policy.
+ Select auto to let the kernel choose what zones to utilize.
+ Select online_kernel to generally allow kernel usage of this memory.
+ Select online_movable to generally disallow kernel usage of this memory.
+
+ Example kernel usage would be page structs and page tables.
+
See Documentation/admin-guide/mm/memory-hotplug.rst for more information.
- Say Y here if you want all hot-plugged memory blocks to appear in
- 'online' state by default.
- Say N here if you want the default policy to keep all hot-plugged
- memory blocks in 'offline' state.
+config MHP_DEFAULT_ONLINE_TYPE_OFFLINE
+ bool "offline"
+ help
+ Hotplugged memory will not be onlined by default.
+ Choose this for systems with drivers and user policy that
+ handle onlining of hotplugged memory.
+
+config MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO
+ bool "auto"
+ help
+ Select this if you want the kernel to automatically online
+ hotplugged memory into the zone it thinks is reasonable.
+ This memory may be utilized for kernel data.
+
+config MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL
+ bool "kernel"
+ help
+ Select this if you want the kernel to automatically online
+ hotplugged memory into a zone capable of being used for kernel
+ data. This typically means ZONE_NORMAL.
+
+config MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE
+ bool "movable"
+ help
+ Select this if you want the kernel to automatically online
+ hotplug memory into ZONE_MOVABLE. This memory will generally
+ not be utilized for kernel data.
+
+ This should only be used when the admin knows sufficient
+ ZONE_NORMAL memory is available to describe hotplug memory,
+ otherwise hotplug memory may fail to online. For example,
+ sufficient kernel-capable memory (ZONE_NORMAL) must be
+ available to allocate page structs to describe ZONE_MOVABLE.
+
+endchoice
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
@@ -813,11 +823,15 @@ config ARCH_WANT_GENERAL_HUGETLB
config ARCH_WANTS_THP_SWAP
def_bool n
+config MM_ID
+ def_bool n
+
menuconfig TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
select COMPACTION
select XARRAY_MULTI
+ select MM_ID
help
Transparent Hugepages allows the kernel to use huge pages and
huge tlb transparently to the applications whenever possible.
@@ -872,7 +886,7 @@ config THP_SWAP
config READ_ONLY_THP_FOR_FS
bool "Read-only THP for filesystems (EXPERIMENTAL)"
- depends on TRANSPARENT_HUGEPAGE && SHMEM
+ depends on TRANSPARENT_HUGEPAGE
help
Allow khugepaged to put read-only file-backed pages in THP.
@@ -881,8 +895,25 @@ config READ_ONLY_THP_FOR_FS
support of file THPs will be developed in the next few release
cycles.
+config NO_PAGE_MAPCOUNT
+ bool "No per-page mapcount (EXPERIMENTAL)"
+ help
+ Do not maintain per-page mapcounts for pages part of larger
+ allocations, such as transparent huge pages.
+
+ When this config option is enabled, some interfaces that relied on
+ this information will rely on less-precise per-allocation information
+ instead: for example, using the average per-page mapcount in such
+ a large allocation instead of the per-page mapcount.
+
+ EXPERIMENTAL because the impact of some changes is still unclear.
+
endif # TRANSPARENT_HUGEPAGE
+# simple helper to make the code a bit easier to read
+config PAGE_MAPCOUNT
+ def_bool !NO_PAGE_MAPCOUNT
+
#
# The architecture supports pgtable leaves that is larger than PAGE_SIZE
#
@@ -962,6 +993,40 @@ config CMA_AREAS
If unsure, leave the default value "8" in UMA and "20" in NUMA.
+#
+# Select this config option from the architecture Kconfig, if available, to set
+# the max page order for physically contiguous allocations.
+#
+config ARCH_FORCE_MAX_ORDER
+ int
+
+#
+# When ARCH_FORCE_MAX_ORDER is not defined,
+# the default page block order is MAX_PAGE_ORDER (10) as per
+# include/linux/mmzone.h.
+#
+config PAGE_BLOCK_ORDER
+ int "Page Block Order"
+ range 1 10 if ARCH_FORCE_MAX_ORDER = 0
+ default 10 if ARCH_FORCE_MAX_ORDER = 0
+ range 1 ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
+ default ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
+ help
+ The page block order refers to the power of two number of pages that
+ are physically contiguous and can have a migrate type associated to
+ them. The maximum size of the page block order is limited by
+ ARCH_FORCE_MAX_ORDER.
+
+ This config allows overriding the default page block order when the
+ page block order is required to be smaller than ARCH_FORCE_MAX_ORDER
+ or MAX_PAGE_ORDER.
+
+ Reducing pageblock order can negatively impact THP generation
+ success rate. If your workload uses THP heavily, please use this
+ option with caution.
+
+ Don't change if unsure.
+
config MEM_SOFT_DIRTY
bool "Track memory changes"
depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
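
For a sense of scale behind the PAGE_BLOCK_ORDER range above: the pageblock size is PAGE_SIZE shifted left by the order, so with 4 KiB base pages the default order of 10 gives 4 MiB pageblocks, while order 5 would shrink them to 128 KiB. A quick arithmetic sketch (assumes 4 KiB pages; not part of the patch):

#include <stdio.h>

/* Pageblock size for a given order, assuming 4 KiB base pages. */
int main(void)
{
        const unsigned long page_size = 4096;
        unsigned int order;

        for (order = 1; order <= 10; order++)
                printf("PAGE_BLOCK_ORDER %2u -> %7lu KiB pageblocks\n",
                       order, (page_size << order) / 1024);
        return 0;
}
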
@@ -1290,6 +1355,7 @@ config NUMA_MEMBLKS
config NUMA_EMU
bool "NUMA emulation"
depends on NUMA_MEMBLKS
+ depends on X86 || GENERIC_ARCH_NUMA
help
Enable NUMA emulation. A flat machine will be split
into virtual nodes when booted with "numa=fake=N", where N is the
@@ -1301,6 +1367,21 @@ config ARCH_HAS_USER_SHADOW_STACK
The architecture has hardware support for userspace shadow call
stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss).
+config ARCH_SUPPORTS_PT_RECLAIM
+ def_bool n
+
+config PT_RECLAIM
+ bool "reclaim empty user page table pages"
+ default y
+ depends on ARCH_SUPPORTS_PT_RECLAIM && MMU && SMP
+ select MMU_GATHER_RCU_TABLE_FREE
+ help
+ Try to reclaim empty user page table pages in paths other than munmap
+ and exit_mmap path.
+
+ Note: now only empty user PTE page table pages will be reclaimed.
+
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 41a58536531d..32b65073d0cc 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -186,8 +186,9 @@ config ARCH_HAS_DEBUG_WX
config DEBUG_WX
bool "Warn on W+X mappings at boot"
depends on ARCH_HAS_DEBUG_WX
+ depends on ARCH_HAS_PTDUMP
depends on MMU
- select PTDUMP_CORE
+ select PTDUMP
help
Generate a warning if any W+X mappings are found at boot.
@@ -212,18 +213,18 @@ config DEBUG_WX
If in doubt, say "Y".
-config GENERIC_PTDUMP
+config ARCH_HAS_PTDUMP
bool
-config PTDUMP_CORE
+config PTDUMP
bool
config PTDUMP_DEBUGFS
bool "Export kernel pagetable layout to userspace via debugfs"
depends on DEBUG_KERNEL
depends on DEBUG_FS
- depends on GENERIC_PTDUMP
- select PTDUMP_CORE
+ depends on ARCH_HAS_PTDUMP
+ select PTDUMP
help
Say Y here if you want to show the kernel pagetable layout in a
debugfs file. This information is only useful for kernel developers
diff --git a/mm/Makefile b/mm/Makefile
index dba52bb0da8a..1a7a11d4933d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -37,7 +37,7 @@ mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
- pgtable-generic.o rmap.o vmalloc.o vma.o
+ pgtable-generic.o rmap.o vmalloc.o vma.o vma_exec.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
@@ -55,7 +55,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o percpu.o slab_common.o \
compaction.o show_mem.o \
interval_tree.o list_lru.o workingset.o \
- debug.o gup.o mmap_lock.o $(mmu-y)
+ debug.o gup.o mmap_lock.o vma_init.o $(mmu-y)
# Give 'page_alloc' its own module-parameter namespace
page-alloc-y := page_alloc.o
@@ -75,10 +75,13 @@ ifdef CONFIG_MMU
obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o
endif
-obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o
+obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
+ifdef CONFIG_CMA
+obj-$(CONFIG_HUGETLBFS) += hugetlb_cma.o
+endif
obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
@@ -113,9 +116,7 @@ obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL) += zpool.o
-obj-$(CONFIG_ZBUD) += zbud.o
obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
-obj-$(CONFIG_Z3FOLD) += z3fold.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_NUMA) += numa.o
@@ -138,7 +139,7 @@ obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
-obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
+obj-$(CONFIG_PTDUMP) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
obj-$(CONFIG_IO_MAPPING) += io-mapping.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
@@ -146,3 +147,4 @@ obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
obj-$(CONFIG_EXECMEM) += execmem.o
obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
+obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e61bbb1bd622..783904d8c5ef 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1151,7 +1151,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
- del_timer_sync(&bdi->laptop_mode_wb_timer);
+ timer_delete_sync(&bdi->laptop_mode_wb_timer);
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 6597ebea8ae2..d3e00731e262 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -24,6 +24,7 @@ static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info,
balloon_page_insert(b_dev_info, page);
unlock_page(page);
__count_vm_event(BALLOON_INFLATE);
+ inc_node_page_state(page, NR_BALLOON_PAGES);
}
/**
@@ -103,6 +104,7 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
__count_vm_event(BALLOON_DEFLATE);
list_add(&page->lru, pages);
unlock_page(page);
+ dec_node_page_state(page, NR_BALLOON_PAGES);
n_pages++;
}
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
index 95f288169a38..b0e2a9fa641f 100644
--- a/mm/bootmem_info.c
+++ b/mm/bootmem_info.c
@@ -88,7 +88,9 @@ static void __init register_page_bootmem_info_section(unsigned long start_pfn)
memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
- register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
+ if (!preinited_vmemmap_section(ms))
+ register_page_bootmem_memmap(section_nr, memmap,
+ PAGES_PER_SECTION);
usage = ms->usage;
page = virt_to_page(usage);
diff --git a/mm/cma.c b/mm/cma.c
index de5bc0c81fc2..397567883a10 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -18,6 +18,7 @@
#include <linux/memblock.h>
#include <linux/err.h>
+#include <linux/list.h>
#include <linux/mm.h>
#include <linux/sizes.h>
#include <linux/slab.h>
@@ -33,11 +34,17 @@
struct cma cma_areas[MAX_CMA_AREAS];
unsigned int cma_area_count;
-static DEFINE_MUTEX(cma_mutex);
+
+static int __init __cma_declare_contiguous_nid(phys_addr_t *basep,
+ phys_addr_t size, phys_addr_t limit,
+ phys_addr_t alignment, unsigned int order_per_bit,
+ bool fixed, const char *name, struct cma **res_cma,
+ int nid);
phys_addr_t cma_get_base(const struct cma *cma)
{
- return PFN_PHYS(cma->base_pfn);
+ WARN_ON_ONCE(cma->nranges != 1);
+ return PFN_PHYS(cma->ranges[0].base_pfn);
}
unsigned long cma_get_size(const struct cma *cma)
@@ -63,9 +70,10 @@ static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
* The value returned is represented in order_per_bits.
*/
static unsigned long cma_bitmap_aligned_offset(const struct cma *cma,
+ const struct cma_memrange *cmr,
unsigned int align_order)
{
- return (cma->base_pfn & ((1UL << align_order) - 1))
+ return (cmr->base_pfn & ((1UL << align_order) - 1))
>> cma->order_per_bit;
}
@@ -75,65 +83,123 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit;
}
-static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
- unsigned long count)
+static void cma_clear_bitmap(struct cma *cma, const struct cma_memrange *cmr,
+ unsigned long pfn, unsigned long count)
{
unsigned long bitmap_no, bitmap_count;
unsigned long flags;
- bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit;
+ bitmap_no = (pfn - cmr->base_pfn) >> cma->order_per_bit;
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
spin_lock_irqsave(&cma->lock, flags);
- bitmap_clear(cma->bitmap, bitmap_no, bitmap_count);
+ bitmap_clear(cmr->bitmap, bitmap_no, bitmap_count);
+ cma->available_count += count;
spin_unlock_irqrestore(&cma->lock, flags);
}
-static void __init cma_activate_area(struct cma *cma)
+/*
+ * Check if a CMA area contains no ranges that intersect with
+ * multiple zones. Store the result in the flags in case
+ * this gets called more than once.
+ */
+bool cma_validate_zones(struct cma *cma)
{
- unsigned long base_pfn = cma->base_pfn, pfn;
- struct zone *zone;
-
- cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL);
- if (!cma->bitmap)
- goto out_error;
+ int r;
+ unsigned long base_pfn;
+ struct cma_memrange *cmr;
+ bool valid_bit_set;
/*
- * alloc_contig_range() requires the pfn range specified to be in the
- * same zone. Simplify by forcing the entire CMA resv range to be in the
- * same zone.
+ * If already validated, return result of previous check.
+ * Either the valid or invalid bit will be set if this
+ * check has already been done. If neither is set, the
+ * check has not been performed yet.
*/
- WARN_ON_ONCE(!pfn_valid(base_pfn));
- zone = page_zone(pfn_to_page(base_pfn));
- for (pfn = base_pfn + 1; pfn < base_pfn + cma->count; pfn++) {
- WARN_ON_ONCE(!pfn_valid(pfn));
- if (page_zone(pfn_to_page(pfn)) != zone)
- goto not_in_zone;
+ valid_bit_set = test_bit(CMA_ZONES_VALID, &cma->flags);
+ if (valid_bit_set || test_bit(CMA_ZONES_INVALID, &cma->flags))
+ return valid_bit_set;
+
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ base_pfn = cmr->base_pfn;
+
+ /*
+ * alloc_contig_range() requires the pfn range specified
+ * to be in the same zone. Simplify by forcing the entire
+ * CMA resv range to be in the same zone.
+ */
+ WARN_ON_ONCE(!pfn_valid(base_pfn));
+ if (pfn_range_intersects_zones(cma->nid, base_pfn, cmr->count)) {
+ set_bit(CMA_ZONES_INVALID, &cma->flags);
+ return false;
+ }
}
- for (pfn = base_pfn; pfn < base_pfn + cma->count;
- pfn += pageblock_nr_pages)
- init_cma_reserved_pageblock(pfn_to_page(pfn));
+ set_bit(CMA_ZONES_VALID, &cma->flags);
+
+ return true;
+}
+
+static void __init cma_activate_area(struct cma *cma)
+{
+ unsigned long pfn, end_pfn, early_pfn[CMA_MAX_RANGES];
+ int allocrange, r;
+ struct cma_memrange *cmr;
+ unsigned long bitmap_count, count;
+
+ for (allocrange = 0; allocrange < cma->nranges; allocrange++) {
+ cmr = &cma->ranges[allocrange];
+ early_pfn[allocrange] = cmr->early_pfn;
+ cmr->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma, cmr),
+ GFP_KERNEL);
+ if (!cmr->bitmap)
+ goto cleanup;
+ }
+
+ if (!cma_validate_zones(cma))
+ goto cleanup;
+
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ if (early_pfn[r] != cmr->base_pfn) {
+ count = early_pfn[r] - cmr->base_pfn;
+ bitmap_count = cma_bitmap_pages_to_bits(cma, count);
+ bitmap_set(cmr->bitmap, 0, bitmap_count);
+ }
+
+ for (pfn = early_pfn[r]; pfn < cmr->base_pfn + cmr->count;
+ pfn += pageblock_nr_pages)
+ init_cma_reserved_pageblock(pfn_to_page(pfn));
+ }
spin_lock_init(&cma->lock);
+ mutex_init(&cma->alloc_mutex);
+
#ifdef CONFIG_CMA_DEBUGFS
INIT_HLIST_HEAD(&cma->mem_head);
spin_lock_init(&cma->mem_head_lock);
#endif
+ set_bit(CMA_ACTIVATED, &cma->flags);
return;
-not_in_zone:
- bitmap_free(cma->bitmap);
-out_error:
+cleanup:
+ for (r = 0; r < allocrange; r++)
+ bitmap_free(cma->ranges[r].bitmap);
+
/* Expose all pages to the buddy, they are useless for CMA. */
- if (!cma->reserve_pages_on_error) {
- for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++)
- free_reserved_page(pfn_to_page(pfn));
+ if (!test_bit(CMA_RESERVE_PAGES_ON_ERROR, &cma->flags)) {
+ for (r = 0; r < allocrange; r++) {
+ cmr = &cma->ranges[r];
+ end_pfn = cmr->base_pfn + cmr->count;
+ for (pfn = early_pfn[r]; pfn < end_pfn; pfn++)
+ free_reserved_page(pfn_to_page(pfn));
+ }
}
totalcma_pages -= cma->count;
- cma->count = 0;
+ cma->available_count = cma->count = 0;
pr_err("CMA area %s could not be activated\n", cma->name);
}
@@ -150,7 +216,44 @@ core_initcall(cma_init_reserved_areas);
void __init cma_reserve_pages_on_error(struct cma *cma)
{
- cma->reserve_pages_on_error = true;
+ set_bit(CMA_RESERVE_PAGES_ON_ERROR, &cma->flags);
+}
+
+static int __init cma_new_area(const char *name, phys_addr_t size,
+ unsigned int order_per_bit,
+ struct cma **res_cma)
+{
+ struct cma *cma;
+
+ if (cma_area_count == ARRAY_SIZE(cma_areas)) {
+ pr_err("Not enough slots for CMA reserved regions!\n");
+ return -ENOSPC;
+ }
+
+ /*
+ * Each reserved area must be initialised later, when more kernel
+ * subsystems (like slab allocator) are available.
+ */
+ cma = &cma_areas[cma_area_count];
+ cma_area_count++;
+
+ if (name)
+ snprintf(cma->name, CMA_MAX_NAME, "%s", name);
+ else
+ snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count);
+
+ cma->available_count = cma->count = size >> PAGE_SHIFT;
+ cma->order_per_bit = order_per_bit;
+ *res_cma = cma;
+ totalcma_pages += cma->count;
+
+ return 0;
+}
+
+static void __init cma_drop_area(struct cma *cma)
+{
+ totalcma_pages -= cma->count;
+ cma_area_count--;
}
/**
@@ -171,13 +274,9 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
struct cma **res_cma)
{
struct cma *cma;
+ int ret;
/* Sanity checks */
- if (cma_area_count == ARRAY_SIZE(cma_areas)) {
- pr_err("Not enough slots for CMA reserved regions!\n");
- return -ENOSPC;
- }
-
if (!size || !memblock_is_region_reserved(base, size))
return -EINVAL;
@@ -194,25 +293,264 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES))
return -EINVAL;
+ ret = cma_new_area(name, size, order_per_bit, &cma);
+ if (ret != 0)
+ return ret;
+
+ cma->ranges[0].base_pfn = PFN_DOWN(base);
+ cma->ranges[0].early_pfn = PFN_DOWN(base);
+ cma->ranges[0].count = cma->count;
+ cma->nranges = 1;
+ cma->nid = NUMA_NO_NODE;
+
+ *res_cma = cma;
+
+ return 0;
+}
+
+/*
+ * Structure used while walking physical memory ranges and finding out
+ * which one(s) to use for a CMA area.
+ */
+struct cma_init_memrange {
+ phys_addr_t base;
+ phys_addr_t size;
+ struct list_head list;
+};
+
+/*
+ * Work array used during CMA initialization.
+ */
+static struct cma_init_memrange memranges[CMA_MAX_RANGES] __initdata;
+
+static bool __init revsizecmp(struct cma_init_memrange *mlp,
+ struct cma_init_memrange *mrp)
+{
+ return mlp->size > mrp->size;
+}
+
+static bool __init basecmp(struct cma_init_memrange *mlp,
+ struct cma_init_memrange *mrp)
+{
+ return mlp->base < mrp->base;
+}
+
+/*
+ * Helper function to create sorted lists.
+ */
+static void __init list_insert_sorted(
+ struct list_head *ranges,
+ struct cma_init_memrange *mrp,
+ bool (*cmp)(struct cma_init_memrange *lh, struct cma_init_memrange *rh))
+{
+ struct list_head *mp;
+ struct cma_init_memrange *mlp;
+
+ if (list_empty(ranges))
+ list_add(&mrp->list, ranges);
+ else {
+ list_for_each(mp, ranges) {
+ mlp = list_entry(mp, struct cma_init_memrange, list);
+ if (cmp(mlp, mrp))
+ break;
+ }
+ __list_add(&mrp->list, mlp->list.prev, &mlp->list);
+ }
+}
+
+/*
+ * Create CMA areas with a total size of @total_size. A normal allocation
+ * for one area is tried first. If that fails, the biggest memblock
+ * ranges above 4G are selected, and allocated bottom up.
+ *
+ * The complexity here is not great, but this function will only be
+ * called during boot, and the lists operated on have fewer than
+ * CMA_MAX_RANGES elements (default value: 8).
+ */
+int __init cma_declare_contiguous_multi(phys_addr_t total_size,
+ phys_addr_t align, unsigned int order_per_bit,
+ const char *name, struct cma **res_cma, int nid)
+{
+ phys_addr_t start = 0, end;
+ phys_addr_t size, sizesum, sizeleft;
+ struct cma_init_memrange *mrp, *mlp, *failed;
+ struct cma_memrange *cmrp;
+ LIST_HEAD(ranges);
+ LIST_HEAD(final_ranges);
+ struct list_head *mp, *next;
+ int ret, nr = 1;
+ u64 i;
+ struct cma *cma;
+
/*
- * Each reserved area must be initialised later, when more kernel
- * subsystems (like slab allocator) are available.
+ * First, try it the normal way, producing just one range.
*/
- cma = &cma_areas[cma_area_count];
+ ret = __cma_declare_contiguous_nid(&start, total_size, 0, align,
+ order_per_bit, false, name, res_cma, nid);
+ if (ret != -ENOMEM)
+ goto out;
- if (name)
- snprintf(cma->name, CMA_MAX_NAME, name);
- else
- snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count);
+ /*
+ * Couldn't find one range that fits our needs, so try multiple
+ * ranges.
+ *
+ * No need to do the alignment checks here, the call to
+ * cma_declare_contiguous_nid above would have caught
+ * any issues. With the checks, we know that:
+ *
+ * - @align is a power of 2
+ * - @align is >= pageblock alignment
+ * - @size is aligned to @align and to @order_per_bit
+ *
+ * So, as long as we create ranges that have a base
+ * aligned to @align, and a size that is aligned to
+ * both @align and @order_per_bit, things will work out.
+ */
+ nr = 0;
+ sizesum = 0;
+ failed = NULL;
- cma->base_pfn = PFN_DOWN(base);
- cma->count = size >> PAGE_SHIFT;
- cma->order_per_bit = order_per_bit;
+ ret = cma_new_area(name, total_size, order_per_bit, &cma);
+ if (ret != 0)
+ goto out;
+
+ align = max_t(phys_addr_t, align, CMA_MIN_ALIGNMENT_BYTES);
+ /*
+ * Create a list of ranges above 4G, largest range first.
+ */
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) {
+ if (upper_32_bits(start) == 0)
+ continue;
+
+ start = ALIGN(start, align);
+ if (start >= end)
+ continue;
+
+ end = ALIGN_DOWN(end, align);
+ if (end <= start)
+ continue;
+
+ size = end - start;
+ size = ALIGN_DOWN(size, (PAGE_SIZE << order_per_bit));
+ if (!size)
+ continue;
+ sizesum += size;
+
+ pr_debug("consider %016llx - %016llx\n", (u64)start, (u64)end);
+
+ /*
+ * If we don't yet have used the maximum number of
+ * areas, grab a new one.
+ *
+ * If we can't use any more, see if this range is not
+ * smaller than the smallest one already recorded. If
+ * not, re-use the smallest element.
+ */
+ if (nr < CMA_MAX_RANGES)
+ mrp = &memranges[nr++];
+ else {
+ mrp = list_last_entry(&ranges,
+ struct cma_init_memrange, list);
+ if (size < mrp->size)
+ continue;
+ list_del(&mrp->list);
+ sizesum -= mrp->size;
+ pr_debug("deleted %016llx - %016llx from the list\n",
+ (u64)mrp->base, (u64)mrp->base + size);
+ }
+ mrp->base = start;
+ mrp->size = size;
+
+ /*
+ * Now do a sorted insert.
+ */
+ list_insert_sorted(&ranges, mrp, revsizecmp);
+ pr_debug("added %016llx - %016llx to the list\n",
+ (u64)mrp->base, (u64)mrp->base + size);
+ pr_debug("total size now %llu\n", (u64)sizesum);
+ }
+
+ /*
+ * There is not enough room in the CMA_MAX_RANGES largest
+ * ranges, so bail out.
+ */
+ if (sizesum < total_size) {
+ cma_drop_area(cma);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Found ranges that provide enough combined space.
+ * Now, sort them by address, smallest first, because we
+ * want to mimic a bottom-up memblock allocation.
+ */
+ sizesum = 0;
+ list_for_each_safe(mp, next, &ranges) {
+ mlp = list_entry(mp, struct cma_init_memrange, list);
+ list_del(mp);
+ list_insert_sorted(&final_ranges, mlp, basecmp);
+ sizesum += mlp->size;
+ if (sizesum >= total_size)
+ break;
+ }
+
+ /*
+ * Walk the final list, and add a CMA range for
+ * each range, possibly not using the last one fully.
+ */
+ nr = 0;
+ sizeleft = total_size;
+ list_for_each(mp, &final_ranges) {
+ mlp = list_entry(mp, struct cma_init_memrange, list);
+ size = min(sizeleft, mlp->size);
+ if (memblock_reserve(mlp->base, size)) {
+ /*
+ * Unexpected error. Could go on to
+ * the next one, but just abort to
+ * be safe.
+ */
+ failed = mlp;
+ break;
+ }
+
+ pr_debug("created region %d: %016llx - %016llx\n",
+ nr, (u64)mlp->base, (u64)mlp->base + size);
+ cmrp = &cma->ranges[nr++];
+ cmrp->base_pfn = PHYS_PFN(mlp->base);
+ cmrp->early_pfn = cmrp->base_pfn;
+ cmrp->count = size >> PAGE_SHIFT;
+
+ sizeleft -= size;
+ if (sizeleft == 0)
+ break;
+ }
+
+ if (failed) {
+ list_for_each(mp, &final_ranges) {
+ mlp = list_entry(mp, struct cma_init_memrange, list);
+ if (mlp == failed)
+ break;
+ memblock_phys_free(mlp->base, mlp->size);
+ }
+ cma_drop_area(cma);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ cma->nranges = nr;
+ cma->nid = nid;
*res_cma = cma;
- cma_area_count++;
- totalcma_pages += cma->count;
- return 0;
+out:
+ if (ret != 0)
+ pr_err("Failed to reserve %lu MiB\n",
+ (unsigned long)total_size / SZ_1M);
+ else
+ pr_info("Reserved %lu MiB in %d range%s\n",
+ (unsigned long)total_size / SZ_1M, nr,
+ nr > 1 ? "s" : "");
+ return ret;
}
/**
@@ -241,8 +579,28 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
bool fixed, const char *name, struct cma **res_cma,
int nid)
{
+ int ret;
+
+ ret = __cma_declare_contiguous_nid(&base, size, limit, alignment,
+ order_per_bit, fixed, name, res_cma, nid);
+ if (ret != 0)
+ pr_err("Failed to reserve %ld MiB\n",
+ (unsigned long)size / SZ_1M);
+ else
+ pr_info("Reserved %ld MiB at %pa\n",
+ (unsigned long)size / SZ_1M, &base);
+
+ return ret;
+}
+
+static int __init __cma_declare_contiguous_nid(phys_addr_t *basep,
+ phys_addr_t size, phys_addr_t limit,
+ phys_addr_t alignment, unsigned int order_per_bit,
+ bool fixed, const char *name, struct cma **res_cma,
+ int nid)
+{
phys_addr_t memblock_end = memblock_end_of_DRAM();
- phys_addr_t highmem_start;
+ phys_addr_t highmem_start, base = *basep;
int ret;
/*
@@ -251,7 +609,10 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
* complain. Find the boundary by adding one to the last valid
* address.
*/
- highmem_start = __pa(high_memory - 1) + 1;
+ if (IS_ENABLED(CONFIG_HIGHMEM))
+ highmem_start = __pa(high_memory - 1) + 1;
+ else
+ highmem_start = memblock_end_of_DRAM();
pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
__func__, &size, &base, &limit, &alignment);
@@ -272,10 +633,9 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
/* Sanitise input arguments. */
alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES);
if (fixed && base & (alignment - 1)) {
- ret = -EINVAL;
pr_err("Region at %pa must be aligned to %pa bytes\n",
&base, &alignment);
- goto err;
+ return -EINVAL;
}
base = ALIGN(base, alignment);
size = ALIGN(size, alignment);
@@ -293,10 +653,9 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
* low/high memory boundary.
*/
if (fixed && base < highmem_start && base + size > highmem_start) {
- ret = -EINVAL;
pr_err("Region at %pa defined on low/high memory boundary (%pa)\n",
&base, &highmem_start);
- goto err;
+ return -EINVAL;
}
/*
@@ -308,18 +667,16 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
limit = memblock_end;
if (base + size > limit) {
- ret = -EINVAL;
pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n",
&size, &base, &limit);
- goto err;
+ return -EINVAL;
}
/* Reserve memory */
if (fixed) {
if (memblock_is_region_reserved(base, size) ||
memblock_reserve(base, size) < 0) {
- ret = -EBUSY;
- goto err;
+ return -EBUSY;
}
} else {
phys_addr_t addr = 0;
@@ -356,10 +713,8 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
if (!addr) {
addr = memblock_alloc_range_nid(size, alignment, base,
limit, nid, true);
- if (!addr) {
- ret = -ENOMEM;
- goto err;
- }
+ if (!addr)
+ return -ENOMEM;
}
/*
@@ -371,87 +726,93 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
}
ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
- if (ret)
- goto free_mem;
+ if (ret) {
+ memblock_phys_free(base, size);
+ return ret;
+ }
- pr_info("Reserved %ld MiB at %pa on node %d\n", (unsigned long)size / SZ_1M,
- &base, nid);
- return 0;
+ (*res_cma)->nid = nid;
+ *basep = base;
-free_mem:
- memblock_phys_free(base, size);
-err:
- pr_err("Failed to reserve %ld MiB on node %d\n", (unsigned long)size / SZ_1M,
- nid);
- return ret;
+ return 0;
}
static void cma_debug_show_areas(struct cma *cma)
{
unsigned long next_zero_bit, next_set_bit, nr_zero;
- unsigned long start = 0;
- unsigned long nr_part, nr_total = 0;
- unsigned long nbits = cma_bitmap_maxno(cma);
+ unsigned long start;
+ unsigned long nr_part;
+ unsigned long nbits;
+ int r;
+ struct cma_memrange *cmr;
spin_lock_irq(&cma->lock);
pr_info("number of available pages: ");
- for (;;) {
- next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start);
- if (next_zero_bit >= nbits)
- break;
- next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit);
- nr_zero = next_set_bit - next_zero_bit;
- nr_part = nr_zero << cma->order_per_bit;
- pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part,
- next_zero_bit);
- nr_total += nr_part;
- start = next_zero_bit + nr_zero;
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+
+ start = 0;
+ nbits = cma_bitmap_maxno(cma, cmr);
+
+ pr_info("range %d: ", r);
+ for (;;) {
+ next_zero_bit = find_next_zero_bit(cmr->bitmap,
+ nbits, start);
+ if (next_zero_bit >= nbits)
+ break;
+ next_set_bit = find_next_bit(cmr->bitmap, nbits,
+ next_zero_bit);
+ nr_zero = next_set_bit - next_zero_bit;
+ nr_part = nr_zero << cma->order_per_bit;
+ pr_cont("%s%lu@%lu", start ? "+" : "", nr_part,
+ next_zero_bit);
+ start = next_zero_bit + nr_zero;
+ }
+ pr_info("\n");
}
- pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
+ pr_cont("=> %lu free of %lu total pages\n", cma->available_count,
+ cma->count);
spin_unlock_irq(&cma->lock);
}
-static struct page *__cma_alloc(struct cma *cma, unsigned long count,
- unsigned int align, gfp_t gfp)
+static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
+ unsigned long count, unsigned int align,
+ struct page **pagep, gfp_t gfp)
{
unsigned long mask, offset;
unsigned long pfn = -1;
unsigned long start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
- unsigned long i;
+ int ret = -EBUSY;
struct page *page = NULL;
- int ret = -ENOMEM;
- const char *name = cma ? cma->name : NULL;
-
- trace_cma_alloc_start(name, count, align);
-
- if (!cma || !cma->count || !cma->bitmap)
- return page;
-
- pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__,
- (void *)cma, cma->name, count, align);
-
- if (!count)
- return page;
mask = cma_bitmap_aligned_mask(cma, align);
- offset = cma_bitmap_aligned_offset(cma, align);
- bitmap_maxno = cma_bitmap_maxno(cma);
+ offset = cma_bitmap_aligned_offset(cma, cmr, align);
+ bitmap_maxno = cma_bitmap_maxno(cma, cmr);
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
if (bitmap_count > bitmap_maxno)
- return page;
+ goto out;
for (;;) {
spin_lock_irq(&cma->lock);
- bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
+ /*
+ * If the request is larger than the available number
+ * of pages, stop right away.
+ */
+ if (count > cma->available_count) {
+ spin_unlock_irq(&cma->lock);
+ break;
+ }
+ bitmap_no = bitmap_find_next_zero_area_off(cmr->bitmap,
bitmap_maxno, start, bitmap_count, mask,
offset);
if (bitmap_no >= bitmap_maxno) {
spin_unlock_irq(&cma->lock);
break;
}
- bitmap_set(cma->bitmap, bitmap_no, bitmap_count);
+ bitmap_set(cmr->bitmap, bitmap_no, bitmap_count);
+ cma->available_count -= count;
/*
* It's safe to drop the lock here. We've marked this region for
* our exclusive use. If the migration fails we will take the
@@ -459,16 +820,16 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
*/
spin_unlock_irq(&cma->lock);
- pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
- mutex_lock(&cma_mutex);
+ pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit);
+ mutex_lock(&cma->alloc_mutex);
ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, gfp);
- mutex_unlock(&cma_mutex);
+ mutex_unlock(&cma->alloc_mutex);
if (ret == 0) {
page = pfn_to_page(pfn);
break;
}
- cma_clear_bitmap(cma, pfn, count);
+ cma_clear_bitmap(cma, cmr, pfn, count);
if (ret != -EBUSY)
break;
@@ -480,6 +841,38 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
/* try again with a bit different memory target */
start = bitmap_no + mask + 1;
}
+out:
+ *pagep = page;
+ return ret;
+}
+
+static struct page *__cma_alloc(struct cma *cma, unsigned long count,
+ unsigned int align, gfp_t gfp)
+{
+ struct page *page = NULL;
+ int ret = -ENOMEM, r;
+ unsigned long i;
+ const char *name = cma ? cma->name : NULL;
+
+ trace_cma_alloc_start(name, count, align);
+
+ if (!cma || !cma->count)
+ return page;
+
+ pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__,
+ (void *)cma, cma->name, count, align);
+
+ if (!count)
+ return page;
+
+ for (r = 0; r < cma->nranges; r++) {
+ page = NULL;
+
+ ret = cma_range_alloc(cma, &cma->ranges[r], count, align,
+ &page, gfp);
+ if (ret != -EBUSY || page)
+ break;
+ }
/*
* CMA can allocate multiple page blocks, which results in different
@@ -498,7 +891,8 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
}
pr_debug("%s(): returned %p\n", __func__, page);
- trace_cma_alloc_finish(name, pfn, page, count, align, ret);
+ trace_cma_alloc_finish(name, page ? page_to_pfn(page) : 0,
+ page, count, align, ret);
if (page) {
count_vm_event(CMA_ALLOC_SUCCESS);
cma_sysfs_account_success_pages(cma, count);
@@ -541,20 +935,31 @@ struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp)
bool cma_pages_valid(struct cma *cma, const struct page *pages,
unsigned long count)
{
- unsigned long pfn;
+ unsigned long pfn, end;
+ int r;
+ struct cma_memrange *cmr;
+ bool ret;
- if (!cma || !pages)
+ if (!cma || !pages || count > cma->count)
return false;
pfn = page_to_pfn(pages);
+ ret = false;
- if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) {
- pr_debug("%s(page %p, count %lu)\n", __func__,
- (void *)pages, count);
- return false;
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ end = cmr->base_pfn + cmr->count;
+ if (pfn >= cmr->base_pfn && pfn < end) {
+ ret = pfn + count <= end;
+ break;
+ }
}
- return true;
+ if (!ret)
+ pr_debug("%s(page %p, count %lu)\n",
+ __func__, (void *)pages, count);
+
+ return ret;
}
/**
@@ -570,19 +975,32 @@ bool cma_pages_valid(struct cma *cma, const struct page *pages,
bool cma_release(struct cma *cma, const struct page *pages,
unsigned long count)
{
- unsigned long pfn;
+ struct cma_memrange *cmr;
+ unsigned long pfn, end_pfn;
+ int r;
+
+ pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
if (!cma_pages_valid(cma, pages, count))
return false;
- pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
-
pfn = page_to_pfn(pages);
+ end_pfn = pfn + count;
+
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ if (pfn >= cmr->base_pfn &&
+ pfn < (cmr->base_pfn + cmr->count)) {
+ VM_BUG_ON(end_pfn > cmr->base_pfn + cmr->count);
+ break;
+ }
+ }
- VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
+ if (r == cma->nranges)
+ return false;
free_contig_range(pfn, count);
- cma_clear_bitmap(cma, pfn, count);
+ cma_clear_bitmap(cma, cmr, pfn, count);
cma_sysfs_account_release_pages(cma, count);
trace_cma_release(cma->name, pfn, pages, count);
@@ -610,3 +1028,86 @@ int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data)
return 0;
}
+
+bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end)
+{
+ int r;
+ struct cma_memrange *cmr;
+ unsigned long rstart, rend;
+
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+
+ rstart = PFN_PHYS(cmr->base_pfn);
+ rend = PFN_PHYS(cmr->base_pfn + cmr->count);
+ if (end < rstart)
+ continue;
+ if (start >= rend)
+ continue;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Very basic function to reserve memory from a CMA area that has not
+ * yet been activated. This is expected to be called early, when the
+ * system is single-threaded, so there is no locking. The alignment
+ * checking is restrictive - only pageblock-aligned areas
+ * (CMA_MIN_ALIGNMENT_BYTES) may be reserved through this function.
+ * This keeps things simple, and is enough for the current use case.
+ *
+ * The CMA bitmaps have not yet been allocated, so just start
+ * reserving from the bottom up, using a PFN to keep track
+ * of what has been reserved. Unreserving is not possible.
+ *
+ * The caller is responsible for initializing the page structures
+ * in the area properly, since this just points to memblock-allocated
+ * memory. The caller should subsequently use init_cma_pageblock to
+ * set the migrate type and CMA stats for the pageblocks that were reserved.
+ *
+ * If the CMA area fails to activate later, memory obtained through
+ * this interface is not handed to the page allocator, this is
+ * the responsibility of the caller (e.g. like normal memblock-allocated
+ * memory).
+ */
+void __init *cma_reserve_early(struct cma *cma, unsigned long size)
+{
+ int r;
+ struct cma_memrange *cmr;
+ unsigned long available;
+ void *ret = NULL;
+
+ if (!cma || !cma->count)
+ return NULL;
+ /*
+ * Can only be called early in init.
+ */
+ if (test_bit(CMA_ACTIVATED, &cma->flags))
+ return NULL;
+
+ if (!IS_ALIGNED(size, CMA_MIN_ALIGNMENT_BYTES))
+ return NULL;
+
+ if (!IS_ALIGNED(size, (PAGE_SIZE << cma->order_per_bit)))
+ return NULL;
+
+ size >>= PAGE_SHIFT;
+
+ if (size > cma->available_count)
+ return NULL;
+
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ available = cmr->count - (cmr->early_pfn - cmr->base_pfn);
+ if (size <= available) {
+ ret = phys_to_virt(PFN_PHYS(cmr->early_pfn));
+ cmr->early_pfn += size;
+ cma->available_count -= size;
+ return ret;
+ }
+ }
+
+ return ret;
+}
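
To make the new per-range bookkeeping easier to follow, here is a simplified user-space model (toy sizes, hypothetical names, not kernel code) of what cma_range_alloc() and the shared available_count do in the diff above: each memrange keeps its own bitmap of order_per_bit chunks, ranges are tried in order, and a request larger than the remaining free chunks bails out before scanning any bitmap.

#include <stdbool.h>
#include <stdio.h>

#define NRANGES         2
#define BITS_PER_RANGE  8       /* toy size: 8 chunks per range */

struct toy_range {
        bool bitmap[BITS_PER_RANGE];    /* true = chunk allocated */
};

struct toy_cma {
        struct toy_range ranges[NRANGES];
        unsigned long available;        /* free chunks across all ranges */
};

/* Find 'count' consecutive free chunks in one range and mark them. */
static int range_alloc(struct toy_range *r, unsigned long count)
{
        unsigned long start, i;

        for (start = 0; start + count <= BITS_PER_RANGE; start++) {
                for (i = 0; i < count; i++)
                        if (r->bitmap[start + i])
                                break;
                if (i == count) {
                        for (i = 0; i < count; i++)
                                r->bitmap[start + i] = true;
                        return (int)start;
                }
        }
        return -1;
}

/* Walk the ranges in order, the way __cma_alloc() does above. */
static int toy_alloc(struct toy_cma *cma, unsigned long count)
{
        int r, off;

        if (count > cma->available)     /* the new early bail-out */
                return -1;

        for (r = 0; r < NRANGES; r++) {
                off = range_alloc(&cma->ranges[r], count);
                if (off >= 0) {
                        cma->available -= count;
                        printf("allocated %lu chunks in range %d at offset %d\n",
                               count, r, off);
                        return r;
                }
        }
        return -1;
}

int main(void)
{
        struct toy_cma cma = { .available = NRANGES * BITS_PER_RANGE };

        toy_alloc(&cma, 6);     /* fits in range 0 */
        toy_alloc(&cma, 5);     /* range 0 too full, falls over to range 1 */
        toy_alloc(&cma, 9);     /* more than is left anywhere: early bail-out */
        printf("chunks still available: %lu\n", cma.available);
        return 0;
}
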
diff --git a/mm/cma.h b/mm/cma.h
index ad61cc6dd439..c70180c36559 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -10,18 +10,45 @@ struct cma_kobject {
struct cma *cma;
};
+/*
+ * Multi-range support. This can be useful if the size of the allocation
+ * is not expected to be larger than the alignment (like with hugetlb_cma),
+ * and the total amount of memory requested, while smaller than the total
+ * amount of memory available, is large enough that it doesn't fit in a
+ * single physical memory range because of memory holes.
+ *
+ * Fields:
+ * @base_pfn: physical address of range
+ * @early_pfn: first PFN not reserved through cma_reserve_early
+ * @count: size of range
+ * @bitmap: bitmap of allocated (1 << order_per_bit)-sized chunks.
+ */
+struct cma_memrange {
+ unsigned long base_pfn;
+ unsigned long count;
+ union {
+ unsigned long early_pfn;
+ unsigned long *bitmap;
+ };
+#ifdef CONFIG_CMA_DEBUGFS
+ struct debugfs_u32_array dfs_bitmap;
+#endif
+};
+#define CMA_MAX_RANGES 8
+
struct cma {
- unsigned long base_pfn;
unsigned long count;
- unsigned long *bitmap;
+ unsigned long available_count;
unsigned int order_per_bit; /* Order of pages represented by one bit */
spinlock_t lock;
+ struct mutex alloc_mutex;
#ifdef CONFIG_CMA_DEBUGFS
struct hlist_head mem_head;
spinlock_t mem_head_lock;
- struct debugfs_u32_array dfs_bitmap;
#endif
char name[CMA_MAX_NAME];
+ int nranges;
+ struct cma_memrange ranges[CMA_MAX_RANGES];
#ifdef CONFIG_CMA_SYSFS
/* the number of CMA page successful allocations */
atomic64_t nr_pages_succeeded;
@@ -32,15 +59,25 @@ struct cma {
/* kobject requires dynamic object */
struct cma_kobject *cma_kobj;
#endif
- bool reserve_pages_on_error;
+ unsigned long flags;
+ /* NUMA node (NUMA_NO_NODE if unspecified) */
+ int nid;
+};
+
+enum cma_flags {
+ CMA_RESERVE_PAGES_ON_ERROR,
+ CMA_ZONES_VALID,
+ CMA_ZONES_INVALID,
+ CMA_ACTIVATED,
};
extern struct cma cma_areas[MAX_CMA_AREAS];
-extern unsigned cma_area_count;
+extern unsigned int cma_area_count;
-static inline unsigned long cma_bitmap_maxno(struct cma *cma)
+static inline unsigned long cma_bitmap_maxno(struct cma *cma,
+ struct cma_memrange *cmr)
{
- return cma->count >> cma->order_per_bit;
+ return cmr->count >> cma->order_per_bit;
}
#ifdef CONFIG_CMA_SYSFS
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 602fff89b15f..fdf899532ca0 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -34,13 +34,10 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
static int cma_used_get(void *data, u64 *val)
{
struct cma *cma = data;
- unsigned long used;
spin_lock_irq(&cma->lock);
- /* pages counter is smaller than sizeof(int) */
- used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma));
+ *val = cma->count - cma->available_count;
spin_unlock_irq(&cma->lock);
- *val = (u64)used << cma->order_per_bit;
return 0;
}
@@ -49,17 +46,26 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");
static int cma_maxchunk_get(void *data, u64 *val)
{
struct cma *cma = data;
+ struct cma_memrange *cmr;
unsigned long maxchunk = 0;
- unsigned long start, end = 0;
- unsigned long bitmap_maxno = cma_bitmap_maxno(cma);
+ unsigned long start, end;
+ unsigned long bitmap_maxno;
+ int r;
spin_lock_irq(&cma->lock);
- for (;;) {
- start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
- if (start >= bitmap_maxno)
- break;
- end = find_next_bit(cma->bitmap, bitmap_maxno, start);
- maxchunk = max(end - start, maxchunk);
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ bitmap_maxno = cma_bitmap_maxno(cma, cmr);
+ end = 0;
+ for (;;) {
+ start = find_next_zero_bit(cmr->bitmap,
+ bitmap_maxno, end);
+ if (start >= bitmap_maxno)
+ break;
+ end = find_next_bit(cmr->bitmap, bitmap_maxno,
+ start);
+ maxchunk = max(end - start, maxchunk);
+ }
}
spin_unlock_irq(&cma->lock);
*val = (u64)maxchunk << cma->order_per_bit;
@@ -162,24 +168,41 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
{
- struct dentry *tmp;
+ struct dentry *tmp, *dir, *rangedir;
+ int r;
+ char rdirname[12];
+ struct cma_memrange *cmr;
tmp = debugfs_create_dir(cma->name, root_dentry);
debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops);
debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops);
- debugfs_create_file("base_pfn", 0444, tmp,
- &cma->base_pfn, &cma_debugfs_fops);
debugfs_create_file("count", 0444, tmp, &cma->count, &cma_debugfs_fops);
debugfs_create_file("order_per_bit", 0444, tmp,
&cma->order_per_bit, &cma_debugfs_fops);
debugfs_create_file("used", 0444, tmp, cma, &cma_used_fops);
debugfs_create_file("maxchunk", 0444, tmp, cma, &cma_maxchunk_fops);
- cma->dfs_bitmap.array = (u32 *)cma->bitmap;
- cma->dfs_bitmap.n_elements = DIV_ROUND_UP(cma_bitmap_maxno(cma),
- BITS_PER_BYTE * sizeof(u32));
- debugfs_create_u32_array("bitmap", 0444, tmp, &cma->dfs_bitmap);
+ rangedir = debugfs_create_dir("ranges", tmp);
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ snprintf(rdirname, sizeof(rdirname), "%d", r);
+ dir = debugfs_create_dir(rdirname, rangedir);
+ debugfs_create_file("base_pfn", 0444, dir,
+ &cmr->base_pfn, &cma_debugfs_fops);
+ cmr->dfs_bitmap.array = (u32 *)cmr->bitmap;
+ cmr->dfs_bitmap.n_elements =
+ DIV_ROUND_UP(cma_bitmap_maxno(cma, cmr),
+ BITS_PER_BYTE * sizeof(u32));
+ debugfs_create_u32_array("bitmap", 0444, dir,
+ &cmr->dfs_bitmap);
+ }
+
+ /*
+ * Backward compatible symlinks to range 0 for base_pfn and bitmap.
+ */
+ debugfs_create_symlink("base_pfn", tmp, "ranges/0/base_pfn");
+ debugfs_create_symlink("bitmap", tmp, "ranges/0/bitmap");
}
static int __init cma_debugfs_init(void)
diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c
index f50db3973171..97acd3e5a6a5 100644
--- a/mm/cma_sysfs.c
+++ b/mm/cma_sysfs.c
@@ -62,6 +62,24 @@ static ssize_t release_pages_success_show(struct kobject *kobj,
}
CMA_ATTR_RO(release_pages_success);
+static ssize_t total_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+
+ return sysfs_emit(buf, "%lu\n", cma->count);
+}
+CMA_ATTR_RO(total_pages);
+
+static ssize_t available_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+
+ return sysfs_emit(buf, "%lu\n", cma->available_count);
+}
+CMA_ATTR_RO(available_pages);
+
static void cma_kobj_release(struct kobject *kobj)
{
struct cma *cma = cma_from_kobj(kobj);
@@ -75,6 +93,8 @@ static struct attribute *cma_attrs[] = {
&alloc_pages_success_attr.attr,
&alloc_pages_fail_attr.attr,
&release_pages_success_attr.attr,
+ &total_pages_attr.attr,
+ &available_pages_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(cma);
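
The two new attributes sit next to the existing per-area counters, i.e. under /sys/kernel/mm/cma/<area>/ as documented for the CMA sysfs interface. A small read-side sketch; "hugetlb" below is only a placeholder area name, substitute whatever areas exist on the running system:

#include <stdio.h>

/* Read the new total_pages/available_pages attributes of one CMA area. */
int main(void)
{
        static const char *const attrs[] = { "total_pages", "available_pages" };
        char path[256];
        unsigned long val;
        int i;

        for (i = 0; i < 2; i++) {
                FILE *f;

                snprintf(path, sizeof(path),
                         "/sys/kernel/mm/cma/hugetlb/%s", attrs[i]);
                f = fopen(path, "r");
                if (!f || fscanf(f, "%lu", &val) != 1) {
                        perror(path);
                        if (f)
                                fclose(f);
                        continue;
                }
                printf("%s: %lu pages\n", attrs[i], val);
                fclose(f);
        }
        return 0;
}
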
diff --git a/mm/compaction.c b/mm/compaction.c
index a2b16b08cbbf..3925cb61dbb8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -83,6 +83,7 @@ static inline bool is_via_compact_memory(int order) { return false; }
static struct page *mark_allocated_noprof(struct page *page, unsigned int order, gfp_t gfp_flags)
{
post_alloc_hook(page, order, __GFP_MOVABLE);
+ set_page_refcounted(page);
return page;
}
#define mark_allocated(...) alloc_hooks(mark_allocated_noprof(__VA_ARGS__))
@@ -630,7 +631,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (PageCompound(page)) {
const unsigned int order = compound_order(page);
- if (blockpfn + (1UL << order) <= end_pfn) {
+ if ((order <= MAX_PAGE_ORDER) &&
+ (blockpfn + (1UL << order) <= end_pfn)) {
blockpfn += (1UL << order) - 1;
page += (1UL << order) - 1;
nr_scanned += (1UL << order) - 1;
@@ -979,13 +981,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
if (PageHuge(page)) {
+ const unsigned int order = compound_order(page);
/*
* skip hugetlbfs if we are not compacting for pages
* bigger than its order. THPs and other compound pages
* are handled below.
*/
if (!cc->alloc_contig) {
- const unsigned int order = compound_order(page);
if (order <= MAX_PAGE_ORDER) {
low_pfn += (1UL << order) - 1;
@@ -999,27 +1001,27 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
locked = NULL;
}
- ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
+ folio = page_folio(page);
+ ret = isolate_or_dissolve_huge_folio(folio, &cc->migratepages);
/*
- * Fail isolation in case isolate_or_dissolve_huge_page()
+ * Fail isolation in case isolate_or_dissolve_huge_folio()
* reports an error. In case of -ENOMEM, abort right away.
*/
if (ret < 0) {
/* Do not report -EBUSY down the chain */
if (ret == -EBUSY)
ret = 0;
- low_pfn += compound_nr(page) - 1;
- nr_scanned += compound_nr(page) - 1;
+ low_pfn += (1UL << order) - 1;
+ nr_scanned += (1UL << order) - 1;
goto isolate_fail;
}
- if (PageHuge(page)) {
+ if (folio_test_hugetlb(folio)) {
/*
* Hugepage was successfully isolated and placed
* on the cc->migratepages list.
*/
- folio = page_folio(page);
low_pfn += folio_nr_pages(folio) - 1;
goto isolate_success_no_list;
}
@@ -1868,6 +1870,7 @@ again:
dst = (struct folio *)freepage;
post_alloc_hook(&dst->page, order, __GFP_MOVABLE);
+ set_page_refcounted(&dst->page);
if (order)
prep_compound_page(&dst->page, order);
cc->nr_freepages -= 1 << order;
@@ -2246,15 +2249,11 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat)
static unsigned int fragmentation_score_wmark(bool low)
{
- unsigned int wmark_low;
+ unsigned int wmark_low, leeway;
- /*
- * Cap the low watermark to avoid excessive compaction
- * activity in case a user sets the proactiveness tunable
- * close to 100 (maximum).
- */
- wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
- return low ? wmark_low : min(wmark_low + 10, 100U);
+ wmark_low = 100U - sysctl_compaction_proactiveness;
+ leeway = min(10U, wmark_low / 2);
+ return low ? wmark_low : min(wmark_low + leeway, 100U);
}
static bool should_proactive_compact_node(pg_data_t *pgdat)
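
The rewritten fragmentation_score_wmark() drops the old hard floor of 5 and instead derives the high threshold from a leeway capped at half the low threshold, so very high proactiveness values are no longer clamped. A quick worked example of the new formula (stand-alone re-implementation, values only for illustration):

#include <stdio.h>

/*
 * New formula: wmark_low = 100 - proactiveness,
 * leeway = min(10, wmark_low / 2),
 * wmark_high = min(wmark_low + leeway, 100).
 */
static unsigned int wmark(unsigned int proactiveness, int low)
{
        unsigned int wmark_low = 100U - proactiveness;
        unsigned int leeway = wmark_low / 2 < 10U ? wmark_low / 2 : 10U;

        return low ? wmark_low : (wmark_low + leeway < 100U ?
                                  wmark_low + leeway : 100U);
}

int main(void)
{
        unsigned int p[] = { 20, 80, 95, 100 };
        int i;

        /* e.g. proactiveness 95 now yields low 5 / high 7 (was 5 / 15). */
        for (i = 0; i < 4; i++)
                printf("proactiveness %3u: low %3u, high %3u\n",
                       p[i], wmark(p[i], 1), wmark(p[i], 0));
        return 0;
}
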
@@ -2325,11 +2324,26 @@ static enum compact_result __compact_finished(struct compact_control *cc)
if (!pageblock_aligned(cc->migrate_pfn))
return COMPACT_CONTINUE;
+ /*
+ * When defrag_mode is enabled, make kcompactd target
+ * watermarks in whole pageblocks. Because they can be stolen
+ * without polluting, no further fallback checks are needed.
+ */
+ if (defrag_mode && !cc->direct_compaction) {
+ if (__zone_watermark_ok(cc->zone, cc->order,
+ high_wmark_pages(cc->zone),
+ cc->highest_zoneidx, cc->alloc_flags,
+ zone_page_state(cc->zone,
+ NR_FREE_PAGES_BLOCKS)))
+ return COMPACT_SUCCESS;
+
+ return COMPACT_CONTINUE;
+ }
+
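As a rough, standalone illustration of the defrag_mode criterion just added (ignoring lowmem reserves and the order argument that __zone_watermark_ok() also takes, and with made-up numbers): only memory sitting in completely free pageblocks counts toward the high watermark.

/* Simplified model of the defrag_mode completion test for kcompactd. */
#include <stdio.h>

int main(void)
{
	unsigned long high_wmark = 2048;	/* pages, hypothetical zone */
	unsigned long free_pages = 6000;	/* all free pages */
	unsigned long free_block_pages = 1536;	/* free pages in whole free pageblocks */

	printf("plain watermark check: %s\n",
	       free_pages >= high_wmark ? "done" : "continue");
	printf("defrag_mode check:     %s\n",
	       free_block_pages >= high_wmark ? "done" : "continue");
	return 0;
}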
/* Direct compactor: Is a suitable page free? */
ret = COMPACT_NO_SUITABLE_PAGE;
for (order = cc->order; order < NR_PAGE_ORDERS; order++) {
struct free_area *area = &cc->zone->free_area[order];
- bool can_steal;
/* Job done if page is free of the right migratetype */
if (!free_area_empty(area, migratetype))
@@ -2345,8 +2359,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
* Job done if allocation would steal freepages from
* other migratetype buddy lists.
*/
- if (find_suitable_fallback(area, order, migratetype,
- true, &can_steal) != -1)
+ if (find_suitable_fallback(area, order, migratetype, true) >= 0)
/*
* Movable pages are OK in any pageblock. If we are
* stealing for a non-movable allocation, make sure
@@ -2378,40 +2391,42 @@ static enum compact_result compact_finished(struct compact_control *cc)
}
static bool __compaction_suitable(struct zone *zone, int order,
- int highest_zoneidx,
- unsigned long wmark_target)
+ unsigned long watermark, int highest_zoneidx,
+ unsigned long free_pages)
{
- unsigned long watermark;
/*
* Watermarks for order-0 must be met for compaction to be able to
* isolate free pages for migration targets. This means that the
- * watermark and alloc_flags have to match, or be more pessimistic than
- * the check in __isolate_free_page(). We don't use the direct
- * compactor's alloc_flags, as they are not relevant for freepage
- * isolation. We however do use the direct compactor's highest_zoneidx
- * to skip over zones where lowmem reserves would prevent allocation
- * even if compaction succeeds.
- * For costly orders, we require low watermark instead of min for
- * compaction to proceed to increase its chances.
+ * watermark has to match, or be more pessimistic than, the check in
+ * __isolate_free_page().
+ *
+ * For costly orders, we require a higher watermark for compaction to
+ * proceed to increase its chances.
+ *
+ * We use the direct compactor's highest_zoneidx to skip over zones
+ * where lowmem reserves would prevent allocation even if compaction
+ * succeeds.
+ *
* ALLOC_CMA is used, as pages in CMA pageblocks are considered
- * suitable migration targets
+ * suitable migration targets.
*/
- watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
- low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ watermark += low_wmark_pages(zone) - min_wmark_pages(zone);
return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
- ALLOC_CMA, wmark_target);
+ ALLOC_CMA, free_pages);
}
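A standalone restatement of how the order-0 target is now composed from the caller-supplied watermark; gap() stands in for compact_gap() (assumed here to be 2 << order) and the zone watermark numbers are hypothetical:

/* Sketch of the watermark composed by the reworked __compaction_suitable(). */
#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER 3

static unsigned long gap(unsigned int order)
{
	return 2UL << order;	/* stand-in for compact_gap() (assumption) */
}

static unsigned long target(unsigned long caller_wmark, unsigned int order,
			    unsigned long low_wmark, unsigned long min_wmark)
{
	unsigned long watermark = caller_wmark + gap(order);

	/* costly orders additionally require the low-min delta on top */
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		watermark += low_wmark - min_wmark;
	return watermark;
}

int main(void)
{
	/* hypothetical zone: min = 1024, low = 1280 pages; caller passes min */
	printf("order 2: %lu\n", target(1024, 2, 1280, 1024));	/* 1032 */
	printf("order 9: %lu\n", target(1024, 9, 1280, 1024));	/* 2304 */
	return 0;
}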
/*
* compaction_suitable: Is this suitable to run compaction on this zone now?
*/
-bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
+bool compaction_suitable(struct zone *zone, int order, unsigned long watermark,
+ int highest_zoneidx)
{
enum compact_result compact_result;
bool suitable;
- suitable = __compaction_suitable(zone, order, highest_zoneidx,
+ suitable = __compaction_suitable(zone, order, watermark, highest_zoneidx,
zone_page_state(zone, NR_FREE_PAGES));
/*
* fragmentation index determines if allocation failures are due to
@@ -2449,6 +2464,7 @@ bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
return suitable;
}
+/* Used by direct reclaimers */
bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
int alloc_flags)
{
@@ -2471,8 +2487,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
*/
available = zone_reclaimable_pages(zone) / order;
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
- if (__compaction_suitable(zone, order, ac->highest_zoneidx,
- available))
+ if (__compaction_suitable(zone, order, min_wmark_pages(zone),
+ ac->highest_zoneidx, available))
return true;
}
@@ -2488,16 +2504,40 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
*/
static enum compact_result
compaction_suit_allocation_order(struct zone *zone, unsigned int order,
- int highest_zoneidx, unsigned int alloc_flags)
+ int highest_zoneidx, unsigned int alloc_flags,
+ bool async, bool kcompactd)
{
+ unsigned long free_pages;
unsigned long watermark;
+ if (kcompactd && defrag_mode)
+ free_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS);
+ else
+ free_pages = zone_page_state(zone, NR_FREE_PAGES);
+
watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
- if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
- alloc_flags))
+ if (__zone_watermark_ok(zone, order, watermark, highest_zoneidx,
+ alloc_flags, free_pages))
return COMPACT_SUCCESS;
- if (!compaction_suitable(zone, order, highest_zoneidx))
+ /*
+ * For unmovable allocations (without ALLOC_CMA), check if there is enough
+ * free memory in the non-CMA pageblocks. Otherwise compaction could form
+ * the high-order page in CMA pageblocks, which would not help the
+ * allocation to succeed. However, limit the check to costly order async
+ * compaction (such as opportunistic THP attempts) because there is the
+ * possibility that compaction would migrate pages from non-CMA to CMA
+ * pageblocks.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER && async &&
+ !(alloc_flags & ALLOC_CMA)) {
+ if (!__zone_watermark_ok(zone, 0, watermark + compact_gap(order),
+ highest_zoneidx, 0,
+ zone_page_state(zone, NR_FREE_PAGES)))
+ return COMPACT_SKIPPED;
+ }
+
+ if (!compaction_suitable(zone, order, watermark, highest_zoneidx))
return COMPACT_SKIPPED;
return COMPACT_CONTINUE;
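The decision order above can be condensed into a standalone decision table; all predicates are stubbed as plain booleans, so this is only a restatement of the control flow, not kernel code:

/* Condensed model of compaction_suit_allocation_order()'s decision order. */
#include <stdbool.h>
#include <stdio.h>

enum result { SKIPPED, CONTINUE, SUCCESS };

static enum result suit(bool wmark_already_ok, bool costly_async_noncma,
			bool enough_noncma_free, bool compaction_suitable)
{
	if (wmark_already_ok)
		return SUCCESS;			/* allocation would already succeed */
	if (costly_async_noncma && !enough_noncma_free)
		return SKIPPED;			/* result could only land in CMA pageblocks */
	if (!compaction_suitable)
		return SKIPPED;			/* not enough order-0 pages to work with */
	return CONTINUE;
}

int main(void)
{
	printf("%d\n", suit(false, true, false, true));		/* 0: SKIPPED */
	printf("%d\n", suit(false, false, false, true));	/* 1: CONTINUE */
	return 0;
}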
@@ -2532,7 +2572,9 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
if (!is_via_compact_memory(cc->order)) {
ret = compaction_suit_allocation_order(cc->zone, cc->order,
cc->highest_zoneidx,
- cc->alloc_flags);
+ cc->alloc_flags,
+ cc->mode == MIGRATE_ASYNC,
+ !cc->direct_compaction);
if (ret != COMPACT_CONTINUE)
return ret;
}
@@ -3026,6 +3068,8 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
struct zone *zone;
enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx;
enum compact_result ret;
+ unsigned int alloc_flags = defrag_mode ?
+ ALLOC_WMARK_HIGH : ALLOC_WMARK_MIN;
for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) {
zone = &pgdat->node_zones[zoneid];
@@ -3035,7 +3079,8 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
ret = compaction_suit_allocation_order(zone,
pgdat->kcompactd_max_order,
- highest_zoneidx, ALLOC_WMARK_MIN);
+ highest_zoneidx, alloc_flags,
+ false, true);
if (ret == COMPACT_CONTINUE)
return true;
}
@@ -3058,6 +3103,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = false,
.gfp_mask = GFP_KERNEL,
+ .alloc_flags = defrag_mode ? ALLOC_WMARK_HIGH : ALLOC_WMARK_MIN,
};
enum compact_result ret;
@@ -3076,7 +3122,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
continue;
ret = compaction_suit_allocation_order(zone,
- cc.order, zoneid, ALLOC_WMARK_MIN);
+ cc.order, zoneid, cc.alloc_flags,
+ false, true);
if (ret != COMPACT_CONTINUE)
continue;
@@ -3154,15 +3201,10 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
static int kcompactd(void *p)
{
pg_data_t *pgdat = (pg_data_t *)p;
- struct task_struct *tsk = current;
long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC);
long timeout = default_timeout;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
-
- if (!cpumask_empty(cpumask))
- set_cpus_allowed_ptr(tsk, cpumask);
-
+ current->flags |= PF_KCOMPACTD;
set_freezable();
pgdat->kcompactd_max_order = 0;
@@ -3219,6 +3261,8 @@ static int kcompactd(void *p)
pgdat->proactive_compact_trigger = false;
}
+ current->flags &= ~PF_KCOMPACTD;
+
return 0;
}
@@ -3233,10 +3277,12 @@ void __meminit kcompactd_run(int nid)
if (pgdat->kcompactd)
return;
- pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
+ pgdat->kcompactd = kthread_create_on_node(kcompactd, pgdat, nid, "kcompactd%d", nid);
if (IS_ERR(pgdat->kcompactd)) {
pr_err("Failed to start kcompactd on node %d\n", nid);
pgdat->kcompactd = NULL;
+ } else {
+ wake_up_process(pgdat->kcompactd);
}
}
@@ -3254,30 +3300,6 @@ void __meminit kcompactd_stop(int nid)
}
}
-/*
- * It's optimal to keep kcompactd on the same CPUs as their memory, but
- * not required for correctness. So if the last cpu in a node goes
- * away, we get changed to run anywhere: as the first one comes back,
- * restore their cpu bindings.
- */
-static int kcompactd_cpu_online(unsigned int cpu)
-{
- int nid;
-
- for_each_node_state(nid, N_MEMORY) {
- pg_data_t *pgdat = NODE_DATA(nid);
- const struct cpumask *mask;
-
- mask = cpumask_of_node(pgdat->node_id);
-
- if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
- /* One of our CPUs online: restore mask */
- if (pgdat->kcompactd)
- set_cpus_allowed_ptr(pgdat->kcompactd, mask);
- }
- return 0;
-}
-
static int proc_dointvec_minmax_warn_RT_change(const struct ctl_table *table,
int write, void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -3297,7 +3319,7 @@ static int proc_dointvec_minmax_warn_RT_change(const struct ctl_table *table,
return ret;
}
-static struct ctl_table vm_compaction[] = {
+static const struct ctl_table vm_compaction[] = {
{
.procname = "compact_memory",
.data = &sysctl_compact_memory,
@@ -3337,15 +3359,6 @@ static struct ctl_table vm_compaction[] = {
static int __init kcompactd_init(void)
{
int nid;
- int ret;
-
- ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
- "mm/compaction:online",
- kcompactd_cpu_online, NULL);
- if (ret < 0) {
- pr_err("kcompactd: failed to register hotplug callbacks.\n");
- return ret;
- }
for_each_node_state(nid, N_MEMORY)
kcompactd_run(nid);
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index d0357f3e9372..c93d0c56b963 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -28,6 +28,7 @@ config DAMON_VADDR
bool "Data access monitoring operations for virtual address spaces"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
+ default DAMON
help
This builds the default data access monitoring operations for DAMON
that work for virtual address spaces.
@@ -36,6 +37,7 @@ config DAMON_PADDR
bool "Data access monitoring operations for the physical address space"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
+ default DAMON
help
This builds the default data access monitoring operations for DAMON
that works for the physical address space.
@@ -55,6 +57,7 @@ config DAMON_VADDR_KUNIT_TEST
config DAMON_SYSFS
bool "DAMON sysfs interface"
depends on DAMON && SYSFS
+ default DAMON
help
This builds the sysfs interface for DAMON. The user space can use
the interface for arbitrary data access monitoring.
@@ -71,36 +74,6 @@ config DAMON_SYSFS_KUNIT_TEST
If unsure, say N.
-config DAMON_DBGFS_DEPRECATED
- bool "DAMON debugfs interface (DEPRECATED!)"
- depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
- help
- This builds the debugfs interface for DAMON. The user space admins
- can use the interface for arbitrary data access monitoring.
-
- If unsure, say N.
-
- This is deprecated, so users should move to the sysfs interface
- (DAMON_SYSFS). If you depend on this and cannot move, please report
- your usecase to damon@lists.linux.dev and linux-mm@kvack.org.
-
-config DAMON_DBGFS
- bool
- default y
- depends on DAMON_DBGFS_DEPRECATED
-
-config DAMON_DBGFS_KUNIT_TEST
- bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS
- depends on DAMON_DBGFS && KUNIT=y
- default KUNIT_ALL_TESTS
- help
- This builds the DAMON debugfs interface Kunit test suite.
-
- For more information on KUnit and unit tests in general, please refer
- to the KUnit documentation.
-
- If unsure, say N.
-
config DAMON_RECLAIM
bool "Build DAMON-based reclaim (DAMON_RECLAIM)"
depends on DAMON_PADDR
diff --git a/mm/damon/Makefile b/mm/damon/Makefile
index f7add3f4aa79..8b49012ba8c3 100644
--- a/mm/damon/Makefile
+++ b/mm/damon/Makefile
@@ -4,6 +4,5 @@ obj-y := core.o
obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o
obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o
obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o
-obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o
obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o
obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 8b8e2933dcd4..b217e0120e09 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -14,6 +14,7 @@
#include <linux/psi.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#define CREATE_TRACE_POINTS
#include <trace/events/damon.h>
@@ -75,14 +76,13 @@ int damon_register_ops(struct damon_operations *ops)
if (ops->id >= NR_DAMON_OPS)
return -EINVAL;
+
mutex_lock(&damon_ops_lock);
/* Fail for already registered ops */
- if (__damon_is_registered_ops(ops->id)) {
+ if (__damon_is_registered_ops(ops->id))
err = -EINVAL;
- goto out;
- }
- damon_registered_ops[ops->id] = *ops;
-out:
+ else
+ damon_registered_ops[ops->id] = *ops;
mutex_unlock(&damon_ops_lock);
return err;
}
@@ -266,7 +266,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
}
struct damos_filter *damos_new_filter(enum damos_filter_type type,
- bool matching)
+ bool matching, bool allow)
{
struct damos_filter *filter;
@@ -275,13 +275,36 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type,
return NULL;
filter->type = type;
filter->matching = matching;
+ filter->allow = allow;
INIT_LIST_HEAD(&filter->list);
return filter;
}
+/**
+ * damos_filter_for_ops() - Return whether the filter is an ops-handled one.
+ * @type: type of the filter.
+ *
+ * Return: true if the filter of @type needs to be handled by the ops layer, false
+ * otherwise.
+ */
+bool damos_filter_for_ops(enum damos_filter_type type)
+{
+ switch (type) {
+ case DAMOS_FILTER_TYPE_ADDR:
+ case DAMOS_FILTER_TYPE_TARGET:
+ return false;
+ default:
+ break;
+ }
+ return true;
+}
+
void damos_add_filter(struct damos *s, struct damos_filter *f)
{
- list_add_tail(&f->list, &s->filters);
+ if (damos_filter_for_ops(f->type))
+ list_add_tail(&f->list, &s->ops_filters);
+ else
+ list_add_tail(&f->list, &s->filters);
}
static void damos_del_filter(struct damos_filter *f)
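A hedged sketch of how a caller would use the extended constructor; DAMOS_FILTER_TYPE_ANON, the surrounding scheme setup, and the error handling are assumed from the wider DAMON API rather than shown in this diff, so treat this as a fragment, not a complete example:

	/*
	 * Sketch: allow anonymous pages only; everything else is implicitly
	 * rejected because the last installed filter is an 'allow' filter.
	 */
	struct damos_filter *filter;

	filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON,
				  /*matching=*/true, /*allow=*/true);
	if (!filter)
		return -ENOMEM;
	damos_add_filter(scheme, filter);	/* routed to ops_filters by its type */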
@@ -371,7 +394,9 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
* or damon_attrs are updated.
*/
scheme->next_apply_sis = 0;
+ scheme->walk_completed = false;
INIT_LIST_HEAD(&scheme->filters);
+ INIT_LIST_HEAD(&scheme->ops_filters);
scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
@@ -499,11 +524,13 @@ struct damon_ctx *damon_new_ctx(void)
ctx->attrs.ops_update_interval = 60 * 1000 * 1000;
ctx->passed_sample_intervals = 0;
- /* These will be set from kdamond_init_intervals_sis() */
+ /* These will be set from kdamond_init_ctx() */
ctx->next_aggregation_sis = 0;
ctx->next_ops_update_sis = 0;
mutex_init(&ctx->kdamond_lock);
+ mutex_init(&ctx->call_control_lock);
+ mutex_init(&ctx->walk_control_lock);
ctx->attrs.min_nr_regions = 10;
ctx->attrs.max_nr_regions = 1000;
@@ -575,11 +602,25 @@ static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses,
}
static void damon_update_monitoring_result(struct damon_region *r,
- struct damon_attrs *old_attrs, struct damon_attrs *new_attrs)
-{
- r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses,
- old_attrs, new_attrs);
- r->nr_accesses_bp = r->nr_accesses * 10000;
+ struct damon_attrs *old_attrs, struct damon_attrs *new_attrs,
+ bool aggregating)
+{
+ if (!aggregating) {
+ r->nr_accesses = damon_nr_accesses_for_new_attrs(
+ r->nr_accesses, old_attrs, new_attrs);
+ r->nr_accesses_bp = r->nr_accesses * 10000;
+ } else {
+ /*
+ * If this is called in the middle of an aggregation, reset
+ * the aggregation made so far for this aggregation
+ * interval. In other words, make the status look as if
+ * kdamond_reset_aggregated() had just been called.
+ */
+ r->last_nr_accesses = damon_nr_accesses_for_new_attrs(
+ r->last_nr_accesses, old_attrs, new_attrs);
+ r->nr_accesses_bp = r->last_nr_accesses * 10000;
+ r->nr_accesses = 0;
+ }
r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs);
}
@@ -592,7 +633,7 @@ static void damon_update_monitoring_result(struct damon_region *r,
* ->nr_accesses and ->age of given damon_ctx's regions for new damon_attrs.
*/
static void damon_update_monitoring_results(struct damon_ctx *ctx,
- struct damon_attrs *new_attrs)
+ struct damon_attrs *new_attrs, bool aggregating)
{
struct damon_attrs *old_attrs = &ctx->attrs;
struct damon_target *t;
@@ -607,7 +648,26 @@ static void damon_update_monitoring_results(struct damon_ctx *ctx,
damon_for_each_target(t, ctx)
damon_for_each_region(r, t)
damon_update_monitoring_result(
- r, old_attrs, new_attrs);
+ r, old_attrs, new_attrs, aggregating);
+}
+
+/*
+ * damon_valid_intervals_goal() - return whether the intervals goal of @attrs is
+ * valid.
+ */
+static bool damon_valid_intervals_goal(struct damon_attrs *attrs)
+{
+ struct damon_intervals_goal *goal = &attrs->intervals_goal;
+
+ /* tuning is disabled */
+ if (!goal->aggrs)
+ return true;
+ if (goal->min_sample_us > goal->max_sample_us)
+ return false;
+ if (attrs->sample_interval < goal->min_sample_us ||
+ goal->max_sample_us < attrs->sample_interval)
+ return false;
+ return true;
}
/**
@@ -615,10 +675,10 @@ static void damon_update_monitoring_results(struct damon_ctx *ctx,
* @ctx: monitoring context
* @attrs: monitoring attributes
*
- * This function should be called while the kdamond is not running, or an
- * access check results aggregation is not ongoing (e.g., from
- * &struct damon_callback->after_aggregation or
- * &struct damon_callback->after_wmarks_check callbacks).
+ * This function should be called while the kdamond is not running, an access
+ * check results aggregation is not ongoing (e.g., from &struct
+ * damon_callback->after_aggregation or &struct
+ * damon_callback->after_wmarks_check callbacks), or from damon_call().
*
* Every time interval is in micro-seconds.
*
@@ -629,6 +689,11 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
unsigned long sample_interval = attrs->sample_interval ?
attrs->sample_interval : 1;
struct damos *s;
+ bool aggregating = ctx->passed_sample_intervals <
+ ctx->next_aggregation_sis;
+
+ if (!damon_valid_intervals_goal(attrs))
+ return -EINVAL;
if (attrs->min_nr_regions < 3)
return -EINVAL;
@@ -637,12 +702,16 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
if (attrs->sample_interval > attrs->aggr_interval)
return -EINVAL;
+ /* Calls from outside of the core logic don't set this. */
+ if (!attrs->aggr_samples)
+ attrs->aggr_samples = attrs->aggr_interval / sample_interval;
+
ctx->next_aggregation_sis = ctx->passed_sample_intervals +
attrs->aggr_interval / sample_interval;
ctx->next_ops_update_sis = ctx->passed_sample_intervals +
attrs->ops_update_interval / sample_interval;
- damon_update_monitoring_results(ctx, attrs);
+ damon_update_monitoring_results(ctx, attrs, aggregating);
ctx->attrs = *attrs;
damon_for_each_scheme(s, ctx)
@@ -772,6 +841,9 @@ static void damos_commit_filter_arg(
case DAMOS_FILTER_TYPE_TARGET:
dst->target_idx = src->target_idx;
break;
+ case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE:
+ dst->sz_range = src->sz_range;
+ break;
default:
break;
}
@@ -785,7 +857,7 @@ static void damos_commit_filter(
damos_commit_filter_arg(dst, src);
}
-static int damos_commit_filters(struct damos *dst, struct damos *src)
+static int damos_commit_core_filters(struct damos *dst, struct damos *src)
{
struct damos_filter *dst_filter, *next, *src_filter, *new_filter;
int i = 0, j = 0;
@@ -803,7 +875,36 @@ static int damos_commit_filters(struct damos *dst, struct damos *src)
continue;
new_filter = damos_new_filter(
- src_filter->type, src_filter->matching);
+ src_filter->type, src_filter->matching,
+ src_filter->allow);
+ if (!new_filter)
+ return -ENOMEM;
+ damos_commit_filter_arg(new_filter, src_filter);
+ damos_add_filter(dst, new_filter);
+ }
+ return 0;
+}
+
+static int damos_commit_ops_filters(struct damos *dst, struct damos *src)
+{
+ struct damos_filter *dst_filter, *next, *src_filter, *new_filter;
+ int i = 0, j = 0;
+
+ damos_for_each_ops_filter_safe(dst_filter, next, dst) {
+ src_filter = damos_nth_filter(i++, src);
+ if (src_filter)
+ damos_commit_filter(dst_filter, src_filter);
+ else
+ damos_destroy_filter(dst_filter);
+ }
+
+ damos_for_each_ops_filter_safe(src_filter, next, src) {
+ if (j++ < i)
+ continue;
+
+ new_filter = damos_new_filter(
+ src_filter->type, src_filter->matching,
+ src_filter->allow);
if (!new_filter)
return -ENOMEM;
damos_commit_filter_arg(new_filter, src_filter);
@@ -812,6 +913,46 @@ static int damos_commit_filters(struct damos *dst, struct damos *src)
return 0;
}
+/**
+ * damos_filters_default_reject() - decide whether to reject memory that did
+ * not match any of the given filters.
+ * @filters: Given DAMOS filters of a group.
+ */
+static bool damos_filters_default_reject(struct list_head *filters)
+{
+ struct damos_filter *last_filter;
+
+ if (list_empty(filters))
+ return false;
+ last_filter = list_last_entry(filters, struct damos_filter, list);
+ return last_filter->allow;
+}
+
+static void damos_set_filters_default_reject(struct damos *s)
+{
+ if (!list_empty(&s->ops_filters))
+ s->core_filters_default_reject = false;
+ else
+ s->core_filters_default_reject =
+ damos_filters_default_reject(&s->filters);
+ s->ops_filters_default_reject =
+ damos_filters_default_reject(&s->ops_filters);
+}
+
+static int damos_commit_filters(struct damos *dst, struct damos *src)
+{
+ int err;
+
+ err = damos_commit_core_filters(dst, src);
+ if (err)
+ return err;
+ err = damos_commit_ops_filters(dst, src);
+ if (err)
+ return err;
+ damos_set_filters_default_reject(dst);
+ return 0;
+}
+
static struct damos *damon_nth_scheme(int n, struct damon_ctx *ctx)
{
struct damos *s;
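The default-reject rule added above can be stated compactly: memory that matches no filter in a group is rejected exactly when the last installed filter of that group is an 'allow' filter. A standalone model of that rule (array-based, not kernel code):

/* Standalone model of damos_filters_default_reject()'s rule. */
#include <stdbool.h>
#include <stdio.h>

struct filter { bool allow; };

static bool default_reject(const struct filter *filters, int nr)
{
	if (!nr)
		return false;		/* no filters: allow everything */
	return filters[nr - 1].allow;	/* last filter allows: reject the rest */
}

int main(void)
{
	struct filter fs[] = { { .allow = false }, { .allow = true } };

	printf("%d\n", default_reject(fs, 2));	/* 1: unmatched memory is rejected */
	printf("%d\n", default_reject(fs, 0));	/* 0: no filters, nothing rejected */
	return 0;
}

Note that when ops-layer filters exist, the core layer's default is forced to 'allow', so the final decision on unmatched memory is deferred to the ops-layer filter group.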
@@ -868,6 +1009,11 @@ static int damon_commit_schemes(struct damon_ctx *dst, struct damon_ctx *src)
NUMA_NO_NODE);
if (!new_scheme)
return -ENOMEM;
+ err = damos_commit(new_scheme, src_scheme);
+ if (err) {
+ damon_destroy_scheme(new_scheme);
+ return err;
+ }
damon_add_scheme(dst, new_scheme);
}
return 0;
@@ -947,9 +1093,17 @@ static int damon_commit_targets(
if (err)
return err;
} else {
+ struct damos *s;
+
if (damon_target_has_pid(dst))
put_pid(dst_target->pid);
damon_destroy_target(dst_target);
+ damon_for_each_scheme(s, dst) {
+ if (s->quota.charge_target_from == dst_target) {
+ s->quota.charge_target_from = NULL;
+ s->quota.charge_addr_from = 0;
+ }
+ }
}
}
@@ -961,8 +1115,11 @@ static int damon_commit_targets(
return -ENOMEM;
err = damon_commit_target(new_target, false,
src_target, damon_target_has_pid(src));
- if (err)
+ if (err) {
+ damon_destroy_target(new_target);
return err;
+ }
+ damon_add_target(dst, new_target);
}
return 0;
}
@@ -1154,6 +1311,107 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs)
return err;
}
+static bool damon_is_running(struct damon_ctx *ctx)
+{
+ bool running;
+
+ mutex_lock(&ctx->kdamond_lock);
+ running = ctx->kdamond != NULL;
+ mutex_unlock(&ctx->kdamond_lock);
+ return running;
+}
+
+/**
+ * damon_call() - Invoke a given function on DAMON worker thread (kdamond).
+ * @ctx: DAMON context to call the function for.
+ * @control: Control variable of the call request.
+ *
+ * Ask DAMON worker thread (kdamond) of @ctx to call a function with an
+ * argument data that respectively passed via &damon_call_control->fn and
+ * &damon_call_control->data of @control, and wait until the kdamond finishes
+ * handling of the request.
+ *
+ * The kdamond executes the function with the argument in the main loop, just
+ * after a sampling of the iteration is finished. The function can hence
+ * safely access the internal data of the &struct damon_ctx without additional
+ * synchronization. The return value of the function will be saved in
+ * &damon_call_control->return_code.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damon_call(struct damon_ctx *ctx, struct damon_call_control *control)
+{
+ init_completion(&control->completion);
+ control->canceled = false;
+
+ mutex_lock(&ctx->call_control_lock);
+ if (ctx->call_control) {
+ mutex_unlock(&ctx->call_control_lock);
+ return -EBUSY;
+ }
+ ctx->call_control = control;
+ mutex_unlock(&ctx->call_control_lock);
+ if (!damon_is_running(ctx))
+ return -EINVAL;
+ wait_for_completion(&control->completion);
+ if (control->canceled)
+ return -ECANCELED;
+ return 0;
+}
+
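A hedged usage sketch for damon_call(); the callback name and what it does are hypothetical, error handling is elided, and the call must come from a context that can sleep since the function waits for a completion. The callback runs on the kdamond, so it can read the context without extra locking:

	static int hypothetical_count_regions(void *data)
	{
		struct damon_ctx *ctx = data;
		struct damon_target *t;
		int nr = 0;

		damon_for_each_target(t, ctx)
			nr += damon_nr_regions(t);
		pr_info("damon: %d regions\n", nr);
		return 0;
	}

	/* ... later, from sleepable context ... */
	struct damon_call_control control = {
		.fn = hypothetical_count_regions,
		.data = ctx,
	};

	if (damon_call(ctx, &control))
		pr_warn("damon_call() failed or was canceled\n");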
+/**
+ * damos_walk() - Invoke a given function while DAMOS walks regions.
+ * @ctx: DAMON context to call the functions for.
+ * @control: Control variable of the walk request.
+ *
+ * Ask DAMON worker thread (kdamond) of @ctx to call a function for each region
+ * that the kdamond will apply a DAMOS action to, and wait until the kdamond
+ * finishes handling of the request.
+ *
+ * The kdamond executes the given function in the main loop, for each region
+ * just after it applied any DAMOS actions of @ctx to it. The invocation is
+ * made only within one &damos->apply_interval_us since damos_walk()
+ * invocation, for each scheme. The given callback function can hence safely
+ * access the internal data of &struct damon_ctx and the &struct damon_region
+ * that each of the schemes will apply the action to in the next interval,
+ * without additional synchronization against the kdamond. If every scheme of @ctx
+ * passed at least one &damos->apply_interval_us, kdamond marks the request as
+ * completed so that damos_walk() can wake up and return.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control)
+{
+ init_completion(&control->completion);
+ control->canceled = false;
+ mutex_lock(&ctx->walk_control_lock);
+ if (ctx->walk_control) {
+ mutex_unlock(&ctx->walk_control_lock);
+ return -EBUSY;
+ }
+ ctx->walk_control = control;
+ mutex_unlock(&ctx->walk_control_lock);
+ if (!damon_is_running(ctx))
+ return -EINVAL;
+ wait_for_completion(&control->completion);
+ if (control->canceled)
+ return -ECANCELED;
+ return 0;
+}
+
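A similarly hedged sketch for damos_walk(); the accumulator callback is hypothetical, and the walk_fn prototype (void return) is only inferred from the call site later in this patch:

	static void hypothetical_account_region(void *data, struct damon_ctx *ctx,
			struct damon_target *t, struct damon_region *r,
			struct damos *s, unsigned long sz_filter_passed)
	{
		unsigned long *total = data;

		*total += sz_filter_passed;	/* bytes that passed the ops filters */
	}

	/* ... later, from sleepable context ... */
	unsigned long total = 0;
	struct damos_walk_control control = {
		.walk_fn = hypothetical_account_region,
		.data = &total,
	};

	if (!damos_walk(ctx, &control))
		pr_info("bytes passing ops filters: %lu\n", total);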
+/*
+ * Warn about and fix a corrupted ->nr_accesses[_bp], to aid investigation and
+ * to keep the problem from propagating.
+ */
+static void damon_warn_fix_nr_accesses_corruption(struct damon_region *r)
+{
+ if (r->nr_accesses_bp == r->nr_accesses * 10000)
+ return;
+ WARN_ONCE(true, "invalid nr_accesses_bp at reset: %u %u\n",
+ r->nr_accesses_bp, r->nr_accesses);
+ r->nr_accesses_bp = r->nr_accesses * 10000;
+}
+
/*
* Reset the aggregated monitoring results ('nr_accesses' of each region).
*/
@@ -1167,6 +1425,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
damon_for_each_region(r, t) {
trace_damon_aggregated(ti, r, damon_nr_regions(t));
+ damon_warn_fix_nr_accesses_corruption(r);
r->last_nr_accesses = r->nr_accesses;
r->nr_accesses = 0;
}
@@ -1174,6 +1433,65 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
}
}
+static unsigned long damon_get_intervals_score(struct damon_ctx *c)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ unsigned long sz_region, max_access_events = 0, access_events = 0;
+ unsigned long target_access_events;
+ unsigned long goal_bp = c->attrs.intervals_goal.access_bp;
+
+ damon_for_each_target(t, c) {
+ damon_for_each_region(r, t) {
+ sz_region = damon_sz_region(r);
+ max_access_events += sz_region * c->attrs.aggr_samples;
+ access_events += sz_region * r->nr_accesses;
+ }
+ }
+ target_access_events = max_access_events * goal_bp / 10000;
+ return access_events * 10000 / target_access_events;
+}
+
+static unsigned long damon_feed_loop_next_input(unsigned long last_input,
+ unsigned long score);
+
+static unsigned long damon_get_intervals_adaptation_bp(struct damon_ctx *c)
+{
+ unsigned long score_bp, adaptation_bp;
+
+ score_bp = damon_get_intervals_score(c);
+ adaptation_bp = damon_feed_loop_next_input(100000000, score_bp) /
+ 10000;
+ /*
+ * adaptation_bp ranges from 1 to 20,000. Avoid too rapid a reduction of
+ * the intervals by rescaling [1, 10,000] to [5,000, 10,000].
+ */
+ if (adaptation_bp <= 10000)
+ adaptation_bp = 5000 + adaptation_bp / 2;
+ return adaptation_bp;
+}
+
+static void kdamond_tune_intervals(struct damon_ctx *c)
+{
+ unsigned long adaptation_bp;
+ struct damon_attrs new_attrs;
+ struct damon_intervals_goal *goal;
+
+ adaptation_bp = damon_get_intervals_adaptation_bp(c);
+ if (adaptation_bp == 10000)
+ return;
+
+ new_attrs = c->attrs;
+ goal = &c->attrs.intervals_goal;
+ new_attrs.sample_interval = min(goal->max_sample_us,
+ c->attrs.sample_interval * adaptation_bp / 10000);
+ new_attrs.sample_interval = max(goal->min_sample_us,
+ new_attrs.sample_interval);
+ new_attrs.aggr_interval = new_attrs.sample_interval *
+ c->attrs.aggr_samples;
+ damon_set_attrs(c, &new_attrs);
+}
+
static void damon_split_region_at(struct damon_target *t,
struct damon_region *r, unsigned long sz_r);
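The tuning arithmetic, minus the feedback-loop step (damon_feed_loop_next_input() is not part of this hunk, so its output is simply assumed below), can be followed with a standalone example; all of the interval numbers are hypothetical:

/* Standalone walk-through of the kdamond_tune_intervals() scaling step. */
#include <stdio.h>

int main(void)
{
	unsigned long sample_us = 5000, aggr_samples = 20;	/* aggr = 100ms */
	unsigned long min_sample_us = 1000, max_sample_us = 100000;
	unsigned long adaptation_bp = 4000;	/* assume the feed loop asked for a cut */

	/* [1, 10,000] is rescaled to [5,000, 10,000] to avoid too rapid reductions */
	if (adaptation_bp <= 10000)
		adaptation_bp = 5000 + adaptation_bp / 2;	/* 7000 */

	unsigned long new_sample = sample_us * adaptation_bp / 10000;	/* 3500 */
	if (new_sample < min_sample_us)
		new_sample = min_sample_us;
	if (new_sample > max_sample_us)
		new_sample = max_sample_us;

	printf("sample: %lu -> %lu us, aggr: %lu -> %lu us\n",
	       sample_us, new_sample,
	       sample_us * aggr_samples, new_sample * aggr_samples);
	return 0;
}

The aggregation interval is kept at a fixed multiple (aggr_samples) of the sampling interval, so only the sampling interval is clamped to the goal's min/max range.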
@@ -1264,16 +1582,18 @@ static bool damos_skip_charged_region(struct damon_target *t,
}
static void damos_update_stat(struct damos *s,
- unsigned long sz_tried, unsigned long sz_applied)
+ unsigned long sz_tried, unsigned long sz_applied,
+ unsigned long sz_ops_filter_passed)
{
s->stat.nr_tried++;
s->stat.sz_tried += sz_tried;
if (sz_applied)
s->stat.nr_applied++;
s->stat.sz_applied += sz_applied;
+ s->stat.sz_ops_filter_passed += sz_ops_filter_passed;
}
-static bool __damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
+static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t,
struct damon_region *r, struct damos_filter *filter)
{
bool matched = false;
@@ -1326,11 +1646,102 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
{
struct damos_filter *filter;
+ s->core_filters_allowed = false;
damos_for_each_filter(filter, s) {
- if (__damos_filter_out(ctx, t, r, filter))
- return true;
+ if (damos_filter_match(ctx, t, r, filter)) {
+ if (filter->allow)
+ s->core_filters_allowed = true;
+ return !filter->allow;
+ }
}
- return false;
+ return s->core_filters_default_reject;
+}
+
+/*
+ * damos_walk_call_walk() - Call &damos_walk_control->walk_fn.
+ * @ctx: The context of &damon_ctx->walk_control.
+ * @t: The monitoring target of @r that @s will be applied.
+ * @r: The region of @t that @s will be applied.
+ * @s: The scheme of @ctx that will be applied to @r.
+ *
+ * This function is called from kdamond whenever it asked the operation set to
+ * apply a DAMOS scheme action to a region. If a DAMOS walk request is
+ * installed by damos_walk() and not yet uninstalled, invoke it.
+ */
+static void damos_walk_call_walk(struct damon_ctx *ctx, struct damon_target *t,
+ struct damon_region *r, struct damos *s,
+ unsigned long sz_filter_passed)
+{
+ struct damos_walk_control *control;
+
+ if (s->walk_completed)
+ return;
+
+ control = ctx->walk_control;
+ if (!control)
+ return;
+
+ control->walk_fn(control->data, ctx, t, r, s, sz_filter_passed);
+}
+
+/*
+ * damos_walk_complete() - Complete DAMOS walk request if all walks are done.
+ * @ctx: The context of &damon_ctx->walk_control.
+ * @s: A scheme of @ctx that all walks are now done.
+ *
+ * This function is called when kdamond has finished applying the action of a
+ * DAMOS scheme to all regions that are eligible for the given
+ * &damos->apply_interval_us. If every scheme of @ctx, including @s, has now
+ * finished walking for at least one &damos->apply_interval_us, this function
+ * marks the handling of the given DAMOS walk request as done, so that
+ * damos_walk() can wake up and return.
+ */
+static void damos_walk_complete(struct damon_ctx *ctx, struct damos *s)
+{
+ struct damos *siter;
+ struct damos_walk_control *control;
+
+ control = ctx->walk_control;
+ if (!control)
+ return;
+
+ s->walk_completed = true;
+ /* if all schemes completed, signal completion to walker */
+ damon_for_each_scheme(siter, ctx) {
+ if (!siter->walk_completed)
+ return;
+ }
+ damon_for_each_scheme(siter, ctx)
+ siter->walk_completed = false;
+
+ complete(&control->completion);
+ ctx->walk_control = NULL;
+}
+
+/*
+ * damos_walk_cancel() - Cancel the current DAMOS walk request.
+ * @ctx: The context of &damon_ctx->walk_control.
+ *
+ * This function is called when @ctx is deactivated by DAMOS watermarks, DAMOS
+ * walk is requested but there is no DAMOS scheme to walk for, or the kdamond
+ * is already out of the main loop and therefore going to be terminated, and hence
+ * cannot continue the walks. This function therefore marks the walk request
+ * as canceled, so that damos_walk() can wake up and return.
+ */
+static void damos_walk_cancel(struct damon_ctx *ctx)
+{
+ struct damos_walk_control *control;
+
+ mutex_lock(&ctx->walk_control_lock);
+ control = ctx->walk_control;
+ mutex_unlock(&ctx->walk_control_lock);
+
+ if (!control)
+ return;
+ control->canceled = true;
+ complete(&control->completion);
+ mutex_lock(&ctx->walk_control_lock);
+ ctx->walk_control = NULL;
+ mutex_unlock(&ctx->walk_control_lock);
}
static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
@@ -1340,7 +1751,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
unsigned long sz = damon_sz_region(r);
struct timespec64 begin, end;
unsigned long sz_applied = 0;
- int err = 0;
+ unsigned long sz_ops_filter_passed = 0;
/*
* We plan to support multiple context per kdamond, as DAMON sysfs
* implies with 'nr_contexts' file. Nevertheless, only single context
@@ -1380,13 +1791,11 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
if (damos_filter_out(c, t, r, s))
return;
ktime_get_coarse_ts64(&begin);
- if (c->callback.before_damos_apply)
- err = c->callback.before_damos_apply(c, t, r, s);
- if (!err) {
- trace_damos_before_apply(cidx, sidx, tidx, r,
- damon_nr_regions(t), do_trace);
- sz_applied = c->ops.apply_scheme(c, t, r, s);
- }
+ trace_damos_before_apply(cidx, sidx, tidx, r,
+ damon_nr_regions(t), do_trace);
+ sz_applied = c->ops.apply_scheme(c, t, r, s,
+ &sz_ops_filter_passed);
+ damos_walk_call_walk(c, t, r, s, sz_ops_filter_passed);
ktime_get_coarse_ts64(&end);
quota->total_charged_ns += timespec64_to_ns(&end) -
timespec64_to_ns(&begin);
@@ -1400,7 +1809,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
r->age = 0;
update_stat:
- damos_update_stat(s, sz, sz_applied);
+ damos_update_stat(s, sz, sz_applied, sz_ops_filter_passed);
}
static void damon_do_apply_schemes(struct damon_ctx *c,
@@ -1502,6 +1911,29 @@ static inline u64 damos_get_some_mem_psi_total(void)
#endif /* CONFIG_PSI */
+#ifdef CONFIG_NUMA
+static __kernel_ulong_t damos_get_node_mem_bp(
+ struct damos_quota_goal *goal)
+{
+ struct sysinfo i;
+ __kernel_ulong_t numerator;
+
+ si_meminfo_node(&i, goal->nid);
+ if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP)
+ numerator = i.totalram - i.freeram;
+ else /* DAMOS_QUOTA_NODE_MEM_FREE_BP */
+ numerator = i.freeram;
+ return numerator * 10000 / i.totalram;
+}
+#else
+static __kernel_ulong_t damos_get_node_mem_bp(
+ struct damos_quota_goal *goal)
+{
+ return 0;
+}
+#endif
+
+
static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
{
u64 now_psi_total;
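The new node memory metric is plain basis points of used (or free) memory on the node; a standalone illustration with made-up page counts:

/* Standalone illustration of the DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP metrics. */
#include <stdio.h>

int main(void)
{
	unsigned long totalram = 4UL << 20;	/* pages on the node, hypothetical */
	unsigned long freeram  = 1UL << 20;

	unsigned long used_bp = (totalram - freeram) * 10000 / totalram;
	unsigned long free_bp = freeram * 10000 / totalram;

	printf("used: %lu bp, free: %lu bp\n", used_bp, free_bp);	/* 7500, 2500 */
	return 0;
}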
@@ -1515,6 +1947,10 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
goal->current_value = now_psi_total - goal->last_psi_total;
goal->last_psi_total = now_psi_total;
break;
+ case DAMOS_QUOTA_NODE_MEM_USED_BP:
+ case DAMOS_QUOTA_NODE_MEM_FREE_BP:
+ goal->current_value = damos_get_node_mem_bp(goal);
+ break;
default:
break;
}
@@ -1542,7 +1978,7 @@ static unsigned long damos_quota_score(struct damos_quota *quota)
static void damos_set_effective_quota(struct damos_quota *quota)
{
unsigned long throughput;
- unsigned long esz;
+ unsigned long esz = ULONG_MAX;
if (!quota->ms && list_empty(&quota->goals)) {
quota->esz = quota->sz;
@@ -1564,10 +2000,7 @@ static void damos_set_effective_quota(struct damos_quota *quota)
quota->total_charged_ns;
else
throughput = PAGE_SIZE * 1024;
- if (!list_empty(&quota->goals))
- esz = min(throughput * quota->ms, esz);
- else
- esz = throughput * quota->ms;
+ esz = min(throughput * quota->ms, esz);
}
if (quota->sz && quota->sz < esz)
@@ -1650,6 +2083,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
if (!has_schemes_to_apply)
return;
+ mutex_lock(&c->walk_control_lock);
damon_for_each_target(t, c) {
damon_for_each_region_safe(r, next_r, t)
damon_do_apply_schemes(c, t, r);
@@ -1658,10 +2092,13 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
damon_for_each_scheme(s, c) {
if (c->passed_sample_intervals < s->next_apply_sis)
continue;
+ damos_walk_complete(c, s);
s->next_apply_sis = c->passed_sample_intervals +
(s->apply_interval_us ? s->apply_interval_us :
c->attrs.aggr_interval) / sample_interval;
+ s->last_applied = NULL;
}
+ mutex_unlock(&c->walk_control_lock);
}
/*
@@ -1886,9 +2323,8 @@ static unsigned long damos_wmark_wait_us(struct damos *scheme)
if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) {
if (scheme->wmarks.activated)
pr_debug("deactivate a scheme (%d) for %s wmark\n",
- scheme->action,
- metric > scheme->wmarks.high ?
- "high" : "low");
+ scheme->action,
+ str_high_low(metric > scheme->wmarks.high));
scheme->wmarks.activated = false;
return scheme->wmarks.interval;
}
@@ -1912,6 +2348,39 @@ static void kdamond_usleep(unsigned long usecs)
usleep_range_idle(usecs, usecs + 1);
}
+/*
+ * kdamond_call() - handle damon_call_control.
+ * @ctx: The &struct damon_ctx of the kdamond.
+ * @cancel: Whether to cancel the invocation of the function.
+ *
+ * If there is a &struct damon_call_control request that registered via
+ * &damon_call() on @ctx, do or cancel the invocation of the function depending
+ * on @cancel. @cancel is set when the kdamond is deactivated by DAMOS
+ * watermarks, or the kdamond is already out of the main loop and therefore
+ * will be terminated.
+ */
+static void kdamond_call(struct damon_ctx *ctx, bool cancel)
+{
+ struct damon_call_control *control;
+ int ret = 0;
+
+ mutex_lock(&ctx->call_control_lock);
+ control = ctx->call_control;
+ mutex_unlock(&ctx->call_control_lock);
+ if (!control)
+ return;
+ if (cancel) {
+ control->canceled = true;
+ } else {
+ ret = control->fn(control->data);
+ control->return_code = ret;
+ }
+ complete(&control->completion);
+ mutex_lock(&ctx->call_control_lock);
+ ctx->call_control = NULL;
+ mutex_unlock(&ctx->call_control_lock);
+}
+
/* Returns negative error code if it's not activated but should return */
static int kdamond_wait_activation(struct damon_ctx *ctx)
{
@@ -1936,11 +2405,13 @@ static int kdamond_wait_activation(struct damon_ctx *ctx)
if (ctx->callback.after_wmarks_check &&
ctx->callback.after_wmarks_check(ctx))
break;
+ kdamond_call(ctx, true);
+ damos_walk_cancel(ctx);
}
return -EBUSY;
}
-static void kdamond_init_intervals_sis(struct damon_ctx *ctx)
+static void kdamond_init_ctx(struct damon_ctx *ctx)
{
unsigned long sample_interval = ctx->attrs.sample_interval ?
ctx->attrs.sample_interval : 1;
@@ -1951,11 +2422,14 @@ static void kdamond_init_intervals_sis(struct damon_ctx *ctx)
ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval;
ctx->next_ops_update_sis = ctx->attrs.ops_update_interval /
sample_interval;
+ ctx->next_intervals_tune_sis = ctx->next_aggregation_sis *
+ ctx->attrs.intervals_goal.aggrs;
damon_for_each_scheme(scheme, ctx) {
apply_interval = scheme->apply_interval_us ?
scheme->apply_interval_us : ctx->attrs.aggr_interval;
scheme->next_apply_sis = apply_interval / sample_interval;
+ damos_set_filters_default_reject(scheme);
}
}
@@ -1973,12 +2447,10 @@ static int kdamond_fn(void *data)
pr_debug("kdamond (%d) starts\n", current->pid);
complete(&ctx->kdamond_started);
- kdamond_init_intervals_sis(ctx);
+ kdamond_init_ctx(ctx);
if (ctx->ops.init)
ctx->ops.init(ctx);
- if (ctx->callback.before_start && ctx->callback.before_start(ctx))
- goto done;
ctx->regions_score_histogram = kmalloc_array(DAMOS_MAX_SCORE + 1,
sizeof(*ctx->regions_score_histogram), GFP_KERNEL);
if (!ctx->regions_score_histogram)
@@ -2003,9 +2475,6 @@ static int kdamond_fn(void *data)
if (ctx->ops.prepare_access_checks)
ctx->ops.prepare_access_checks(ctx);
- if (ctx->callback.after_sampling &&
- ctx->callback.after_sampling(ctx))
- break;
kdamond_usleep(sample_interval);
ctx->passed_sample_intervals++;
@@ -2023,22 +2492,52 @@ static int kdamond_fn(void *data)
}
/*
- * do kdamond_apply_schemes() after kdamond_merge_regions() if
- * possible, to reduce overhead
+ * do kdamond_call() and kdamond_apply_schemes() after
+ * kdamond_merge_regions() if possible, to reduce overhead
*/
+ kdamond_call(ctx, false);
if (!list_empty(&ctx->schemes))
kdamond_apply_schemes(ctx);
+ else
+ damos_walk_cancel(ctx);
sample_interval = ctx->attrs.sample_interval ?
ctx->attrs.sample_interval : 1;
if (ctx->passed_sample_intervals >= next_aggregation_sis) {
+ if (ctx->attrs.intervals_goal.aggrs &&
+ ctx->passed_sample_intervals >=
+ ctx->next_intervals_tune_sis) {
+ /*
+ * ctx->next_aggregation_sis might be updated
+ * from kdamond_call(). In that case,
+ * damon_set_attrs(), which will be called from
+ * kdamond_tune_intervals(), may wrongly think
+ * this is in the middle of the current
+ * aggregation, and reset the aggregation
+ * information for all regions. Then, the
+ * following kdamond_reset_aggregated() call
+ * will make the region information invalid,
+ * particularly ->nr_accesses_bp.
+ *
+ * Reset ->next_aggregation_sis to avoid that.
+ * It will anyway be correctly updated after
+ * this if clause.
+ */
+ ctx->next_aggregation_sis =
+ next_aggregation_sis;
+ ctx->next_intervals_tune_sis +=
+ ctx->attrs.aggr_samples *
+ ctx->attrs.intervals_goal.aggrs;
+ kdamond_tune_intervals(ctx);
+ sample_interval = ctx->attrs.sample_interval ?
+ ctx->attrs.sample_interval : 1;
+
+ }
ctx->next_aggregation_sis = next_aggregation_sis +
ctx->attrs.aggr_interval / sample_interval;
kdamond_reset_aggregated(ctx);
kdamond_split_regions(ctx);
- if (ctx->ops.reset_aggregated)
- ctx->ops.reset_aggregated(ctx);
}
if (ctx->passed_sample_intervals >= next_ops_update_sis) {
@@ -2067,6 +2566,9 @@ done:
ctx->kdamond = NULL;
mutex_unlock(&ctx->kdamond_lock);
+ kdamond_call(ctx, true);
+ damos_walk_cancel(ctx);
+
mutex_lock(&damon_lock);
nr_running_ctxs--;
if (!nr_running_ctxs && running_exclusive_ctxs)
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
deleted file mode 100644
index b4213bc47e44..000000000000
--- a/mm/damon/dbgfs.c
+++ /dev/null
@@ -1,1148 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * DAMON Debugfs Interface
- *
- * Author: SeongJae Park <sj@kernel.org>
- */
-
-#define pr_fmt(fmt) "damon-dbgfs: " fmt
-
-#include <linux/damon.h>
-#include <linux/debugfs.h>
-#include <linux/file.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/page_idle.h>
-#include <linux/slab.h>
-
-#define DAMON_DBGFS_DEPRECATION_NOTICE \
- "DAMON debugfs interface is deprecated, so users should move " \
- "to DAMON_SYSFS. If you cannot, please report your usecase to " \
- "damon@lists.linux.dev and linux-mm@kvack.org.\n"
-
-static struct damon_ctx **dbgfs_ctxs;
-static int dbgfs_nr_ctxs;
-static struct dentry **dbgfs_dirs;
-static DEFINE_MUTEX(damon_dbgfs_lock);
-
-static void damon_dbgfs_warn_deprecation(void)
-{
- pr_warn_once(DAMON_DBGFS_DEPRECATION_NOTICE);
-}
-
-/*
- * Returns non-empty string on success, negative error code otherwise.
- */
-static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos)
-{
- char *kbuf;
- ssize_t ret;
-
- /* We do not accept continuous write */
- if (*ppos)
- return ERR_PTR(-EINVAL);
-
- kbuf = kmalloc(count + 1, GFP_KERNEL | __GFP_NOWARN);
- if (!kbuf)
- return ERR_PTR(-ENOMEM);
-
- ret = simple_write_to_buffer(kbuf, count + 1, ppos, buf, count);
- if (ret != count) {
- kfree(kbuf);
- return ERR_PTR(-EIO);
- }
- kbuf[ret] = '\0';
-
- return kbuf;
-}
-
-static ssize_t dbgfs_attrs_read(struct file *file,
- char __user *buf, size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char kbuf[128];
- int ret;
-
- mutex_lock(&ctx->kdamond_lock);
- ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n",
- ctx->attrs.sample_interval, ctx->attrs.aggr_interval,
- ctx->attrs.ops_update_interval,
- ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions);
- mutex_unlock(&ctx->kdamond_lock);
-
- return simple_read_from_buffer(buf, count, ppos, kbuf, ret);
-}
-
-static ssize_t dbgfs_attrs_write(struct file *file,
- const char __user *buf, size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- struct damon_attrs attrs;
- char *kbuf;
- ssize_t ret;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- if (sscanf(kbuf, "%lu %lu %lu %lu %lu",
- &attrs.sample_interval, &attrs.aggr_interval,
- &attrs.ops_update_interval,
- &attrs.min_nr_regions,
- &attrs.max_nr_regions) != 5) {
- ret = -EINVAL;
- goto out;
- }
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond) {
- ret = -EBUSY;
- goto unlock_out;
- }
-
- ret = damon_set_attrs(ctx, &attrs);
- if (!ret)
- ret = count;
-unlock_out:
- mutex_unlock(&ctx->kdamond_lock);
-out:
- kfree(kbuf);
- return ret;
-}
-
-/*
- * Return corresponding dbgfs' scheme action value (int) for the given
- * damos_action if the given damos_action value is valid and supported by
- * dbgfs, negative error code otherwise.
- */
-static int damos_action_to_dbgfs_scheme_action(enum damos_action action)
-{
- switch (action) {
- case DAMOS_WILLNEED:
- return 0;
- case DAMOS_COLD:
- return 1;
- case DAMOS_PAGEOUT:
- return 2;
- case DAMOS_HUGEPAGE:
- return 3;
- case DAMOS_NOHUGEPAGE:
- return 4;
- case DAMOS_STAT:
- return 5;
- default:
- return -EINVAL;
- }
-}
-
-static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
-{
- struct damos *s;
- int written = 0;
- int rc;
-
- damon_for_each_scheme(s, c) {
- rc = scnprintf(&buf[written], len - written,
- "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
- s->pattern.min_sz_region,
- s->pattern.max_sz_region,
- s->pattern.min_nr_accesses,
- s->pattern.max_nr_accesses,
- s->pattern.min_age_region,
- s->pattern.max_age_region,
- damos_action_to_dbgfs_scheme_action(s->action),
- s->quota.ms, s->quota.sz,
- s->quota.reset_interval,
- s->quota.weight_sz,
- s->quota.weight_nr_accesses,
- s->quota.weight_age,
- s->wmarks.metric, s->wmarks.interval,
- s->wmarks.high, s->wmarks.mid, s->wmarks.low,
- s->stat.nr_tried, s->stat.sz_tried,
- s->stat.nr_applied, s->stat.sz_applied,
- s->stat.qt_exceeds);
- if (!rc)
- return -ENOMEM;
-
- written += rc;
- }
- return written;
-}
-
-static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char *kbuf;
- ssize_t len;
-
- kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN);
- if (!kbuf)
- return -ENOMEM;
-
- mutex_lock(&ctx->kdamond_lock);
- len = sprint_schemes(ctx, kbuf, count);
- mutex_unlock(&ctx->kdamond_lock);
- if (len < 0)
- goto out;
- len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
-
-out:
- kfree(kbuf);
- return len;
-}
-
-static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes)
-{
- ssize_t i;
-
- for (i = 0; i < nr_schemes; i++)
- kfree(schemes[i]);
- kfree(schemes);
-}
-
-/*
- * Return corresponding damos_action for the given dbgfs input for a scheme
- * action if the input is valid, negative error code otherwise.
- */
-static enum damos_action dbgfs_scheme_action_to_damos_action(int dbgfs_action)
-{
- switch (dbgfs_action) {
- case 0:
- return DAMOS_WILLNEED;
- case 1:
- return DAMOS_COLD;
- case 2:
- return DAMOS_PAGEOUT;
- case 3:
- return DAMOS_HUGEPAGE;
- case 4:
- return DAMOS_NOHUGEPAGE;
- case 5:
- return DAMOS_STAT;
- default:
- return -EINVAL;
- }
-}
-
-/*
- * Converts a string into an array of struct damos pointers
- *
- * Returns an array of struct damos pointers that converted if the conversion
- * success, or NULL otherwise.
- */
-static struct damos **str_to_schemes(const char *str, ssize_t len,
- ssize_t *nr_schemes)
-{
- struct damos *scheme, **schemes;
- const int max_nr_schemes = 256;
- int pos = 0, parsed, ret;
- unsigned int action_input;
- enum damos_action action;
-
- schemes = kmalloc_array(max_nr_schemes, sizeof(scheme),
- GFP_KERNEL);
- if (!schemes)
- return NULL;
-
- *nr_schemes = 0;
- while (pos < len && *nr_schemes < max_nr_schemes) {
- struct damos_access_pattern pattern = {};
- struct damos_quota quota = {};
- struct damos_watermarks wmarks;
-
- ret = sscanf(&str[pos],
- "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n",
- &pattern.min_sz_region, &pattern.max_sz_region,
- &pattern.min_nr_accesses,
- &pattern.max_nr_accesses,
- &pattern.min_age_region,
- &pattern.max_age_region,
- &action_input, &quota.ms,
- &quota.sz, &quota.reset_interval,
- &quota.weight_sz, &quota.weight_nr_accesses,
- &quota.weight_age, &wmarks.metric,
- &wmarks.interval, &wmarks.high, &wmarks.mid,
- &wmarks.low, &parsed);
- if (ret != 18)
- break;
- action = dbgfs_scheme_action_to_damos_action(action_input);
- if ((int)action < 0)
- goto fail;
-
- if (pattern.min_sz_region > pattern.max_sz_region ||
- pattern.min_nr_accesses > pattern.max_nr_accesses ||
- pattern.min_age_region > pattern.max_age_region)
- goto fail;
-
- if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low ||
- wmarks.mid < wmarks.low)
- goto fail;
-
- pos += parsed;
- scheme = damon_new_scheme(&pattern, action, 0, &quota,
- &wmarks, NUMA_NO_NODE);
- if (!scheme)
- goto fail;
-
- schemes[*nr_schemes] = scheme;
- *nr_schemes += 1;
- }
- return schemes;
-fail:
- free_schemes_arr(schemes, *nr_schemes);
- return NULL;
-}
-
-static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char *kbuf;
- struct damos **schemes;
- ssize_t nr_schemes = 0, ret;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- schemes = str_to_schemes(kbuf, count, &nr_schemes);
- if (!schemes) {
- ret = -EINVAL;
- goto out;
- }
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond) {
- ret = -EBUSY;
- goto unlock_out;
- }
-
- damon_set_schemes(ctx, schemes, nr_schemes);
- ret = count;
- nr_schemes = 0;
-
-unlock_out:
- mutex_unlock(&ctx->kdamond_lock);
- free_schemes_arr(schemes, nr_schemes);
-out:
- kfree(kbuf);
- return ret;
-}
-
-static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len)
-{
- struct damon_target *t;
- int id;
- int written = 0;
- int rc;
-
- damon_for_each_target(t, ctx) {
- if (damon_target_has_pid(ctx))
- /* Show pid numbers to debugfs users */
- id = pid_vnr(t->pid);
- else
- /* Show 42 for physical address space, just for fun */
- id = 42;
-
- rc = scnprintf(&buf[written], len - written, "%d ", id);
- if (!rc)
- return -ENOMEM;
- written += rc;
- }
- if (written)
- written -= 1;
- written += scnprintf(&buf[written], len - written, "\n");
- return written;
-}
-
-static ssize_t dbgfs_target_ids_read(struct file *file,
- char __user *buf, size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- ssize_t len;
- char ids_buf[320];
-
- mutex_lock(&ctx->kdamond_lock);
- len = sprint_target_ids(ctx, ids_buf, 320);
- mutex_unlock(&ctx->kdamond_lock);
- if (len < 0)
- return len;
-
- return simple_read_from_buffer(buf, count, ppos, ids_buf, len);
-}
-
-/*
- * Converts a string into an integers array
- *
- * Returns an array of integers array if the conversion success, or NULL
- * otherwise.
- */
-static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints)
-{
- int *array;
- const int max_nr_ints = 32;
- int nr;
- int pos = 0, parsed, ret;
-
- *nr_ints = 0;
- array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL);
- if (!array)
- return NULL;
- while (*nr_ints < max_nr_ints && pos < len) {
- ret = sscanf(&str[pos], "%d%n", &nr, &parsed);
- pos += parsed;
- if (ret != 1)
- break;
- array[*nr_ints] = nr;
- *nr_ints += 1;
- }
-
- return array;
-}
-
-static void dbgfs_put_pids(struct pid **pids, int nr_pids)
-{
- int i;
-
- for (i = 0; i < nr_pids; i++)
- put_pid(pids[i]);
-}
-
-/*
- * Converts a string into an struct pid pointers array
- *
- * Returns an array of struct pid pointers if the conversion success, or NULL
- * otherwise.
- */
-static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids)
-{
- int *ints;
- ssize_t nr_ints;
- struct pid **pids;
-
- *nr_pids = 0;
-
- ints = str_to_ints(str, len, &nr_ints);
- if (!ints)
- return NULL;
-
- pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL);
- if (!pids)
- goto out;
-
- for (; *nr_pids < nr_ints; (*nr_pids)++) {
- pids[*nr_pids] = find_get_pid(ints[*nr_pids]);
- if (!pids[*nr_pids]) {
- dbgfs_put_pids(pids, *nr_pids);
- kfree(ints);
- kfree(pids);
- return NULL;
- }
- }
-
-out:
- kfree(ints);
- return pids;
-}
-
-/*
- * dbgfs_set_targets() - Set monitoring targets.
- * @ctx: monitoring context
- * @nr_targets: number of targets
- * @pids: array of target pids (size is same to @nr_targets)
- *
- * This function should not be called while the kdamond is running. @pids is
- * ignored if the context is not configured to have pid in each target. On
- * failure, reference counts of all pids in @pids are decremented.
- *
- * Return: 0 on success, negative error code otherwise.
- */
-static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets,
- struct pid **pids)
-{
- ssize_t i;
- struct damon_target *t, *next;
-
- damon_for_each_target_safe(t, next, ctx) {
- if (damon_target_has_pid(ctx))
- put_pid(t->pid);
- damon_destroy_target(t);
- }
-
- for (i = 0; i < nr_targets; i++) {
- t = damon_new_target();
- if (!t) {
- damon_for_each_target_safe(t, next, ctx)
- damon_destroy_target(t);
- if (damon_target_has_pid(ctx))
- dbgfs_put_pids(pids, nr_targets);
- return -ENOMEM;
- }
- if (damon_target_has_pid(ctx))
- t->pid = pids[i];
- damon_add_target(ctx, t);
- }
-
- return 0;
-}
-
-static ssize_t dbgfs_target_ids_write(struct file *file,
- const char __user *buf, size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- bool id_is_pid = true;
- char *kbuf;
- struct pid **target_pids = NULL;
- ssize_t nr_targets;
- ssize_t ret;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- if (!strncmp(kbuf, "paddr\n", count)) {
- id_is_pid = false;
- nr_targets = 1;
- }
-
- if (id_is_pid) {
- target_pids = str_to_pids(kbuf, count, &nr_targets);
- if (!target_pids) {
- ret = -ENOMEM;
- goto out;
- }
- }
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond) {
- if (id_is_pid)
- dbgfs_put_pids(target_pids, nr_targets);
- ret = -EBUSY;
- goto unlock_out;
- }
-
- /* remove previously set targets */
- dbgfs_set_targets(ctx, 0, NULL);
- if (!nr_targets) {
- ret = count;
- goto unlock_out;
- }
-
- /* Configure the context for the address space type */
- if (id_is_pid)
- ret = damon_select_ops(ctx, DAMON_OPS_VADDR);
- else
- ret = damon_select_ops(ctx, DAMON_OPS_PADDR);
- if (ret)
- goto unlock_out;
-
- ret = dbgfs_set_targets(ctx, nr_targets, target_pids);
- if (!ret)
- ret = count;
-
-unlock_out:
- mutex_unlock(&ctx->kdamond_lock);
- kfree(target_pids);
-out:
- kfree(kbuf);
- return ret;
-}
-
-static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len)
-{
- struct damon_target *t;
- struct damon_region *r;
- int target_idx = 0;
- int written = 0;
- int rc;
-
- damon_for_each_target(t, c) {
- damon_for_each_region(r, t) {
- rc = scnprintf(&buf[written], len - written,
- "%d %lu %lu\n",
- target_idx, r->ar.start, r->ar.end);
- if (!rc)
- return -ENOMEM;
- written += rc;
- }
- target_idx++;
- }
- return written;
-}
-
-static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char *kbuf;
- ssize_t len;
-
- kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN);
- if (!kbuf)
- return -ENOMEM;
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond) {
- mutex_unlock(&ctx->kdamond_lock);
- len = -EBUSY;
- goto out;
- }
-
- len = sprint_init_regions(ctx, kbuf, count);
- mutex_unlock(&ctx->kdamond_lock);
- if (len < 0)
- goto out;
- len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
-
-out:
- kfree(kbuf);
- return len;
-}
-
-static int add_init_region(struct damon_ctx *c, int target_idx,
- struct damon_addr_range *ar)
-{
- struct damon_target *t;
- struct damon_region *r, *prev;
- unsigned long idx = 0;
- int rc = -EINVAL;
-
- if (ar->start >= ar->end)
- return -EINVAL;
-
- damon_for_each_target(t, c) {
- if (idx++ == target_idx) {
- r = damon_new_region(ar->start, ar->end);
- if (!r)
- return -ENOMEM;
- damon_add_region(r, t);
- if (damon_nr_regions(t) > 1) {
- prev = damon_prev_region(r);
- if (prev->ar.end > r->ar.start) {
- damon_destroy_region(r, t);
- return -EINVAL;
- }
- }
- rc = 0;
- }
- }
- return rc;
-}
-
-static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len)
-{
- struct damon_target *t;
- struct damon_region *r, *next;
- int pos = 0, parsed, ret;
- int target_idx;
- struct damon_addr_range ar;
- int err;
-
- damon_for_each_target(t, c) {
- damon_for_each_region_safe(r, next, t)
- damon_destroy_region(r, t);
- }
-
- while (pos < len) {
- ret = sscanf(&str[pos], "%d %lu %lu%n",
- &target_idx, &ar.start, &ar.end, &parsed);
- if (ret != 3)
- break;
- err = add_init_region(c, target_idx, &ar);
- if (err)
- goto fail;
- pos += parsed;
- }
-
- return 0;
-
-fail:
- damon_for_each_target(t, c) {
- damon_for_each_region_safe(r, next, t)
- damon_destroy_region(r, t);
- }
- return err;
-}
-
-static ssize_t dbgfs_init_regions_write(struct file *file,
- const char __user *buf, size_t count,
- loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char *kbuf;
- ssize_t ret = count;
- int err;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond) {
- ret = -EBUSY;
- goto unlock_out;
- }
-
- err = set_init_regions(ctx, kbuf, ret);
- if (err)
- ret = err;
-
-unlock_out:
- mutex_unlock(&ctx->kdamond_lock);
- kfree(kbuf);
- return ret;
-}
-
-static ssize_t dbgfs_kdamond_pid_read(struct file *file,
- char __user *buf, size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char *kbuf;
- ssize_t len;
-
- kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN);
- if (!kbuf)
- return -ENOMEM;
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond)
- len = scnprintf(kbuf, count, "%d\n", ctx->kdamond->pid);
- else
- len = scnprintf(kbuf, count, "none\n");
- mutex_unlock(&ctx->kdamond_lock);
- if (!len)
- goto out;
- len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
-
-out:
- kfree(kbuf);
- return len;
-}
-
-static int damon_dbgfs_open(struct inode *inode, struct file *file)
-{
- damon_dbgfs_warn_deprecation();
-
- file->private_data = inode->i_private;
-
- return nonseekable_open(inode, file);
-}
-
-static const struct file_operations attrs_fops = {
- .open = damon_dbgfs_open,
- .read = dbgfs_attrs_read,
- .write = dbgfs_attrs_write,
-};
-
-static const struct file_operations schemes_fops = {
- .open = damon_dbgfs_open,
- .read = dbgfs_schemes_read,
- .write = dbgfs_schemes_write,
-};
-
-static const struct file_operations target_ids_fops = {
- .open = damon_dbgfs_open,
- .read = dbgfs_target_ids_read,
- .write = dbgfs_target_ids_write,
-};
-
-static const struct file_operations init_regions_fops = {
- .open = damon_dbgfs_open,
- .read = dbgfs_init_regions_read,
- .write = dbgfs_init_regions_write,
-};
-
-static const struct file_operations kdamond_pid_fops = {
- .open = damon_dbgfs_open,
- .read = dbgfs_kdamond_pid_read,
-};
-
-static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx)
-{
- const char * const file_names[] = {"attrs", "schemes", "target_ids",
- "init_regions", "kdamond_pid"};
- const struct file_operations *fops[] = {&attrs_fops, &schemes_fops,
- &target_ids_fops, &init_regions_fops, &kdamond_pid_fops};
- int i;
-
- for (i = 0; i < ARRAY_SIZE(file_names); i++)
- debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]);
-}
-
-static void dbgfs_before_terminate(struct damon_ctx *ctx)
-{
- struct damon_target *t, *next;
-
- if (!damon_target_has_pid(ctx))
- return;
-
- mutex_lock(&ctx->kdamond_lock);
- damon_for_each_target_safe(t, next, ctx) {
- put_pid(t->pid);
- damon_destroy_target(t);
- }
- mutex_unlock(&ctx->kdamond_lock);
-}
-
-static struct damon_ctx *dbgfs_new_ctx(void)
-{
- struct damon_ctx *ctx;
-
- ctx = damon_new_ctx();
- if (!ctx)
- return NULL;
-
- if (damon_select_ops(ctx, DAMON_OPS_VADDR) &&
- damon_select_ops(ctx, DAMON_OPS_PADDR)) {
- damon_destroy_ctx(ctx);
- return NULL;
- }
- ctx->callback.before_terminate = dbgfs_before_terminate;
- return ctx;
-}
-
-static void dbgfs_destroy_ctx(struct damon_ctx *ctx)
-{
- damon_destroy_ctx(ctx);
-}
-
-static ssize_t damon_dbgfs_deprecated_read(struct file *file,
- char __user *buf, size_t count, loff_t *ppos)
-{
- static const char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE;
-
- return simple_read_from_buffer(buf, count, ppos, kbuf, strlen(kbuf));
-}
-
-/*
- * Make a context of @name and create a debugfs directory for it.
- *
- * This function should be called while holding damon_dbgfs_lock.
- *
- * Returns 0 on success, negative error code otherwise.
- */
-static int dbgfs_mk_context(char *name)
-{
- struct dentry *root, **new_dirs, *new_dir;
- struct damon_ctx **new_ctxs, *new_ctx;
-
- if (damon_nr_running_ctxs())
- return -EBUSY;
-
- new_ctxs = krealloc(dbgfs_ctxs, sizeof(*dbgfs_ctxs) *
- (dbgfs_nr_ctxs + 1), GFP_KERNEL);
- if (!new_ctxs)
- return -ENOMEM;
- dbgfs_ctxs = new_ctxs;
-
- new_dirs = krealloc(dbgfs_dirs, sizeof(*dbgfs_dirs) *
- (dbgfs_nr_ctxs + 1), GFP_KERNEL);
- if (!new_dirs)
- return -ENOMEM;
- dbgfs_dirs = new_dirs;
-
- root = dbgfs_dirs[0];
- if (!root)
- return -ENOENT;
-
- new_dir = debugfs_create_dir(name, root);
- /* Below check is required for a potential duplicated name case */
- if (IS_ERR(new_dir))
- return PTR_ERR(new_dir);
- dbgfs_dirs[dbgfs_nr_ctxs] = new_dir;
-
- new_ctx = dbgfs_new_ctx();
- if (!new_ctx) {
- debugfs_remove(new_dir);
- dbgfs_dirs[dbgfs_nr_ctxs] = NULL;
- return -ENOMEM;
- }
-
- dbgfs_ctxs[dbgfs_nr_ctxs] = new_ctx;
- dbgfs_fill_ctx_dir(dbgfs_dirs[dbgfs_nr_ctxs],
- dbgfs_ctxs[dbgfs_nr_ctxs]);
- dbgfs_nr_ctxs++;
-
- return 0;
-}
-
-static ssize_t dbgfs_mk_context_write(struct file *file,
- const char __user *buf, size_t count, loff_t *ppos)
-{
- char *kbuf;
- char *ctx_name;
- ssize_t ret;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- ctx_name = kmalloc(count + 1, GFP_KERNEL);
- if (!ctx_name) {
- kfree(kbuf);
- return -ENOMEM;
- }
-
- /* Trim white space */
- if (sscanf(kbuf, "%s", ctx_name) != 1) {
- ret = -EINVAL;
- goto out;
- }
-
- mutex_lock(&damon_dbgfs_lock);
- ret = dbgfs_mk_context(ctx_name);
- if (!ret)
- ret = count;
- mutex_unlock(&damon_dbgfs_lock);
-
-out:
- kfree(kbuf);
- kfree(ctx_name);
- return ret;
-}
-
-/*
- * Remove a context of @name and its debugfs directory.
- *
- * This function should be called while holding damon_dbgfs_lock.
- *
- * Return 0 on success, negative error code otherwise.
- */
-static int dbgfs_rm_context(char *name)
-{
- struct dentry *root, *dir, **new_dirs;
- struct inode *inode;
- struct damon_ctx **new_ctxs;
- int i, j;
- int ret = 0;
-
- if (damon_nr_running_ctxs())
- return -EBUSY;
-
- root = dbgfs_dirs[0];
- if (!root)
- return -ENOENT;
-
- dir = debugfs_lookup(name, root);
- if (!dir)
- return -ENOENT;
-
- inode = d_inode(dir);
- if (!S_ISDIR(inode->i_mode)) {
- ret = -EINVAL;
- goto out_dput;
- }
-
- new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs),
- GFP_KERNEL);
- if (!new_dirs) {
- ret = -ENOMEM;
- goto out_dput;
- }
-
- new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs),
- GFP_KERNEL);
- if (!new_ctxs) {
- ret = -ENOMEM;
- goto out_new_dirs;
- }
-
- for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) {
- if (dbgfs_dirs[i] == dir) {
- debugfs_remove(dbgfs_dirs[i]);
- dbgfs_destroy_ctx(dbgfs_ctxs[i]);
- continue;
- }
- new_dirs[j] = dbgfs_dirs[i];
- new_ctxs[j++] = dbgfs_ctxs[i];
- }
-
- kfree(dbgfs_dirs);
- kfree(dbgfs_ctxs);
-
- dbgfs_dirs = new_dirs;
- dbgfs_ctxs = new_ctxs;
- dbgfs_nr_ctxs--;
-
- goto out_dput;
-
-out_new_dirs:
- kfree(new_dirs);
-out_dput:
- dput(dir);
- return ret;
-}
-
-static ssize_t dbgfs_rm_context_write(struct file *file,
- const char __user *buf, size_t count, loff_t *ppos)
-{
- char *kbuf;
- ssize_t ret;
- char *ctx_name;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- ctx_name = kmalloc(count + 1, GFP_KERNEL);
- if (!ctx_name) {
- kfree(kbuf);
- return -ENOMEM;
- }
-
- /* Trim white space */
- if (sscanf(kbuf, "%s", ctx_name) != 1) {
- ret = -EINVAL;
- goto out;
- }
-
- mutex_lock(&damon_dbgfs_lock);
- ret = dbgfs_rm_context(ctx_name);
- if (!ret)
- ret = count;
- mutex_unlock(&damon_dbgfs_lock);
-
-out:
- kfree(kbuf);
- kfree(ctx_name);
- return ret;
-}
-
-static ssize_t dbgfs_monitor_on_read(struct file *file,
- char __user *buf, size_t count, loff_t *ppos)
-{
- char monitor_on_buf[5];
- bool monitor_on = damon_nr_running_ctxs() != 0;
- int len;
-
- len = scnprintf(monitor_on_buf, 5, monitor_on ? "on\n" : "off\n");
-
- return simple_read_from_buffer(buf, count, ppos, monitor_on_buf, len);
-}
-
-static ssize_t dbgfs_monitor_on_write(struct file *file,
- const char __user *buf, size_t count, loff_t *ppos)
-{
- ssize_t ret;
- char *kbuf;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- /* Remove white space */
- if (sscanf(kbuf, "%s", kbuf) != 1) {
- kfree(kbuf);
- return -EINVAL;
- }
-
- mutex_lock(&damon_dbgfs_lock);
- if (!strncmp(kbuf, "on", count)) {
- int i;
-
- for (i = 0; i < dbgfs_nr_ctxs; i++) {
- if (damon_targets_empty(dbgfs_ctxs[i])) {
- kfree(kbuf);
- mutex_unlock(&damon_dbgfs_lock);
- return -EINVAL;
- }
- }
- ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs, true);
- } else if (!strncmp(kbuf, "off", count)) {
- ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs);
- } else {
- ret = -EINVAL;
- }
- mutex_unlock(&damon_dbgfs_lock);
-
- if (!ret)
- ret = count;
- kfree(kbuf);
- return ret;
-}
-
-static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file)
-{
- damon_dbgfs_warn_deprecation();
- return nonseekable_open(inode, file);
-}
-
-static const struct file_operations deprecated_fops = {
- .read = damon_dbgfs_deprecated_read,
-};
-
-static const struct file_operations mk_contexts_fops = {
- .open = damon_dbgfs_static_file_open,
- .write = dbgfs_mk_context_write,
-};
-
-static const struct file_operations rm_contexts_fops = {
- .open = damon_dbgfs_static_file_open,
- .write = dbgfs_rm_context_write,
-};
-
-static const struct file_operations monitor_on_fops = {
- .open = damon_dbgfs_static_file_open,
- .read = dbgfs_monitor_on_read,
- .write = dbgfs_monitor_on_write,
-};
-
-static int __init __damon_dbgfs_init(void)
-{
- struct dentry *dbgfs_root;
- const char * const file_names[] = {"mk_contexts", "rm_contexts",
- "monitor_on_DEPRECATED", "DEPRECATED"};
- const struct file_operations *fops[] = {&mk_contexts_fops,
- &rm_contexts_fops, &monitor_on_fops, &deprecated_fops};
- int i;
-
- dbgfs_root = debugfs_create_dir("damon", NULL);
-
- for (i = 0; i < ARRAY_SIZE(file_names); i++)
- debugfs_create_file(file_names[i], 0600, dbgfs_root, NULL,
- fops[i]);
- dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]);
-
- dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL);
- if (!dbgfs_dirs) {
- debugfs_remove(dbgfs_root);
- return -ENOMEM;
- }
- dbgfs_dirs[0] = dbgfs_root;
-
- return 0;
-}
-
-/*
- * Functions for the initialization
- */
-
-static int __init damon_dbgfs_init(void)
-{
- int rc = -ENOMEM;
-
- mutex_lock(&damon_dbgfs_lock);
- dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL);
- if (!dbgfs_ctxs)
- goto out;
- dbgfs_ctxs[0] = dbgfs_new_ctx();
- if (!dbgfs_ctxs[0]) {
- kfree(dbgfs_ctxs);
- goto out;
- }
- dbgfs_nr_ctxs = 1;
-
- rc = __damon_dbgfs_init();
- if (rc) {
- kfree(dbgfs_ctxs[0]);
- kfree(dbgfs_ctxs);
- pr_err("%s: dbgfs init failed\n", __func__);
- }
-
-out:
- mutex_unlock(&damon_dbgfs_lock);
- return rc;
-}
-
-module_init(damon_dbgfs_init);
-
-#include "tests/dbgfs-kunit.h"
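
[Editor's note] The hunk above removes the deprecated DAMON debugfs interface wholesale. For readers unfamiliar with the idiom being deleted, below is a minimal, self-contained sketch of the same debugfs pattern that dbgfs_fill_ctx_dir() and __damon_dbgfs_init() used: one directory plus a table of per-file file_operations. All names here are illustrative and not part of DAMON.

#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>

/* Trivial read handler, standing in for the dbgfs_*_read callbacks. */
static ssize_t example_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	static const char msg[] = "example\n";

	return simple_read_from_buffer(buf, count, ppos, msg, strlen(msg));
}

static const struct file_operations example_fops = {
	.owner = THIS_MODULE,
	.read = example_read,
};

static struct dentry *example_root;

static int __init example_init(void)
{
	static const char * const names[] = { "status", "config" };
	int i;

	/* One directory, then one file per entry in the name table. */
	example_root = debugfs_create_dir("example", NULL);
	for (i = 0; i < ARRAY_SIZE(names); i++)
		debugfs_create_file(names[i], 0600, example_root, NULL,
				    &example_fops);
	return 0;
}

static void __exit example_exit(void)
{
	debugfs_remove(example_root);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
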
diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c
index 7cf96574cde7..86d58f8c4f63 100644
--- a/mm/damon/modules-common.c
+++ b/mm/damon/modules-common.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Common Primitives for DAMON Modules
+ * Common Code for DAMON Modules
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h
index f49cdb417005..f103ad556368 100644
--- a/mm/damon/modules-common.h
+++ b/mm/damon/modules-common.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Common Primitives for DAMON Modules
+ * Common Code for DAMON Modules
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index d25d99cb5f2b..b43620fee6bb 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Common Primitives for Data Access Monitoring
+ * Common Code for Data Access Monitoring
*
* Author: SeongJae Park <sj@kernel.org>
*/
@@ -9,6 +9,8 @@
#include <linux/page_idle.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include "ops-common.h"
@@ -24,7 +26,7 @@ struct folio *damon_get_folio(unsigned long pfn)
struct page *page = pfn_to_online_page(pfn);
struct folio *folio;
- if (!page || PageTail(page))
+ if (!page)
return NULL;
folio = page_folio(page);
@@ -39,12 +41,29 @@ struct folio *damon_get_folio(unsigned long pfn)
void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr)
{
- struct folio *folio = damon_get_folio(pte_pfn(ptep_get(pte)));
+ pte_t pteval = ptep_get(pte);
+ struct folio *folio;
+ bool young = false;
+ unsigned long pfn;
+
+ if (likely(pte_present(pteval)))
+ pfn = pte_pfn(pteval);
+ else
+ pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+ folio = damon_get_folio(pfn);
if (!folio)
return;
- if (ptep_clear_young_notify(vma, addr, pte))
+ /*
+ * PFN swap PTEs, such as device-exclusive ones, that actually map pages
+ * are "old" from a CPU perspective. The MMU notifier takes care of any
+ * device aspects.
+ */
+ if (likely(pte_present(pteval)))
+ young |= ptep_test_and_clear_young(vma, addr, pte);
+ young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE);
+ if (young)
folio_set_young(folio);
folio_set_idle(folio);
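
[Editor's note] As the hunk above explains, damon_ptep_mkold() now resolves PFNs from non-present PFN swap PTEs (such as device-exclusive entries) as well as from present PTEs. A condensed sketch of just that extraction step, assuming kernel context; this is not the full function:

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>

/* Illustrative helper mirroring the PFN lookup added above. */
static unsigned long example_pte_to_pfn(pte_t pteval)
{
	/* Present PTEs carry the PFN directly. */
	if (likely(pte_present(pteval)))
		return pte_pfn(pteval);
	/*
	 * PFN swap PTEs still reference a page; recover the PFN from the
	 * encoded swap entry, as the patch does before damon_get_folio().
	 */
	return swp_offset_pfn(pte_to_swp_entry(pteval));
}
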
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index 18d837d11bce..cc9f5da9c012 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Common Primitives for Data Access Monitoring
+ * Common Code for Data Access Monitoring
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index a9ff35341d65..4102a8c5f992 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * DAMON Primitives for The Physical Address Space
+ * DAMON Code for The Physical Address Space
*
* Author: SeongJae Park <sj@kernel.org>
*/
@@ -92,12 +92,20 @@ static bool damon_folio_young_one(struct folio *folio,
{
bool *accessed = arg;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
+ pte_t pte;
*accessed = false;
while (page_vma_mapped_walk(&pvmw)) {
addr = pvmw.address;
if (pvmw.pte) {
- *accessed = pte_young(ptep_get(pvmw.pte)) ||
+ pte = ptep_get(pvmw.pte);
+
+ /*
+ * PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages are "old" from a CPU perspective.
+ * The MMU notifier takes care of any device aspects.
+ */
+ *accessed = (pte_present(pte) && pte_young(pte)) ||
!folio_test_idle(folio) ||
mmu_notifier_test_young(vma->vm_mm, addr);
} else {
@@ -198,16 +206,20 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
return max_nr_accesses;
}
-static bool __damos_pa_filter_out(struct damos_filter *filter,
+static bool damos_pa_filter_match(struct damos_filter *filter,
struct folio *folio)
{
bool matched = false;
struct mem_cgroup *memcg;
+ size_t folio_sz;
switch (filter->type) {
case DAMOS_FILTER_TYPE_ANON:
matched = folio_test_anon(folio);
break;
+ case DAMOS_FILTER_TYPE_ACTIVE:
+ matched = folio_test_active(folio);
+ break;
case DAMOS_FILTER_TYPE_MEMCG:
rcu_read_lock();
memcg = folio_memcg_check(folio);
@@ -222,6 +234,14 @@ static bool __damos_pa_filter_out(struct damos_filter *filter,
if (matched)
damon_folio_mkold(folio);
break;
+ case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE:
+ folio_sz = folio_size(folio);
+ matched = filter->sz_range.min <= folio_sz &&
+ folio_sz <= filter->sz_range.max;
+ break;
+ case DAMOS_FILTER_TYPE_UNMAPPED:
+ matched = !folio_mapped(folio) || !folio_raw_mapping(folio);
+ break;
default:
break;
}
@@ -236,42 +256,63 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio)
{
struct damos_filter *filter;
- damos_for_each_filter(filter, scheme) {
- if (__damos_pa_filter_out(filter, folio))
- return true;
+ if (scheme->core_filters_allowed)
+ return false;
+
+ damos_for_each_ops_filter(filter, scheme) {
+ if (damos_pa_filter_match(filter, folio))
+ return !filter->allow;
+ }
+ return scheme->ops_filters_default_reject;
+}
+
+static bool damon_pa_invalid_damos_folio(struct folio *folio, struct damos *s)
+{
+ if (!folio)
+ return true;
+ if (folio == s->last_applied) {
+ folio_put(folio);
+ return true;
}
return false;
}
-static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
+static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
+ unsigned long *sz_filter_passed)
{
unsigned long addr, applied;
LIST_HEAD(folio_list);
bool install_young_filter = true;
struct damos_filter *filter;
+ struct folio *folio;
/* check access in page level again by default */
- damos_for_each_filter(filter, s) {
+ damos_for_each_ops_filter(filter, s) {
if (filter->type == DAMOS_FILTER_TYPE_YOUNG) {
install_young_filter = false;
break;
}
}
if (install_young_filter) {
- filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, true);
+ filter = damos_new_filter(
+ DAMOS_FILTER_TYPE_YOUNG, true, false);
if (!filter)
return 0;
damos_add_filter(s, filter);
}
- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct folio *folio = damon_get_folio(PHYS_PFN(addr));
-
- if (!folio)
+ addr = r->ar.start;
+ while (addr < r->ar.end) {
+ folio = damon_get_folio(PHYS_PFN(addr));
+ if (damon_pa_invalid_damos_folio(folio, s)) {
+ addr += PAGE_SIZE;
continue;
+ }
if (damos_pa_filter_out(s, folio))
goto put_folio;
+ else
+ *sz_filter_passed += folio_size(folio);
folio_clear_referenced(folio);
folio_test_clear_young(folio);
@@ -282,28 +323,36 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
else
list_add(&folio->lru, &folio_list);
put_folio:
+ addr += folio_size(folio);
folio_put(folio);
}
if (install_young_filter)
damos_destroy_filter(filter);
applied = reclaim_pages(&folio_list);
cond_resched();
+ s->last_applied = folio;
return applied * PAGE_SIZE;
}
static inline unsigned long damon_pa_mark_accessed_or_deactivate(
- struct damon_region *r, struct damos *s, bool mark_accessed)
+ struct damon_region *r, struct damos *s, bool mark_accessed,
+ unsigned long *sz_filter_passed)
{
unsigned long addr, applied = 0;
+ struct folio *folio;
- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct folio *folio = damon_get_folio(PHYS_PFN(addr));
-
- if (!folio)
+ addr = r->ar.start;
+ while (addr < r->ar.end) {
+ folio = damon_get_folio(PHYS_PFN(addr));
+ if (damon_pa_invalid_damos_folio(folio, s)) {
+ addr += PAGE_SIZE;
continue;
+ }
if (damos_pa_filter_out(s, folio))
goto put_folio;
+ else
+ *sz_filter_passed += folio_size(folio);
if (mark_accessed)
folio_mark_accessed(folio);
@@ -311,21 +360,25 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate(
folio_deactivate(folio);
applied += folio_nr_pages(folio);
put_folio:
+ addr += folio_size(folio);
folio_put(folio);
}
+ s->last_applied = folio;
return applied * PAGE_SIZE;
}
static unsigned long damon_pa_mark_accessed(struct damon_region *r,
- struct damos *s)
+ struct damos *s, unsigned long *sz_filter_passed)
{
- return damon_pa_mark_accessed_or_deactivate(r, s, true);
+ return damon_pa_mark_accessed_or_deactivate(r, s, true,
+ sz_filter_passed);
}
static unsigned long damon_pa_deactivate_pages(struct damon_region *r,
- struct damos *s)
+ struct damos *s, unsigned long *sz_filter_passed)
{
- return damon_pa_mark_accessed_or_deactivate(r, s, false);
+ return damon_pa_mark_accessed_or_deactivate(r, s, false,
+ sz_filter_passed);
}
static unsigned int __damon_pa_migrate_folio_list(
@@ -449,48 +502,90 @@ static unsigned long damon_pa_migrate_pages(struct list_head *folio_list,
return nr_migrated;
}
-static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s)
+static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
+ unsigned long *sz_filter_passed)
{
unsigned long addr, applied;
LIST_HEAD(folio_list);
+ struct folio *folio;
- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct folio *folio = damon_get_folio(PHYS_PFN(addr));
-
- if (!folio)
+ addr = r->ar.start;
+ while (addr < r->ar.end) {
+ folio = damon_get_folio(PHYS_PFN(addr));
+ if (damon_pa_invalid_damos_folio(folio, s)) {
+ addr += PAGE_SIZE;
continue;
+ }
if (damos_pa_filter_out(s, folio))
goto put_folio;
+ else
+ *sz_filter_passed += folio_size(folio);
if (!folio_isolate_lru(folio))
goto put_folio;
list_add(&folio->lru, &folio_list);
put_folio:
+ addr += folio_size(folio);
folio_put(folio);
}
applied = damon_pa_migrate_pages(&folio_list, s->target_nid);
cond_resched();
+ s->last_applied = folio;
return applied * PAGE_SIZE;
}
+static bool damon_pa_scheme_has_filter(struct damos *s)
+{
+ struct damos_filter *f;
+
+ damos_for_each_ops_filter(f, s)
+ return true;
+ return false;
+}
+
+static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s,
+ unsigned long *sz_filter_passed)
+{
+ unsigned long addr;
+ struct folio *folio;
+
+ if (!damon_pa_scheme_has_filter(s))
+ return 0;
+
+ addr = r->ar.start;
+ while (addr < r->ar.end) {
+ folio = damon_get_folio(PHYS_PFN(addr));
+ if (damon_pa_invalid_damos_folio(folio, s)) {
+ addr += PAGE_SIZE;
+ continue;
+ }
+
+ if (!damos_pa_filter_out(s, folio))
+ *sz_filter_passed += folio_size(folio);
+ addr += folio_size(folio);
+ folio_put(folio);
+ }
+ s->last_applied = folio;
+ return 0;
+}
static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
- struct damos *scheme)
+ struct damos *scheme, unsigned long *sz_filter_passed)
{
switch (scheme->action) {
case DAMOS_PAGEOUT:
- return damon_pa_pageout(r, scheme);
+ return damon_pa_pageout(r, scheme, sz_filter_passed);
case DAMOS_LRU_PRIO:
- return damon_pa_mark_accessed(r, scheme);
+ return damon_pa_mark_accessed(r, scheme, sz_filter_passed);
case DAMOS_LRU_DEPRIO:
- return damon_pa_deactivate_pages(r, scheme);
+ return damon_pa_deactivate_pages(r, scheme, sz_filter_passed);
case DAMOS_MIGRATE_HOT:
case DAMOS_MIGRATE_COLD:
- return damon_pa_migrate(r, scheme);
+ return damon_pa_migrate(r, scheme, sz_filter_passed);
case DAMOS_STAT:
- break;
+ return damon_pa_stat(r, scheme, sz_filter_passed);
default:
/* DAMOS actions that not yet supported by 'paddr'. */
break;
@@ -528,7 +623,6 @@ static int __init damon_pa_initcall(void)
.update = NULL,
.prepare_access_checks = damon_pa_prepare_access_checks,
.check_accesses = damon_pa_check_accesses,
- .reset_aggregated = NULL,
.target_valid = NULL,
.cleanup = NULL,
.apply_scheme = damon_pa_apply_scheme,
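
[Editor's note] The paddr.c changes above rework damos_pa_filter_out(): each operations-layer filter now carries an allow flag, the first matching filter decides, and ops_filters_default_reject supplies the fallback when nothing matches (the core_filters_allowed short-circuit shown in the hunk is omitted here). A hypothetical plain-C model of just that decision, using mock types rather than kernel code:

#include <stdbool.h>
#include <stdio.h>

struct mock_filter {
	bool matches;	/* would be damos_pa_filter_match() in the kernel */
	bool allow;	/* the new 'allow' field from the hunk above */
};

/* First matching ops filter decides; otherwise fall back to the default. */
static bool filter_out(const struct mock_filter *filters, int nr,
		       bool default_reject)
{
	for (int i = 0; i < nr; i++) {
		if (filters[i].matches)
			return !filters[i].allow;
	}
	return default_reject;
}

int main(void)
{
	struct mock_filter f[] = {
		{ .matches = false, .allow = false },
		{ .matches = true,  .allow = true  },
	};

	/* Second filter matches and allows, so the folio is not filtered out. */
	printf("filtered out: %d\n", filter_out(f, 2, true));
	return 0;
}
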
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 9e0077a9404e..a675150965e0 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -221,7 +221,7 @@ static int damon_reclaim_apply_parameters(void)
}
if (skip_anon) {
- filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
+ filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false);
if (!filter)
goto out;
damos_add_filter(scheme, filter);
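
[Editor's note] The reclaim.c hunk above only adapts to the new damos_new_filter() signature, which now takes an explicit allow flag in addition to the type and matching arguments. A short sketch of building a reject-style filter with it, assuming kernel context; the helper name is illustrative:

#include <linux/damon.h>
#include <linux/errno.h>

/* Sketch only: build and attach a "reject anonymous pages" filter. */
static int example_add_skip_anon_filter(struct damos *scheme)
{
	struct damos_filter *filter;

	/* type, matching, allow: match anon folios and reject them. */
	filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false);
	if (!filter)
		return -ENOMEM;
	damos_add_filter(scheme, filter);
	return 0;
}
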
diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c
index 70edf45c2174..ffaf285e241a 100644
--- a/mm/damon/sysfs-common.c
+++ b/mm/damon/sysfs-common.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Common Primitives for DAMON Sysfs Interface
+ * Common Code for DAMON Sysfs Interface
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
index 9a18f3c535d3..2099adee11d0 100644
--- a/mm/damon/sysfs-common.h
+++ b/mm/damon/sysfs-common.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Common Primitives for DAMON Sysfs Interface
+ * Common Code for DAMON Sysfs Interface
*
* Author: SeongJae Park <sj@kernel.org>
*/
@@ -45,19 +45,13 @@ void damon_sysfs_schemes_update_stats(
struct damon_sysfs_schemes *sysfs_schemes,
struct damon_ctx *ctx);
-int damon_sysfs_schemes_update_regions_start(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx, bool total_bytes_only);
-
-void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx);
-
-bool damos_sysfs_regions_upd_done(void);
-
-int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx);
+void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx, struct damon_target *t,
+ struct damon_region *r, struct damos *s,
+ bool total_bytes_only, unsigned long sz_filter_passed);
int damon_sysfs_schemes_clear_regions(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx);
+ struct damon_sysfs_schemes *sysfs_schemes);
int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes,
struct damon_ctx *ctx);
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index b095457380b5..30ae7518ffbf 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -19,6 +19,7 @@ struct damon_sysfs_scheme_region {
struct damon_addr_range ar;
unsigned int nr_accesses;
unsigned int age;
+ unsigned long sz_filter_passed;
struct list_head list;
};
@@ -74,6 +75,15 @@ static ssize_t age_show(struct kobject *kobj, struct kobj_attribute *attr,
return sysfs_emit(buf, "%u\n", region->age);
}
+static ssize_t sz_filter_passed_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_region *region = container_of(kobj,
+ struct damon_sysfs_scheme_region, kobj);
+
+ return sysfs_emit(buf, "%lu\n", region->sz_filter_passed);
+}
+
static void damon_sysfs_scheme_region_release(struct kobject *kobj)
{
struct damon_sysfs_scheme_region *region = container_of(kobj,
@@ -95,11 +105,15 @@ static struct kobj_attribute damon_sysfs_scheme_region_nr_accesses_attr =
static struct kobj_attribute damon_sysfs_scheme_region_age_attr =
__ATTR_RO_MODE(age, 0400);
+static struct kobj_attribute damon_sysfs_scheme_region_sz_filter_passed_attr =
+ __ATTR_RO_MODE(sz_filter_passed, 0400);
+
static struct attribute *damon_sysfs_scheme_region_attrs[] = {
&damon_sysfs_scheme_region_start_attr.attr,
&damon_sysfs_scheme_region_end_attr.attr,
&damon_sysfs_scheme_region_nr_accesses_attr.attr,
&damon_sysfs_scheme_region_age_attr.attr,
+ &damon_sysfs_scheme_region_sz_filter_passed_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_scheme_region);
@@ -114,55 +128,11 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = {
* scheme regions directory
*/
-/*
- * enum damos_sysfs_regions_upd_status - Represent DAMOS tried regions update
- * status
- * @DAMOS_TRIED_REGIONS_UPD_IDLE: Waiting for next request.
- * @DAMOS_TRIED_REGIONS_UPD_STARTED: Update started.
- * @DAMOS_TRIED_REGIONS_UPD_FINISHED: Update finished.
- *
- * Each DAMON-based operation scheme (&struct damos) has its own apply
- * interval, and we need to expose the scheme tried regions based on only
- * single snapshot. For this, we keep the tried regions update status for each
- * scheme. The status becomes 'idle' at the beginning.
- *
- * Once the tried regions update request is received, the request handling
- * start function (damon_sysfs_scheme_update_regions_start()) sets the status
- * of all schemes as 'idle' again, and register ->before_damos_apply()
- * callback.
- *
- * Then, the first followup ->before_damos_apply() callback
- * (damon_sysfs_before_damos_apply()) sets the status 'started'. The first
- * ->after_sampling() or ->after_aggregation() callback
- * (damon_sysfs_cmd_request_callback()) after the call is called only after
- * the scheme is completely applied to the given snapshot. Hence the callback
- * knows the situation by showing 'started' status, and sets the status as
- * 'finished'. Then, damon_sysfs_before_damos_apply() understands the
- * situation by showing the 'finished' status and do nothing.
- *
- * If DAMOS is not applied to any region due to any reasons including the
- * access pattern, the watermarks, the quotas, and the filters,
- * ->before_damos_apply() will not be called back. Until the situation is
- * changed, the update will not be finished. To avoid this,
- * damon_sysfs_after_sampling() set the status as 'finished' if more than two
- * apply intervals of the scheme is passed while the state is 'idle'.
- *
- * Finally, the tried regions request handling finisher function
- * (damon_sysfs_schemes_update_regions_stop()) unregisters the callbacks.
- */
-enum damos_sysfs_regions_upd_status {
- DAMOS_TRIED_REGIONS_UPD_IDLE,
- DAMOS_TRIED_REGIONS_UPD_STARTED,
- DAMOS_TRIED_REGIONS_UPD_FINISHED,
-};
-
struct damon_sysfs_scheme_regions {
struct kobject kobj;
struct list_head regions_list;
int nr_regions;
unsigned long total_bytes;
- enum damos_sysfs_regions_upd_status upd_status;
- unsigned long upd_timeout_jiffies;
};
static struct damon_sysfs_scheme_regions *
@@ -178,7 +148,6 @@ damon_sysfs_scheme_regions_alloc(void)
INIT_LIST_HEAD(&regions->regions_list);
regions->nr_regions = 0;
regions->total_bytes = 0;
- regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE;
return regions;
}
@@ -233,6 +202,7 @@ struct damon_sysfs_stats {
unsigned long sz_tried;
unsigned long nr_applied;
unsigned long sz_applied;
+ unsigned long sz_ops_filter_passed;
unsigned long qt_exceeds;
};
@@ -277,6 +247,15 @@ static ssize_t sz_applied_show(struct kobject *kobj,
return sysfs_emit(buf, "%lu\n", stats->sz_applied);
}
+static ssize_t sz_ops_filter_passed_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->sz_ops_filter_passed);
+}
+
static ssize_t qt_exceeds_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -303,6 +282,9 @@ static struct kobj_attribute damon_sysfs_stats_nr_applied_attr =
static struct kobj_attribute damon_sysfs_stats_sz_applied_attr =
__ATTR_RO_MODE(sz_applied, 0400);
+static struct kobj_attribute damon_sysfs_stats_sz_ops_filter_passed_attr =
+ __ATTR_RO_MODE(sz_ops_filter_passed, 0400);
+
static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr =
__ATTR_RO_MODE(qt_exceeds, 0400);
@@ -311,6 +293,7 @@ static struct attribute *damon_sysfs_stats_attrs[] = {
&damon_sysfs_stats_sz_tried_attr.attr,
&damon_sysfs_stats_nr_applied_attr.attr,
&damon_sysfs_stats_sz_applied_attr.attr,
+ &damon_sysfs_stats_sz_ops_filter_passed_attr.attr,
&damon_sysfs_stats_qt_exceeds_attr.attr,
NULL,
};
@@ -326,25 +309,46 @@ static const struct kobj_type damon_sysfs_stats_ktype = {
* filter directory
*/
+/*
+ * enum damos_sysfs_filter_handle_layer - Layers handling filters of a dir.
+ */
+enum damos_sysfs_filter_handle_layer {
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_CORE,
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_OPS,
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_BOTH,
+};
+
struct damon_sysfs_scheme_filter {
struct kobject kobj;
+ enum damos_sysfs_filter_handle_layer handle_layer;
enum damos_filter_type type;
bool matching;
+ bool allow;
char *memcg_path;
struct damon_addr_range addr_range;
+ struct damon_size_range sz_range;
int target_idx;
};
-static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void)
+static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(
+ enum damos_sysfs_filter_handle_layer layer)
{
- return kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL);
+ struct damon_sysfs_scheme_filter *filter;
+
+ filter = kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL);
+ if (filter)
+ filter->handle_layer = layer;
+ return filter;
}
/* Should match with enum damos_filter_type */
static const char * const damon_sysfs_scheme_filter_type_strs[] = {
"anon",
+ "active",
"memcg",
"young",
+ "hugepage_size",
+ "unmapped",
"addr",
"target",
};
@@ -359,6 +363,23 @@ static ssize_t type_show(struct kobject *kobj,
damon_sysfs_scheme_filter_type_strs[filter->type]);
}
+static bool damos_sysfs_scheme_filter_valid_type(
+ enum damos_sysfs_filter_handle_layer layer,
+ enum damos_filter_type type)
+{
+ switch (layer) {
+ case DAMOS_SYSFS_FILTER_HANDLE_LAYER_BOTH:
+ return true;
+ case DAMOS_SYSFS_FILTER_HANDLE_LAYER_CORE:
+ return !damos_filter_for_ops(type);
+ case DAMOS_SYSFS_FILTER_HANDLE_LAYER_OPS:
+ return damos_filter_for_ops(type);
+ default:
+ break;
+ }
+ return false;
+}
+
static ssize_t type_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
@@ -370,6 +391,9 @@ static ssize_t type_store(struct kobject *kobj,
for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) {
if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[
type])) {
+ if (!damos_sysfs_scheme_filter_valid_type(
+ filter->handle_layer, type))
+ break;
filter->type = type;
ret = count;
break;
@@ -402,6 +426,30 @@ static ssize_t matching_store(struct kobject *kobj,
return count;
}
+static ssize_t allow_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%c\n", filter->allow ? 'Y' : 'N');
+}
+
+static ssize_t allow_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ bool allow;
+ int err = kstrtobool(buf, &allow);
+
+ if (err)
+ return err;
+
+ filter->allow = allow;
+ return count;
+}
+
static ssize_t memcg_path_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -417,12 +465,14 @@ static ssize_t memcg_path_store(struct kobject *kobj,
{
struct damon_sysfs_scheme_filter *filter = container_of(kobj,
struct damon_sysfs_scheme_filter, kobj);
- char *path = kmalloc(sizeof(*path) * (count + 1), GFP_KERNEL);
+ char *path = kmalloc_array(size_add(count, 1), sizeof(*path),
+ GFP_KERNEL);
if (!path)
return -ENOMEM;
strscpy(path, buf, count + 1);
+ kfree(filter->memcg_path);
filter->memcg_path = path;
return count;
}
@@ -465,6 +515,44 @@ static ssize_t addr_end_store(struct kobject *kobj,
return err ? err : count;
}
+static ssize_t min_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%lu\n", filter->sz_range.min);
+}
+
+static ssize_t min_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ int err = kstrtoul(buf, 0, &filter->sz_range.min);
+
+ return err ? err : count;
+}
+
+static ssize_t max_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%lu\n", filter->sz_range.max);
+}
+
+static ssize_t max_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ int err = kstrtoul(buf, 0, &filter->sz_range.max);
+
+ return err ? err : count;
+}
+
static ssize_t damon_target_idx_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -499,6 +587,9 @@ static struct kobj_attribute damon_sysfs_scheme_filter_type_attr =
static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr =
__ATTR_RW_MODE(matching, 0600);
+static struct kobj_attribute damon_sysfs_scheme_filter_allow_attr =
+ __ATTR_RW_MODE(allow, 0600);
+
static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr =
__ATTR_RW_MODE(memcg_path, 0600);
@@ -508,15 +599,24 @@ static struct kobj_attribute damon_sysfs_scheme_filter_addr_start_attr =
static struct kobj_attribute damon_sysfs_scheme_filter_addr_end_attr =
__ATTR_RW_MODE(addr_end, 0600);
+static struct kobj_attribute damon_sysfs_scheme_filter_min_attr =
+ __ATTR_RW_MODE(min, 0600);
+
+static struct kobj_attribute damon_sysfs_scheme_filter_max_attr =
+ __ATTR_RW_MODE(max, 0600);
+
static struct kobj_attribute damon_sysfs_scheme_filter_damon_target_idx_attr =
__ATTR_RW_MODE(damon_target_idx, 0600);
static struct attribute *damon_sysfs_scheme_filter_attrs[] = {
&damon_sysfs_scheme_filter_type_attr.attr,
&damon_sysfs_scheme_filter_matching_attr.attr,
+ &damon_sysfs_scheme_filter_allow_attr.attr,
&damon_sysfs_scheme_filter_memcg_path_attr.attr,
&damon_sysfs_scheme_filter_addr_start_attr.attr,
&damon_sysfs_scheme_filter_addr_end_attr.attr,
+ &damon_sysfs_scheme_filter_min_attr.attr,
+ &damon_sysfs_scheme_filter_max_attr.attr,
&damon_sysfs_scheme_filter_damon_target_idx_attr.attr,
NULL,
};
@@ -534,14 +634,20 @@ static const struct kobj_type damon_sysfs_scheme_filter_ktype = {
struct damon_sysfs_scheme_filters {
struct kobject kobj;
+ enum damos_sysfs_filter_handle_layer handle_layer;
struct damon_sysfs_scheme_filter **filters_arr;
int nr;
};
static struct damon_sysfs_scheme_filters *
-damon_sysfs_scheme_filters_alloc(void)
+damon_sysfs_scheme_filters_alloc(enum damos_sysfs_filter_handle_layer layer)
{
- return kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL);
+ struct damon_sysfs_scheme_filters *filters;
+
+ filters = kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL);
+ if (filters)
+ filters->handle_layer = layer;
+ return filters;
}
static void damon_sysfs_scheme_filters_rm_dirs(
@@ -574,7 +680,8 @@ static int damon_sysfs_scheme_filters_add_dirs(
filters->filters_arr = filters_arr;
for (i = 0; i < nr_filters; i++) {
- filter = damon_sysfs_scheme_filter_alloc();
+ filter = damon_sysfs_scheme_filter_alloc(
+ filters->handle_layer);
if (!filter) {
damon_sysfs_scheme_filters_rm_dirs(filters);
return -ENOMEM;
@@ -831,12 +938,15 @@ struct damos_sysfs_quota_goal {
enum damos_quota_goal_metric metric;
unsigned long target_value;
unsigned long current_value;
+ int nid;
};
-/* This should match with enum damos_action */
+/* This should match with enum damos_quota_goal_metric */
static const char * const damos_sysfs_quota_goal_metric_strs[] = {
"user_input",
"some_mem_psi_us",
+ "node_mem_used_bp",
+ "node_mem_free_bp",
};
static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void)
@@ -909,6 +1019,28 @@ static ssize_t current_value_store(struct kobject *kobj,
return err ? err : count;
}
+static ssize_t nid_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damos_sysfs_quota_goal *goal = container_of(kobj, struct
+ damos_sysfs_quota_goal, kobj);
+
+ /* todo: return error if the goal is not using nid */
+
+ return sysfs_emit(buf, "%d\n", goal->nid);
+}
+
+static ssize_t nid_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damos_sysfs_quota_goal *goal = container_of(kobj, struct
+ damos_sysfs_quota_goal, kobj);
+ int err = kstrtoint(buf, 0, &goal->nid);
+
+ /* feed callback should check existence of this file and read value */
+ return err ? err : count;
+}
+
static void damos_sysfs_quota_goal_release(struct kobject *kobj)
{
/* or, notify this release to the feed callback */
@@ -924,10 +1056,14 @@ static struct kobj_attribute damos_sysfs_quota_goal_target_value_attr =
static struct kobj_attribute damos_sysfs_quota_goal_current_value_attr =
__ATTR_RW_MODE(current_value, 0600);
+static struct kobj_attribute damos_sysfs_quota_goal_nid_attr =
+ __ATTR_RW_MODE(nid, 0600);
+
static struct attribute *damos_sysfs_quota_goal_attrs[] = {
&damos_sysfs_quota_goal_target_metric_attr.attr,
&damos_sysfs_quota_goal_target_value_attr.attr,
&damos_sysfs_quota_goal_current_value_attr.attr,
+ &damos_sysfs_quota_goal_nid_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damos_sysfs_quota_goal);
@@ -1367,7 +1503,7 @@ static int damon_sysfs_access_pattern_add_range_dir(
if (!range)
return -ENOMEM;
err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype,
- &access_pattern->kobj, name);
+ &access_pattern->kobj, "%s", name);
if (err)
kobject_put(&range->kobj);
else
@@ -1443,6 +1579,8 @@ struct damon_sysfs_scheme {
unsigned long apply_interval_us;
struct damon_sysfs_quotas *quotas;
struct damon_sysfs_watermarks *watermarks;
+ struct damon_sysfs_scheme_filters *core_filters;
+ struct damon_sysfs_scheme_filters *ops_filters;
struct damon_sysfs_scheme_filters *filters;
struct damon_sysfs_stats *stats;
struct damon_sysfs_scheme_regions *tried_regions;
@@ -1543,21 +1681,53 @@ static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme)
return err;
}
-static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme)
+static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme,
+ enum damos_sysfs_filter_handle_layer layer, const char *name,
+ struct damon_sysfs_scheme_filters **filters_ptr)
{
struct damon_sysfs_scheme_filters *filters =
- damon_sysfs_scheme_filters_alloc();
+ damon_sysfs_scheme_filters_alloc(layer);
int err;
if (!filters)
return -ENOMEM;
err = kobject_init_and_add(&filters->kobj,
&damon_sysfs_scheme_filters_ktype, &scheme->kobj,
- "filters");
+ "%s", name);
if (err)
kobject_put(&filters->kobj);
else
- scheme->filters = filters;
+ *filters_ptr = filters;
+ return err;
+}
+
+static int damos_sysfs_set_filter_dirs(struct damon_sysfs_scheme *scheme)
+{
+ int err;
+
+ err = damon_sysfs_scheme_set_filters(scheme,
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_BOTH, "filters",
+ &scheme->filters);
+ if (err)
+ return err;
+ err = damon_sysfs_scheme_set_filters(scheme,
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_CORE, "core_filters",
+ &scheme->core_filters);
+ if (err)
+ goto put_filters_out;
+ err = damon_sysfs_scheme_set_filters(scheme,
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_OPS, "ops_filters",
+ &scheme->ops_filters);
+ if (err)
+ goto put_core_filters_out;
+ return 0;
+
+put_core_filters_out:
+ kobject_put(&scheme->core_filters->kobj);
+ scheme->core_filters = NULL;
+put_filters_out:
+ kobject_put(&scheme->filters->kobj);
+ scheme->filters = NULL;
return err;
}
@@ -1609,7 +1779,7 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
err = damon_sysfs_scheme_set_watermarks(scheme);
if (err)
goto put_quotas_access_pattern_out;
- err = damon_sysfs_scheme_set_filters(scheme);
+ err = damos_sysfs_set_filter_dirs(scheme);
if (err)
goto put_watermarks_quotas_access_pattern_out;
err = damon_sysfs_scheme_set_stats(scheme);
@@ -1624,6 +1794,10 @@ put_tried_regions_out:
kobject_put(&scheme->tried_regions->kobj);
scheme->tried_regions = NULL;
put_filters_watermarks_quotas_access_pattern_out:
+ kobject_put(&scheme->ops_filters->kobj);
+ scheme->ops_filters = NULL;
+ kobject_put(&scheme->core_filters->kobj);
+ scheme->core_filters = NULL;
kobject_put(&scheme->filters->kobj);
scheme->filters = NULL;
put_watermarks_quotas_access_pattern_out:
@@ -1647,6 +1821,10 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme)
kobject_put(&scheme->watermarks->kobj);
damon_sysfs_scheme_filters_rm_dirs(scheme->filters);
kobject_put(&scheme->filters->kobj);
+ damon_sysfs_scheme_filters_rm_dirs(scheme->core_filters);
+ kobject_put(&scheme->core_filters->kobj);
+ damon_sysfs_scheme_filters_rm_dirs(scheme->ops_filters);
+ kobject_put(&scheme->ops_filters->kobj);
kobject_put(&scheme->stats->kobj);
damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions);
kobject_put(&scheme->tried_regions->kobj);
@@ -1888,7 +2066,7 @@ static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id)
if (!memcg_path)
return -EINVAL;
- path = kmalloc(sizeof(*path) * PATH_MAX, GFP_KERNEL);
+ path = kmalloc_array(PATH_MAX, sizeof(*path), GFP_KERNEL);
if (!path)
return -ENOMEM;
@@ -1918,7 +2096,8 @@ static int damon_sysfs_add_scheme_filters(struct damos *scheme,
sysfs_filters->filters_arr[i];
struct damos_filter *filter =
damos_new_filter(sysfs_filter->type,
- sysfs_filter->matching);
+ sysfs_filter->matching,
+ sysfs_filter->allow);
int err;
if (!filter)
@@ -1940,6 +2119,13 @@ static int damon_sysfs_add_scheme_filters(struct damos *scheme,
filter->addr_range = sysfs_filter->addr_range;
} else if (filter->type == DAMOS_FILTER_TYPE_TARGET) {
filter->target_idx = sysfs_filter->target_idx;
+ } else if (filter->type == DAMOS_FILTER_TYPE_HUGEPAGE_SIZE) {
+ if (sysfs_filter->sz_range.min >
+ sysfs_filter->sz_range.max) {
+ damos_destroy_filter(filter);
+ return -EINVAL;
+ }
+ filter->sz_range = sysfs_filter->sz_range;
}
damos_add_filter(scheme, filter);
@@ -1965,8 +2151,17 @@ static int damos_sysfs_add_quota_score(
sysfs_goal->target_value);
if (!goal)
return -ENOMEM;
- if (sysfs_goal->metric == DAMOS_QUOTA_USER_INPUT)
+ switch (sysfs_goal->metric) {
+ case DAMOS_QUOTA_USER_INPUT:
goal->current_value = sysfs_goal->current_value;
+ break;
+ case DAMOS_QUOTA_NODE_MEM_USED_BP:
+ case DAMOS_QUOTA_NODE_MEM_FREE_BP:
+ goal->nid = sysfs_goal->nid;
+ break;
+ default:
+ break;
+ }
damos_add_quota_goal(quota, goal);
}
return 0;
@@ -2035,8 +2230,6 @@ static struct damos *damon_sysfs_mk_scheme(
struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
- struct damon_sysfs_scheme_filters *sysfs_filters =
- sysfs_scheme->filters;
struct damos *scheme;
int err;
@@ -2076,7 +2269,17 @@ static struct damos *damon_sysfs_mk_scheme(
return NULL;
}
- err = damon_sysfs_add_scheme_filters(scheme, sysfs_filters);
+ err = damon_sysfs_add_scheme_filters(scheme, sysfs_scheme->core_filters);
+ if (err) {
+ damon_destroy_scheme(scheme);
+ return NULL;
+ }
+ err = damon_sysfs_add_scheme_filters(scheme, sysfs_scheme->ops_filters);
+ if (err) {
+ damon_destroy_scheme(scheme);
+ return NULL;
+ }
+ err = damon_sysfs_add_scheme_filters(scheme, sysfs_scheme->filters);
if (err) {
damon_destroy_scheme(scheme);
return NULL;
@@ -2122,32 +2325,32 @@ void damon_sysfs_schemes_update_stats(
sysfs_stats->sz_tried = scheme->stat.sz_tried;
sysfs_stats->nr_applied = scheme->stat.nr_applied;
sysfs_stats->sz_applied = scheme->stat.sz_applied;
+ sysfs_stats->sz_ops_filter_passed =
+ scheme->stat.sz_ops_filter_passed;
sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds;
}
}
-/*
- * damon_sysfs_schemes that need to update its schemes regions dir. Protected
- * by damon_sysfs_lock
- */
-static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback;
-static int damon_sysfs_schemes_region_idx;
-static bool damos_regions_upd_total_bytes_only;
-
-/*
- * DAMON callback that called before damos apply. While this callback is
- * registered, damon_sysfs_lock should be held to ensure the regions
- * directories exist.
+/**
+ * damos_sysfs_populate_region_dir() - Populate a schemes tried region dir.
+ * @sysfs_schemes: Schemes directory to populate regions directory.
+ * @ctx: Corresponding DAMON context.
+ * @t: DAMON target of @r.
+ * @r: DAMON region to populate the directory for.
+ * @s: Corresponding scheme.
+ * @total_bytes_only: Whether the request is for bytes update only.
+ * @sz_filter_passed: Bytes of @r that passed filters of @s.
+ *
+ * Called from DAMOS walk callback while holding damon_sysfs_lock.
*/
-static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
- struct damon_target *t, struct damon_region *r,
- struct damos *s)
+void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx, struct damon_target *t,
+ struct damon_region *r, struct damos *s, bool total_bytes_only,
+ unsigned long sz_filter_passed)
{
struct damos *scheme;
struct damon_sysfs_scheme_regions *sysfs_regions;
struct damon_sysfs_scheme_region *region;
- struct damon_sysfs_schemes *sysfs_schemes =
- damon_sysfs_schemes_for_damos_callback;
int schemes_idx = 0;
damon_for_each_scheme(scheme, ctx) {
@@ -2158,152 +2361,39 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
/* user could have removed the scheme sysfs dir */
if (schemes_idx >= sysfs_schemes->nr)
- return 0;
+ return;
sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions;
- if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_FINISHED)
- return 0;
- if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_IDLE)
- sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_STARTED;
sysfs_regions->total_bytes += r->ar.end - r->ar.start;
- if (damos_regions_upd_total_bytes_only)
- return 0;
+ if (total_bytes_only)
+ return;
region = damon_sysfs_scheme_region_alloc(r);
if (!region)
- return 0;
+ return;
+ region->sz_filter_passed = sz_filter_passed;
list_add_tail(&region->list, &sysfs_regions->regions_list);
sysfs_regions->nr_regions++;
if (kobject_init_and_add(&region->kobj,
&damon_sysfs_scheme_region_ktype,
&sysfs_regions->kobj, "%d",
- damon_sysfs_schemes_region_idx++)) {
+ sysfs_regions->nr_regions++)) {
kobject_put(&region->kobj);
}
- return 0;
}
-/*
- * DAMON callback that called after each accesses sampling. While this
- * callback is registered, damon_sysfs_lock should be held to ensure the
- * regions directories exist.
- */
-void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx)
+int damon_sysfs_schemes_clear_regions(
+ struct damon_sysfs_schemes *sysfs_schemes)
{
- struct damon_sysfs_schemes *sysfs_schemes =
- damon_sysfs_schemes_for_damos_callback;
- struct damon_sysfs_scheme_regions *sysfs_regions;
int i;
for (i = 0; i < sysfs_schemes->nr; i++) {
- sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions;
- if (sysfs_regions->upd_status ==
- DAMOS_TRIED_REGIONS_UPD_STARTED ||
- time_after(jiffies,
- sysfs_regions->upd_timeout_jiffies))
- sysfs_regions->upd_status =
- DAMOS_TRIED_REGIONS_UPD_FINISHED;
- }
-}
-
-/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
-int damon_sysfs_schemes_clear_regions(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx)
-{
- struct damos *scheme;
- int schemes_idx = 0;
-
- damon_for_each_scheme(scheme, ctx) {
struct damon_sysfs_scheme *sysfs_scheme;
- /* user could have removed the scheme sysfs dir */
- if (schemes_idx >= sysfs_schemes->nr)
- break;
-
- sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++];
+ sysfs_scheme = sysfs_schemes->schemes_arr[i];
damon_sysfs_scheme_regions_rm_dirs(
sysfs_scheme->tried_regions);
sysfs_scheme->tried_regions->total_bytes = 0;
}
return 0;
}
-
-static struct damos *damos_sysfs_nth_scheme(int n, struct damon_ctx *ctx)
-{
- struct damos *scheme;
- int i = 0;
-
- damon_for_each_scheme(scheme, ctx) {
- if (i == n)
- return scheme;
- i++;
- }
- return NULL;
-}
-
-static void damos_tried_regions_init_upd_status(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx)
-{
- int i;
- struct damos *scheme;
- struct damon_sysfs_scheme_regions *sysfs_regions;
-
- for (i = 0; i < sysfs_schemes->nr; i++) {
- sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions;
- scheme = damos_sysfs_nth_scheme(i, ctx);
- if (!scheme) {
- sysfs_regions->upd_status =
- DAMOS_TRIED_REGIONS_UPD_FINISHED;
- continue;
- }
- sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE;
- sysfs_regions->upd_timeout_jiffies = jiffies +
- 2 * usecs_to_jiffies(scheme->apply_interval_us ?
- scheme->apply_interval_us :
- ctx->attrs.aggr_interval);
- }
-}
-
-/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
-int damon_sysfs_schemes_update_regions_start(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx, bool total_bytes_only)
-{
- damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx);
- damon_sysfs_schemes_for_damos_callback = sysfs_schemes;
- damos_tried_regions_init_upd_status(sysfs_schemes, ctx);
- damos_regions_upd_total_bytes_only = total_bytes_only;
- ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply;
- return 0;
-}
-
-bool damos_sysfs_regions_upd_done(void)
-{
- struct damon_sysfs_schemes *sysfs_schemes =
- damon_sysfs_schemes_for_damos_callback;
- struct damon_sysfs_scheme_regions *sysfs_regions;
- int i;
-
- for (i = 0; i < sysfs_schemes->nr; i++) {
- sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions;
- if (sysfs_regions->upd_status !=
- DAMOS_TRIED_REGIONS_UPD_FINISHED)
- return false;
- }
- return true;
-}
-
-/*
- * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller
- * should unlock damon_sysfs_lock which held before
- * damon_sysfs_schemes_update_regions_start()
- */
-int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx)
-{
- damon_sysfs_schemes_for_damos_callback = NULL;
- ctx->callback.before_damos_apply = NULL;
- damon_sysfs_schemes_region_idx = 0;
- return 0;
-}
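
[Editor's note] The sysfs-schemes.c changes above replace the callback/status machinery with damos_sysfs_populate_region_dir(), whose kernel-doc and signature are shown in the hunk. A hypothetical caller sketch, just to show how the parameters line up; the real caller is outside this section and the wrapper name is invented:

#include <linux/damon.h>
#include "sysfs-common.h"

/*
 * Hypothetical walk-callback wrapper: forwards one tried region of scheme
 * @s to the populate helper. total_bytes_only=false also creates the
 * per-region directory; sz_filter_passed feeds the new sysfs file.
 */
static void example_populate_one_region(
		struct damon_sysfs_schemes *sysfs_schemes,
		struct damon_ctx *ctx, struct damon_target *t,
		struct damon_region *r, struct damos *s,
		unsigned long sz_filter_passed)
{
	damos_sysfs_populate_region_dir(sysfs_schemes, ctx, t, r, s,
			false, sz_filter_passed);
}
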
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 58145d59881d..1af6aff35d84 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -409,6 +409,164 @@ static const struct kobj_type damon_sysfs_targets_ktype = {
};
/*
+ * intervals goal directory
+ */
+
+struct damon_sysfs_intervals_goal {
+ struct kobject kobj;
+ unsigned long access_bp;
+ unsigned long aggrs;
+ unsigned long min_sample_us;
+ unsigned long max_sample_us;
+};
+
+static struct damon_sysfs_intervals_goal *damon_sysfs_intervals_goal_alloc(
+ unsigned long access_bp, unsigned long aggrs,
+ unsigned long min_sample_us, unsigned long max_sample_us)
+{
+ struct damon_sysfs_intervals_goal *goal = kmalloc(sizeof(*goal),
+ GFP_KERNEL);
+
+ if (!goal)
+ return NULL;
+
+ goal->kobj = (struct kobject){};
+ goal->access_bp = access_bp;
+ goal->aggrs = aggrs;
+ goal->min_sample_us = min_sample_us;
+ goal->max_sample_us = max_sample_us;
+ return goal;
+}
+
+static ssize_t access_bp_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+
+ return sysfs_emit(buf, "%lu\n", goal->access_bp);
+}
+
+static ssize_t access_bp_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+ unsigned long nr;
+ int err = kstrtoul(buf, 0, &nr);
+
+ if (err)
+ return err;
+
+ goal->access_bp = nr;
+ return count;
+}
+
+static ssize_t aggrs_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+
+ return sysfs_emit(buf, "%lu\n", goal->aggrs);
+}
+
+static ssize_t aggrs_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+ unsigned long nr;
+ int err = kstrtoul(buf, 0, &nr);
+
+ if (err)
+ return err;
+
+ goal->aggrs = nr;
+ return count;
+}
+
+static ssize_t min_sample_us_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+
+ return sysfs_emit(buf, "%lu\n", goal->min_sample_us);
+}
+
+static ssize_t min_sample_us_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+ unsigned long nr;
+ int err = kstrtoul(buf, 0, &nr);
+
+ if (err)
+ return err;
+
+ goal->min_sample_us = nr;
+ return count;
+}
+
+static ssize_t max_sample_us_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+
+ return sysfs_emit(buf, "%lu\n", goal->max_sample_us);
+}
+
+static ssize_t max_sample_us_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+ unsigned long nr;
+ int err = kstrtoul(buf, 0, &nr);
+
+ if (err)
+ return err;
+
+ goal->max_sample_us = nr;
+ return count;
+}
+
+static void damon_sysfs_intervals_goal_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_intervals_goal, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_intervals_goal_access_bp_attr =
+ __ATTR_RW_MODE(access_bp, 0600);
+
+static struct kobj_attribute damon_sysfs_intervals_goal_aggrs_attr =
+ __ATTR_RW_MODE(aggrs, 0600);
+
+static struct kobj_attribute damon_sysfs_intervals_goal_min_sample_us_attr =
+ __ATTR_RW_MODE(min_sample_us, 0600);
+
+static struct kobj_attribute damon_sysfs_intervals_goal_max_sample_us_attr =
+ __ATTR_RW_MODE(max_sample_us, 0600);
+
+static struct attribute *damon_sysfs_intervals_goal_attrs[] = {
+ &damon_sysfs_intervals_goal_access_bp_attr.attr,
+ &damon_sysfs_intervals_goal_aggrs_attr.attr,
+ &damon_sysfs_intervals_goal_min_sample_us_attr.attr,
+ &damon_sysfs_intervals_goal_max_sample_us_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_intervals_goal);
+
+static const struct kobj_type damon_sysfs_intervals_goal_ktype = {
+ .release = damon_sysfs_intervals_goal_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_intervals_goal_groups,
+};
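
The four show/store pairs above all follow the same kobj_attribute pattern. A minimal sketch of that pattern, assuming it sits next to the code above so the kernel helpers resolve; "struct foo", "val" and "foo_val_attr" are illustrative names, not part of this patch:

struct foo {
	struct kobject kobj;
	unsigned long val;
};

static ssize_t val_show(struct kobject *kobj, struct kobj_attribute *attr,
		char *buf)
{
	struct foo *foo = container_of(kobj, struct foo, kobj);

	/* sysfs_emit() bounds the output to PAGE_SIZE */
	return sysfs_emit(buf, "%lu\n", foo->val);
}

static ssize_t val_store(struct kobject *kobj, struct kobj_attribute *attr,
		const char *buf, size_t count)
{
	struct foo *foo = container_of(kobj, struct foo, kobj);
	unsigned long nr;
	int err = kstrtoul(buf, 0, &nr);

	if (err)
		return err;
	foo->val = nr;
	/* returning 'count' tells sysfs the whole write was consumed */
	return count;
}

static struct kobj_attribute foo_val_attr = __ATTR_RW_MODE(val, 0600);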
+
+/*
* intervals directory
*/
@@ -417,6 +575,7 @@ struct damon_sysfs_intervals {
unsigned long sample_us;
unsigned long aggr_us;
unsigned long update_us;
+ struct damon_sysfs_intervals_goal *intervals_goal;
};
static struct damon_sysfs_intervals *damon_sysfs_intervals_alloc(
@@ -436,6 +595,32 @@ static struct damon_sysfs_intervals *damon_sysfs_intervals_alloc(
return intervals;
}
+static int damon_sysfs_intervals_add_dirs(struct damon_sysfs_intervals *intervals)
+{
+ struct damon_sysfs_intervals_goal *goal;
+ int err;
+
+ goal = damon_sysfs_intervals_goal_alloc(0, 0, 0, 0);
+ if (!goal)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(&goal->kobj,
+ &damon_sysfs_intervals_goal_ktype, &intervals->kobj,
+ "intervals_goal");
+ if (err) {
+ kobject_put(&goal->kobj);
+ intervals->intervals_goal = NULL;
+ return err;
+ }
+ intervals->intervals_goal = goal;
+ return 0;
+}
+
+static void damon_sysfs_intervals_rm_dirs(struct damon_sysfs_intervals *intervals)
+{
+ kobject_put(&intervals->intervals_goal->kobj);
+}
+
static ssize_t sample_us_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -571,6 +756,9 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs)
"intervals");
if (err)
goto put_intervals_out;
+ err = damon_sysfs_intervals_add_dirs(intervals);
+ if (err)
+ goto put_intervals_out;
attrs->intervals = intervals;
nr_regions_range = damon_sysfs_ul_range_alloc(10, 1000);
@@ -599,6 +787,7 @@ put_intervals_out:
static void damon_sysfs_attrs_rm_dirs(struct damon_sysfs_attrs *attrs)
{
kobject_put(&attrs->nr_regions_range->kobj);
+ damon_sysfs_intervals_rm_dirs(attrs->intervals);
kobject_put(&attrs->intervals->kobj);
}
@@ -1025,6 +1214,11 @@ enum damon_sysfs_cmd {
*/
DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS,
/*
+ * @DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS: Update the tuned monitoring
+ * intervals.
+ */
+ DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS,
+ /*
* @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands.
*/
NR_DAMON_SYSFS_CMDS,
@@ -1041,27 +1235,9 @@ static const char * const damon_sysfs_cmd_strs[] = {
"update_schemes_tried_regions",
"clear_schemes_tried_regions",
"update_schemes_effective_quotas",
+ "update_tuned_intervals",
};
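
The new "update_tuned_intervals" string lines up by index with DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS in the enum above. A hedged sketch of how a written command string could be mapped back to the enum with sysfs_streq(); this is only illustrative, not the file's actual parser:

static int example_cmd_from_str(const char *buf, enum damon_sysfs_cmd *cmd)
{
	int c;

	for (c = 0; c < NR_DAMON_SYSFS_CMDS; c++) {
		/* sysfs_streq() tolerates the trailing newline from "echo" */
		if (sysfs_streq(buf, damon_sysfs_cmd_strs[c])) {
			*cmd = c;
			return 0;
		}
	}
	return -EINVAL;
}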
-/*
- * struct damon_sysfs_cmd_request - A request to the DAMON callback.
- * @cmd: The command that needs to be handled by the callback.
- * @kdamond: The kobject wrapper that associated to the kdamond thread.
- *
- * This structure represents a sysfs command request that need to access some
- * DAMON context-internal data. Because DAMON context-internal data can be
- * safely accessed from DAMON callbacks without additional synchronization, the
- * request will be handled by the DAMON callback. None-``NULL`` @kdamond means
- * the request is valid.
- */
-struct damon_sysfs_cmd_request {
- enum damon_sysfs_cmd cmd;
- struct damon_sysfs_kdamond *kdamond;
-};
-
-/* Current DAMON callback request. Protected by damon_sysfs_lock. */
-static struct damon_sysfs_cmd_request damon_sysfs_cmd_request;
-
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -1084,11 +1260,18 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
struct damon_sysfs_attrs *sys_attrs)
{
struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals;
+ struct damon_sysfs_intervals_goal *sys_goal =
+ sys_intervals->intervals_goal;
struct damon_sysfs_ul_range *sys_nr_regions =
sys_attrs->nr_regions_range;
struct damon_attrs attrs = {
.sample_interval = sys_intervals->sample_us,
.aggr_interval = sys_intervals->aggr_us,
+ .intervals_goal = {
+ .access_bp = sys_goal->access_bp,
+ .aggrs = sys_goal->aggrs,
+ .min_sample_us = sys_goal->min_sample_us,
+ .max_sample_us = sys_goal->max_sample_us},
.ops_update_interval = sys_intervals->update_us,
.min_nr_regions = sys_nr_regions->min,
.max_nr_regions = sys_nr_regions->max,
@@ -1181,25 +1364,9 @@ static int damon_sysfs_add_targets(struct damon_ctx *ctx,
return 0;
}
-static bool damon_sysfs_schemes_regions_updating;
-
static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
{
struct damon_target *t, *next;
- struct damon_sysfs_kdamond *kdamond;
- enum damon_sysfs_cmd cmd;
-
- /* damon_sysfs_schemes_update_regions_stop() might not yet called */
- kdamond = damon_sysfs_cmd_request.kdamond;
- cmd = damon_sysfs_cmd_request.cmd;
- if (kdamond && ctx == kdamond->damon_ctx &&
- (cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS ||
- cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES) &&
- damon_sysfs_schemes_regions_updating) {
- damon_sysfs_schemes_update_regions_stop(ctx);
- damon_sysfs_schemes_regions_updating = false;
- mutex_unlock(&damon_sysfs_lock);
- }
if (!damon_target_has_pid(ctx))
return;
@@ -1214,57 +1381,24 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
/*
* damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files.
- * @kdamond: The kobject wrapper that associated to the kdamond thread.
+ * @data: The kobject wrapper that is associated with the kdamond thread.
*
* This function reads the schemes stats of specific kdamond and update the
* related values for sysfs files. This function should be called from DAMON
- * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON
- * contexts-internal data and DAMON sysfs variables.
+ * worker thread, to safely access the DAMON contexts-internal data. The
+ * caller should also hold ``damon_sysfs_lock`` and ensure that ->damon_ctx of
+ * @data is a valid, non-NULL pointer, to safely access DAMON sysfs variables.
*/
-static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond)
+static int damon_sysfs_upd_schemes_stats(void *data)
{
+ struct damon_sysfs_kdamond *kdamond = data;
struct damon_ctx *ctx = kdamond->damon_ctx;
- if (!ctx)
- return -EINVAL;
damon_sysfs_schemes_update_stats(
kdamond->contexts->contexts_arr[0]->schemes, ctx);
return 0;
}
-static int damon_sysfs_upd_schemes_regions_start(
- struct damon_sysfs_kdamond *kdamond, bool total_bytes_only)
-{
- struct damon_ctx *ctx = kdamond->damon_ctx;
-
- if (!ctx)
- return -EINVAL;
- return damon_sysfs_schemes_update_regions_start(
- kdamond->contexts->contexts_arr[0]->schemes, ctx,
- total_bytes_only);
-}
-
-static int damon_sysfs_upd_schemes_regions_stop(
- struct damon_sysfs_kdamond *kdamond)
-{
- struct damon_ctx *ctx = kdamond->damon_ctx;
-
- if (!ctx)
- return -EINVAL;
- return damon_sysfs_schemes_update_regions_stop(ctx);
-}
-
-static int damon_sysfs_clear_schemes_regions(
- struct damon_sysfs_kdamond *kdamond)
-{
- struct damon_ctx *ctx = kdamond->damon_ctx;
-
- if (!ctx)
- return -EINVAL;
- return damon_sysfs_schemes_clear_regions(
- kdamond->contexts->contexts_arr[0]->schemes, ctx);
-}
-
static inline bool damon_sysfs_kdamond_running(
struct damon_sysfs_kdamond *kdamond)
{
@@ -1296,11 +1430,12 @@ static struct damon_ctx *damon_sysfs_build_ctx(
* damon_sysfs_commit_input() - Commit user inputs to a running kdamond.
* @kdamond: The kobject wrapper for the associated kdamond.
*
- * If the sysfs input is wrong, the kdamond will be terminated.
+ * Returns error if the sysfs input is wrong.
*/
-static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond)
+static int damon_sysfs_commit_input(void *data)
{
- struct damon_ctx *param_ctx;
+ struct damon_sysfs_kdamond *kdamond = data;
+ struct damon_ctx *param_ctx, *test_ctx;
int err;
if (!damon_sysfs_kdamond_running(kdamond))
@@ -1312,15 +1447,23 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond)
param_ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]);
if (IS_ERR(param_ctx))
return PTR_ERR(param_ctx);
+ test_ctx = damon_new_ctx();
+ err = damon_commit_ctx(test_ctx, param_ctx);
+ if (err) {
+ damon_sysfs_destroy_targets(test_ctx);
+ damon_destroy_ctx(test_ctx);
+ goto out;
+ }
err = damon_commit_ctx(kdamond->damon_ctx, param_ctx);
+out:
damon_sysfs_destroy_targets(param_ctx);
damon_destroy_ctx(param_ctx);
return err;
}
-static int damon_sysfs_commit_schemes_quota_goals(
- struct damon_sysfs_kdamond *sysfs_kdamond)
+static int damon_sysfs_commit_schemes_quota_goals(void *data)
{
+ struct damon_sysfs_kdamond *sysfs_kdamond = data;
struct damon_ctx *ctx;
struct damon_sysfs_context *sysfs_ctx;
@@ -1338,128 +1481,33 @@ static int damon_sysfs_commit_schemes_quota_goals(
/*
* damon_sysfs_upd_schemes_effective_quotas() - Update schemes effective quotas
* sysfs files.
- * @kdamond: The kobject wrapper that associated to the kdamond thread.
+ * @data: The kobject wrapper that is associated with the kdamond thread.
*
* This function reads the schemes' effective quotas of specific kdamond and
* update the related values for sysfs files. This function should be called
* from DAMON callbacks while holding ``damon_syfs_lock``, to safely access the
* DAMON contexts-internal data and DAMON sysfs variables.
*/
-static int damon_sysfs_upd_schemes_effective_quotas(
- struct damon_sysfs_kdamond *kdamond)
+static int damon_sysfs_upd_schemes_effective_quotas(void *data)
{
+ struct damon_sysfs_kdamond *kdamond = data;
struct damon_ctx *ctx = kdamond->damon_ctx;
- if (!ctx)
- return -EINVAL;
damos_sysfs_update_effective_quotas(
kdamond->contexts->contexts_arr[0]->schemes, ctx);
return 0;
}
-
-/*
- * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests.
- * @c: The DAMON context of the callback.
- * @active: Whether @c is not deactivated due to watermarks.
- * @after_aggr: Whether this is called from after_aggregation() callback.
- *
- * This function is periodically called back from the kdamond thread for @c.
- * Then, it checks if there is a waiting DAMON sysfs request and handles it.
- */
-static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active,
- bool after_aggregation)
-{
- struct damon_sysfs_kdamond *kdamond;
- bool total_bytes_only = false;
- int err = 0;
-
- /* avoid deadlock due to concurrent state_store('off') */
- if (!damon_sysfs_schemes_regions_updating &&
- !mutex_trylock(&damon_sysfs_lock))
- return 0;
- kdamond = damon_sysfs_cmd_request.kdamond;
- if (!kdamond || kdamond->damon_ctx != c)
- goto out;
- switch (damon_sysfs_cmd_request.cmd) {
- case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS:
- err = damon_sysfs_upd_schemes_stats(kdamond);
- break;
- case DAMON_SYSFS_CMD_COMMIT:
- if (!after_aggregation)
- goto out;
- err = damon_sysfs_commit_input(kdamond);
- break;
- case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS:
- err = damon_sysfs_commit_schemes_quota_goals(kdamond);
- break;
- case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES:
- total_bytes_only = true;
- fallthrough;
- case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS:
- if (!damon_sysfs_schemes_regions_updating) {
- err = damon_sysfs_upd_schemes_regions_start(kdamond,
- total_bytes_only);
- if (!err) {
- damon_sysfs_schemes_regions_updating = true;
- goto keep_lock_out;
- }
- } else {
- damos_sysfs_mark_finished_regions_updates(c);
- /*
- * Continue regions updating if DAMON is till
- * active and the update for all schemes is not
- * finished.
- */
- if (active && !damos_sysfs_regions_upd_done())
- goto keep_lock_out;
- err = damon_sysfs_upd_schemes_regions_stop(kdamond);
- damon_sysfs_schemes_regions_updating = false;
- }
- break;
- case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS:
- err = damon_sysfs_clear_schemes_regions(kdamond);
- break;
- case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS:
- err = damon_sysfs_upd_schemes_effective_quotas(kdamond);
- break;
- default:
- break;
- }
- /* Mark the request as invalid now. */
- damon_sysfs_cmd_request.kdamond = NULL;
-out:
- if (!damon_sysfs_schemes_regions_updating)
- mutex_unlock(&damon_sysfs_lock);
-keep_lock_out:
- return err;
-}
-
-static int damon_sysfs_after_wmarks_check(struct damon_ctx *c)
+static int damon_sysfs_upd_tuned_intervals(void *data)
{
- /*
- * after_wmarks_check() is called back while the context is deactivated
- * by watermarks.
- */
- return damon_sysfs_cmd_request_callback(c, false, false);
-}
-
-static int damon_sysfs_after_sampling(struct damon_ctx *c)
-{
- /*
- * after_sampling() is called back only while the context is not
- * deactivated by watermarks.
- */
- return damon_sysfs_cmd_request_callback(c, true, false);
-}
+ struct damon_sysfs_kdamond *kdamond = data;
+ struct damon_ctx *ctx = kdamond->damon_ctx;
-static int damon_sysfs_after_aggregation(struct damon_ctx *c)
-{
- /*
- * after_aggregation() is called back only while the context is not
- * deactivated by watermarks.
- */
- return damon_sysfs_cmd_request_callback(c, true, true);
+ kdamond->contexts->contexts_arr[0]->attrs->intervals->sample_us =
+ ctx->attrs.sample_interval;
+ kdamond->contexts->contexts_arr[0]->attrs->intervals->aggr_us =
+ ctx->attrs.aggr_interval;
+ return 0;
}
static struct damon_ctx *damon_sysfs_build_ctx(
@@ -1477,9 +1525,6 @@ static struct damon_ctx *damon_sysfs_build_ctx(
return ERR_PTR(err);
}
- ctx->callback.after_wmarks_check = damon_sysfs_after_wmarks_check;
- ctx->callback.after_sampling = damon_sysfs_after_sampling;
- ctx->callback.after_aggregation = damon_sysfs_after_aggregation;
ctx->callback.before_terminate = damon_sysfs_before_terminate;
return ctx;
}
@@ -1491,8 +1536,6 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
if (damon_sysfs_kdamond_running(kdamond))
return -EBUSY;
- if (damon_sysfs_cmd_request.kdamond == kdamond)
- return -EBUSY;
/* TODO: support multiple contexts per kdamond */
if (kdamond->contexts->nr != 1)
return -EINVAL;
@@ -1525,63 +1568,102 @@ static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond)
*/
}
+static int damon_sysfs_damon_call(int (*fn)(void *data),
+ struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_call_control call_control = {};
+
+ if (!kdamond->damon_ctx)
+ return -EINVAL;
+ call_control.fn = fn;
+ call_control.data = kdamond;
+ return damon_call(kdamond->damon_ctx, &call_control);
+}
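
damon_sysfs_damon_call() wraps the damon_call() API: the work is described by a struct damon_call_control and executed from the kdamond worker context. A small usage sketch reusing the damon_sysfs_upd_tuned_intervals() handler defined above; only the wrapper function name is made up:

static int example_update_tuned_intervals(struct damon_sysfs_kdamond *kdamond)
{
	/*
	 * damon_call() runs damon_sysfs_upd_tuned_intervals(kdamond) from the
	 * DAMON worker thread, so the handler can read ctx->attrs without
	 * racing against the monitoring loop.
	 */
	return damon_sysfs_damon_call(damon_sysfs_upd_tuned_intervals, kdamond);
}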
+
+struct damon_sysfs_schemes_walk_data {
+ struct damon_sysfs_kdamond *sysfs_kdamond;
+ bool total_bytes_only;
+};
+
+/* populate the region directory */
+static void damon_sysfs_schemes_tried_regions_upd_one(void *data, struct damon_ctx *ctx,
+ struct damon_target *t, struct damon_region *r,
+ struct damos *s, unsigned long sz_filter_passed)
+{
+ struct damon_sysfs_schemes_walk_data *walk_data = data;
+ struct damon_sysfs_kdamond *sysfs_kdamond = walk_data->sysfs_kdamond;
+
+ damos_sysfs_populate_region_dir(
+ sysfs_kdamond->contexts->contexts_arr[0]->schemes,
+ ctx, t, r, s, walk_data->total_bytes_only,
+ sz_filter_passed);
+}
+
+static int damon_sysfs_update_schemes_tried_regions(
+ struct damon_sysfs_kdamond *sysfs_kdamond, bool total_bytes_only)
+{
+ struct damon_sysfs_schemes_walk_data walk_data = {
+ .sysfs_kdamond = sysfs_kdamond,
+ .total_bytes_only = total_bytes_only,
+ };
+ struct damos_walk_control control = {
+ .walk_fn = damon_sysfs_schemes_tried_regions_upd_one,
+ .data = &walk_data,
+ };
+ struct damon_ctx *ctx = sysfs_kdamond->damon_ctx;
+
+ if (!ctx)
+ return -EINVAL;
+
+ damon_sysfs_schemes_clear_regions(
+ sysfs_kdamond->contexts->contexts_arr[0]->schemes);
+ return damos_walk(ctx, &control);
+}
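
damos_walk() drives the given walk_fn once for each region a scheme was applied to, from the kdamond context, which is what lets the sysfs code populate the tried_regions directories without a separate callback protocol. A hedged sketch of a minimal walker that merely counts regions; all example_* names are made up:

struct example_walk_counter {
	unsigned long nr_regions;
};

/* called by damos_walk() for each region a scheme was applied to */
static void example_count_region(void *data, struct damon_ctx *ctx,
		struct damon_target *t, struct damon_region *r,
		struct damos *s, unsigned long sz_filter_passed)
{
	struct example_walk_counter *counter = data;

	counter->nr_regions++;
}

static int example_count_scheme_regions(struct damon_ctx *ctx)
{
	struct example_walk_counter counter = {};
	struct damos_walk_control control = {
		.walk_fn = example_count_region,
		.data = &counter,
	};
	int err = damos_walk(ctx, &control);

	return err ? err : (int)counter.nr_regions;
}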
+
/*
* damon_sysfs_handle_cmd() - Handle a command for a specific kdamond.
* @cmd: The command to handle.
* @kdamond: The kobject wrapper for the associated kdamond.
*
- * This function handles a DAMON sysfs command for a kdamond. For commands
- * that need to access running DAMON context-internal data, it requests
- * handling of the command to the DAMON callback
- * (@damon_sysfs_cmd_request_callback()) and wait until it is properly handled,
- * or the context is completed.
+ * This function handles a DAMON sysfs command for a kdamond.
*
* Return: 0 on success, negative error code otherwise.
*/
static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd,
struct damon_sysfs_kdamond *kdamond)
{
- bool need_wait = true;
-
- /* Handle commands that doesn't access DAMON context-internal data */
switch (cmd) {
case DAMON_SYSFS_CMD_ON:
return damon_sysfs_turn_damon_on(kdamond);
case DAMON_SYSFS_CMD_OFF:
return damon_sysfs_turn_damon_off(kdamond);
+ case DAMON_SYSFS_CMD_COMMIT:
+ return damon_sysfs_damon_call(
+ damon_sysfs_commit_input, kdamond);
+ case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS:
+ return damon_sysfs_damon_call(
+ damon_sysfs_commit_schemes_quota_goals,
+ kdamond);
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS:
+ return damon_sysfs_damon_call(
+ damon_sysfs_upd_schemes_stats, kdamond);
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES:
+ return damon_sysfs_update_schemes_tried_regions(kdamond, true);
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS:
+ return damon_sysfs_update_schemes_tried_regions(kdamond, false);
+ case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS:
+ return damon_sysfs_schemes_clear_regions(
+ kdamond->contexts->contexts_arr[0]->schemes);
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS:
+ return damon_sysfs_damon_call(
+ damon_sysfs_upd_schemes_effective_quotas,
+ kdamond);
+ case DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS:
+ return damon_sysfs_damon_call(
+ damon_sysfs_upd_tuned_intervals, kdamond);
default:
- break;
- }
-
- /* Pass the command to DAMON callback for safe DAMON context access */
- if (damon_sysfs_cmd_request.kdamond)
- return -EBUSY;
- if (!damon_sysfs_kdamond_running(kdamond))
return -EINVAL;
- damon_sysfs_cmd_request.cmd = cmd;
- damon_sysfs_cmd_request.kdamond = kdamond;
-
- /*
- * wait until damon_sysfs_cmd_request_callback() handles the request
- * from kdamond context
- */
- mutex_unlock(&damon_sysfs_lock);
- while (need_wait) {
- schedule_timeout_idle(msecs_to_jiffies(100));
- if (!mutex_trylock(&damon_sysfs_lock))
- continue;
- if (!damon_sysfs_cmd_request.kdamond) {
- /* damon_sysfs_cmd_request_callback() handled */
- need_wait = false;
- } else if (!damon_sysfs_kdamond_running(kdamond)) {
- /* kdamond has already finished */
- need_wait = false;
- damon_sysfs_cmd_request.kdamond = NULL;
- }
- mutex_unlock(&damon_sysfs_lock);
}
- mutex_lock(&damon_sysfs_lock);
- return 0;
}
static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1693,8 +1775,7 @@ static bool damon_sysfs_kdamonds_busy(struct damon_sysfs_kdamond **kdamonds,
int i;
for (i = 0; i < nr_kdamonds; i++) {
- if (damon_sysfs_kdamond_running(kdamonds[i]) ||
- damon_sysfs_cmd_request.kdamond == kdamonds[i])
+ if (damon_sysfs_kdamond_running(kdamonds[i]))
return true;
}
diff --git a/mm/damon/tests/.kunitconfig b/mm/damon/tests/.kunitconfig
index a73be044fc9b..36a450f57b58 100644
--- a/mm/damon/tests/.kunitconfig
+++ b/mm/damon/tests/.kunitconfig
@@ -13,10 +13,3 @@ CONFIG_DAMON_VADDR_KUNIT_TEST=y
CONFIG_SYSFS=y
CONFIG_DAMON_SYSFS=y
CONFIG_DAMON_SYSFS_KUNIT_TEST=y
-
-# for DAMON debugfs interface
-CONFIG_DEBUG_FS=y
-CONFIG_DAMON_PADDR=y
-CONFIG_DAMON_DBGFS_DEPRECATED=y
-CONFIG_DAMON_DBGFS=y
-CONFIG_DAMON_DBGFS_KUNIT_TEST=y
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index cf22e09a3507..298c67557fae 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -348,19 +348,19 @@ static void damon_test_update_monitoring_result(struct kunit *test)
new_attrs = (struct damon_attrs){
.sample_interval = 100, .aggr_interval = 10000,};
- damon_update_monitoring_result(r, &old_attrs, &new_attrs);
+ damon_update_monitoring_result(r, &old_attrs, &new_attrs, false);
KUNIT_EXPECT_EQ(test, r->nr_accesses, 15);
KUNIT_EXPECT_EQ(test, r->age, 2);
new_attrs = (struct damon_attrs){
.sample_interval = 1, .aggr_interval = 1000};
- damon_update_monitoring_result(r, &old_attrs, &new_attrs);
+ damon_update_monitoring_result(r, &old_attrs, &new_attrs, false);
KUNIT_EXPECT_EQ(test, r->nr_accesses, 150);
KUNIT_EXPECT_EQ(test, r->age, 2);
new_attrs = (struct damon_attrs){
.sample_interval = 1, .aggr_interval = 100};
- damon_update_monitoring_result(r, &old_attrs, &new_attrs);
+ damon_update_monitoring_result(r, &old_attrs, &new_attrs, false);
KUNIT_EXPECT_EQ(test, r->nr_accesses, 150);
KUNIT_EXPECT_EQ(test, r->age, 20);
@@ -411,7 +411,7 @@ static void damos_test_new_filter(struct kunit *test)
{
struct damos_filter *filter;
- filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
+ filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false);
KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON);
KUNIT_EXPECT_EQ(test, filter->matching, true);
KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list);
@@ -425,7 +425,7 @@ static void damos_test_filter_out(struct kunit *test)
struct damon_region *r, *r2;
struct damos_filter *f;
- f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true);
+ f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true, false);
f->addr_range = (struct damon_addr_range){
.start = DAMON_MIN_REGION * 2, .end = DAMON_MIN_REGION * 6};
@@ -434,25 +434,25 @@ static void damos_test_filter_out(struct kunit *test)
damon_add_region(r, t);
/* region in the range */
- KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region before the range */
r->ar.start = DAMON_MIN_REGION * 1;
r->ar.end = DAMON_MIN_REGION * 2;
- KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region after the range */
r->ar.start = DAMON_MIN_REGION * 6;
r->ar.end = DAMON_MIN_REGION * 8;
- KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region started before the range */
r->ar.start = DAMON_MIN_REGION * 1;
r->ar.end = DAMON_MIN_REGION * 4;
- KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
/* filter should have split the region */
KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1);
KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2);
@@ -465,7 +465,7 @@ static void damos_test_filter_out(struct kunit *test)
/* region started in the range */
r->ar.start = DAMON_MIN_REGION * 2;
r->ar.end = DAMON_MIN_REGION * 8;
- KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f));
/* filter should have split the region */
KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2);
KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6);
@@ -510,6 +510,75 @@ static void damon_test_feed_loop_next_input(struct kunit *test)
damon_feed_loop_next_input(last_input, 2000));
}
+static void damon_test_set_filters_default_reject(struct kunit *test)
+{
+ struct damos scheme;
+ struct damos_filter *target_filter, *anon_filter;
+
+ INIT_LIST_HEAD(&scheme.filters);
+ INIT_LIST_HEAD(&scheme.ops_filters);
+
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * No filter is installed. Allow by default on both core and ops layer
+ * filtering stages, since there are no filters at all.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
+
+ target_filter = damos_new_filter(DAMOS_FILTER_TYPE_TARGET, true, true);
+ damos_add_filter(&scheme, target_filter);
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled allow-filter is installed.
+ * Reject by default on core layer filtering stage due to the last
+ * core-layer-filter's behavior.
+ * Allow by default on ops layer filtering stage due to the absence of
+ * ops layer filters.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, true);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
+
+ target_filter->allow = false;
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled reject-filter is installed.
+ * Allow by default on core layer filtering stage due to the last
+ * core-layer-filter's behavior.
+ * Allow by default on ops layer filtering stage due to the absence of
+ * ops layer filters.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
+
+ anon_filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true);
+ damos_add_filter(&scheme, anon_filter);
+
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled reject-filter and ops-handled allow-filter are installed.
+ * Allow by default on core layer filtering stage due to the existence
+ * of the ops-handled filter.
+ * Reject by default on ops layer filtering stage due to the last
+ * ops-layer-filter's behavior.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true);
+
+ target_filter->allow = true;
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled allow-filter and ops-handled allow-filter are
+ * installed.
+ * Allow by default on core layer filtering stage due to the existence
+ * of the ops-handled filter.
+ * Reject by default on ops layer filtering stage due to the last
+ * ops-layer-filter's behavior.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true);
+}
+
static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_target),
KUNIT_CASE(damon_test_regions),
@@ -527,6 +596,7 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damos_test_new_filter),
KUNIT_CASE(damos_test_filter_out),
KUNIT_CASE(damon_test_feed_loop_next_input),
+ KUNIT_CASE(damon_test_set_filters_default_reject),
{},
};
diff --git a/mm/damon/tests/dbgfs-kunit.h b/mm/damon/tests/dbgfs-kunit.h
deleted file mode 100644
index 087e53f641a8..000000000000
--- a/mm/damon/tests/dbgfs-kunit.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * DAMON Debugfs Interface Unit Tests
- *
- * Author: SeongJae Park <sj@kernel.org>
- */
-
-#ifdef CONFIG_DAMON_DBGFS_KUNIT_TEST
-
-#ifndef _DAMON_DBGFS_TEST_H
-#define _DAMON_DBGFS_TEST_H
-
-#include <kunit/test.h>
-
-static void damon_dbgfs_test_str_to_ints(struct kunit *test)
-{
- char *question;
- int *answers;
- int expected[] = {12, 35, 46};
- ssize_t nr_integers = 0, i;
-
- question = "123";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers);
- KUNIT_EXPECT_EQ(test, 123, answers[0]);
- kfree(answers);
-
- question = "123abc";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers);
- KUNIT_EXPECT_EQ(test, 123, answers[0]);
- kfree(answers);
-
- question = "a123";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
- kfree(answers);
-
- question = "12 35";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers);
- for (i = 0; i < nr_integers; i++)
- KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
- kfree(answers);
-
- question = "12 35 46";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers);
- for (i = 0; i < nr_integers; i++)
- KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
- kfree(answers);
-
- question = "12 35 abc 46";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers);
- for (i = 0; i < 2; i++)
- KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
- kfree(answers);
-
- question = "";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
- kfree(answers);
-
- question = "\n";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
- kfree(answers);
-}
-
-static void damon_dbgfs_test_set_targets(struct kunit *test)
-{
- struct damon_ctx *ctx = dbgfs_new_ctx();
- char buf[64];
-
- if (!damon_is_registered_ops(DAMON_OPS_PADDR)) {
- dbgfs_destroy_ctx(ctx);
- kunit_skip(test, "PADDR not registered");
- }
-
- /* Make DAMON consider target has no pid */
- damon_select_ops(ctx, DAMON_OPS_PADDR);
-
- dbgfs_set_targets(ctx, 0, NULL);
- sprint_target_ids(ctx, buf, 64);
- KUNIT_EXPECT_STREQ(test, (char *)buf, "\n");
-
- dbgfs_set_targets(ctx, 1, NULL);
- sprint_target_ids(ctx, buf, 64);
- KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n");
-
- dbgfs_set_targets(ctx, 0, NULL);
- sprint_target_ids(ctx, buf, 64);
- KUNIT_EXPECT_STREQ(test, (char *)buf, "\n");
-
- dbgfs_destroy_ctx(ctx);
-}
-
-static void damon_dbgfs_test_set_init_regions(struct kunit *test)
-{
- struct damon_ctx *ctx = damon_new_ctx();
- /* Each line represents one region in ``<target idx> <start> <end>`` */
- char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45",
- "1 10 20\n",
- "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n",
- ""};
- /* Reading the file again will show sorted, clean output */
- char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n",
- "1 10 20\n",
- "0 39 59\n0 70 134\n1 10 20\n1 20 25\n",
- ""};
- char * const invalid_inputs[] = {"3 10 20\n", /* target not exists */
- "1 10 20\n 1 14 26\n", /* regions overlap */
- "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */
- char *input, *expect;
- int i, rc;
- char buf[256];
-
- if (!damon_is_registered_ops(DAMON_OPS_PADDR)) {
- damon_destroy_ctx(ctx);
- kunit_skip(test, "PADDR not registered");
- }
-
- damon_select_ops(ctx, DAMON_OPS_PADDR);
-
- dbgfs_set_targets(ctx, 3, NULL);
-
- /* Put valid inputs and check the results */
- for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) {
- input = valid_inputs[i];
- expect = valid_expects[i];
-
- rc = set_init_regions(ctx, input, strnlen(input, 256));
- KUNIT_EXPECT_EQ(test, rc, 0);
-
- memset(buf, 0, 256);
- sprint_init_regions(ctx, buf, 256);
-
- KUNIT_EXPECT_STREQ(test, (char *)buf, expect);
- }
- /* Put invalid inputs and check the return error code */
- for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) {
- input = invalid_inputs[i];
- pr_info("input: %s\n", input);
- rc = set_init_regions(ctx, input, strnlen(input, 256));
- KUNIT_EXPECT_EQ(test, rc, -EINVAL);
-
- memset(buf, 0, 256);
- sprint_init_regions(ctx, buf, 256);
-
- KUNIT_EXPECT_STREQ(test, (char *)buf, "");
- }
-
- dbgfs_set_targets(ctx, 0, NULL);
- damon_destroy_ctx(ctx);
-}
-
-static struct kunit_case damon_test_cases[] = {
- KUNIT_CASE(damon_dbgfs_test_str_to_ints),
- KUNIT_CASE(damon_dbgfs_test_set_targets),
- KUNIT_CASE(damon_dbgfs_test_set_init_regions),
- {},
-};
-
-static struct kunit_suite damon_test_suite = {
- .name = "damon-dbgfs",
- .test_cases = damon_test_cases,
-};
-kunit_test_suite(damon_test_suite);
-
-#endif /* _DAMON_DBGFS_TEST_H */
-
-#endif /* CONFIG_DAMON_KUNIT_TEST */
diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h
index b9fe3bc8472b..7cd944266a92 100644
--- a/mm/damon/tests/vaddr-kunit.h
+++ b/mm/damon/tests/vaddr-kunit.h
@@ -68,7 +68,7 @@ static void damon_test_three_regions_in_vmas(struct kunit *test)
static struct mm_struct mm;
struct damon_addr_range regions[3] = {0};
/* 10-20-25, 200-210-220, 300-305, 307-330 */
- struct vm_area_struct vmas[] = {
+ static struct vm_area_struct vmas[] = {
(struct vm_area_struct) {.vm_start = 10, .vm_end = 20},
(struct vm_area_struct) {.vm_start = 20, .vm_end = 25},
(struct vm_area_struct) {.vm_start = 200, .vm_end = 210},
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index b9eaa20b73b9..46554e49a478 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * DAMON Primitives for Virtual Address Spaces
+ * DAMON Code for Virtual Address Spaces
*
* Author: SeongJae Park <sj@kernel.org>
*/
@@ -655,7 +655,7 @@ static unsigned long damos_madvise(struct damon_target *target,
static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
- struct damos *scheme)
+ struct damos *scheme, unsigned long *sz_filter_passed)
{
int madv_action;
@@ -710,7 +710,6 @@ static int __init damon_va_initcall(void)
.update = damon_va_update,
.prepare_access_checks = damon_va_prepare_access_checks,
.check_accesses = damon_va_check_accesses,
- .reset_aggregated = NULL,
.target_valid = damon_va_target_valid,
.cleanup = NULL,
.apply_scheme = damon_va_apply_scheme,
diff --git a/mm/debug.c b/mm/debug.c
index aa57d3ffd4ed..907382257062 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -71,20 +71,27 @@ static void __dump_folio(struct folio *folio, struct page *page,
unsigned long pfn, unsigned long idx)
{
struct address_space *mapping = folio_mapping(folio);
- int mapcount = atomic_read(&page->_mapcount);
+ int mapcount = atomic_read(&page->_mapcount) + 1;
char *type = "";
- mapcount = page_mapcount_is_type(mapcount) ? 0 : mapcount + 1;
+ if (page_mapcount_is_type(mapcount))
+ mapcount = 0;
+
pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
folio_ref_count(folio), mapcount, mapping,
folio->index + idx, pfn);
if (folio_test_large(folio)) {
+ int pincount = 0;
+
+ if (folio_has_pincount(folio))
+ pincount = atomic_read(&folio->_pincount);
+
pr_warn("head: order:%u mapcount:%d entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
folio_order(folio),
folio_mapcount(folio),
folio_entire_mapcount(folio),
folio_nr_pages_mapped(folio),
- atomic_read(&folio->_pincount));
+ pincount);
}
#ifdef CONFIG_MEMCG
@@ -124,25 +131,31 @@ static void __dump_page(const struct page *page)
{
struct folio *foliop, folio;
struct page precise;
+ unsigned long head;
unsigned long pfn = page_to_pfn(page);
unsigned long idx, nr_pages = 1;
int loops = 5;
again:
memcpy(&precise, page, sizeof(*page));
- foliop = page_folio(&precise);
- if (foliop == (struct folio *)&precise) {
+ head = precise.compound_head;
+ if ((head & 1) == 0) {
+ foliop = (struct folio *)&precise;
idx = 0;
if (!folio_test_large(foliop))
goto dump;
foliop = (struct folio *)page;
} else {
+ foliop = (struct folio *)(head - 1);
idx = folio_page_idx(foliop, page);
}
if (idx < MAX_FOLIO_NR_PAGES) {
memcpy(&folio, foliop, 2 * sizeof(struct page));
nr_pages = folio_nr_pages(&folio);
+ if (nr_pages > 1)
+ memcpy(&folio.__page_2, &foliop->__page_2,
+ sizeof(struct page));
foliop = &folio;
}
@@ -162,7 +175,7 @@ dump:
void dump_page(const struct page *page, const char *reason)
{
if (PagePoisoned(page))
- pr_warn("page:%p is uninitialized and poisoned", page);
+ pr_warn("page:%p is uninitialized and poisoned\n", page);
else
__dump_page(page);
if (reason)
@@ -178,11 +191,17 @@ void dump_vma(const struct vm_area_struct *vma)
pr_emerg("vma %px start %px end %px mm %px\n"
"prot %lx anon_vma %px vm_ops %px\n"
"pgoff %lx file %px private_data %px\n"
+#ifdef CONFIG_PER_VMA_LOCK
+ "refcnt %x\n"
+#endif
"flags: %#lx(%pGv)\n",
vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm,
(unsigned long)pgprot_val(vma->vm_page_prot),
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
vma->vm_file, vma->vm_private_data,
+#ifdef CONFIG_PER_VMA_LOCK
+ refcount_read(&vma->vm_refcnt),
+#endif
vma->vm_flags, &vma->vm_flags);
}
EXPORT_SYMBOL(dump_vma);
@@ -246,6 +265,83 @@ void dump_mm(const struct mm_struct *mm)
}
EXPORT_SYMBOL(dump_mm);
+void dump_vmg(const struct vma_merge_struct *vmg, const char *reason)
+{
+ if (reason)
+ pr_warn("vmg %px dumped because: %s\n", vmg, reason);
+
+ if (!vmg) {
+ pr_warn("vmg %px state: (NULL)\n", vmg);
+ return;
+ }
+
+ pr_warn("vmg %px state: mm %px pgoff %lx\n"
+ "vmi %px [%lx,%lx)\n"
+ "prev %px middle %px next %px target %px\n"
+ "start %lx end %lx flags %lx\n"
+ "file %px anon_vma %px policy %px\n"
+ "uffd_ctx %px\n"
+ "anon_name %px\n"
+ "state %x\n"
+ "just_expand %d\n"
+ "__adjust_middle_start %d __adjust_next_start %d\n"
+ "__remove_middle %d __remove_next %d\n",
+ vmg, vmg->mm, vmg->pgoff,
+ vmg->vmi, vmg->vmi ? vma_iter_addr(vmg->vmi) : 0,
+ vmg->vmi ? vma_iter_end(vmg->vmi) : 0,
+ vmg->prev, vmg->middle, vmg->next, vmg->target,
+ vmg->start, vmg->end, vmg->flags,
+ vmg->file, vmg->anon_vma, vmg->policy,
+#ifdef CONFIG_USERFAULTFD
+ vmg->uffd_ctx.ctx,
+#else
+ (void *)0,
+#endif
+ vmg->anon_name,
+ (int)vmg->state,
+ vmg->just_expand,
+ vmg->__adjust_middle_start, vmg->__adjust_next_start,
+ vmg->__remove_middle, vmg->__remove_next);
+
+ if (vmg->mm) {
+ pr_warn("vmg %px mm:\n", vmg);
+ dump_mm(vmg->mm);
+ } else {
+ pr_warn("vmg %px mm: (NULL)\n", vmg);
+ }
+
+ if (vmg->prev) {
+ pr_warn("vmg %px prev:\n", vmg);
+ dump_vma(vmg->prev);
+ } else {
+ pr_warn("vmg %px prev: (NULL)\n", vmg);
+ }
+
+ if (vmg->middle) {
+ pr_warn("vmg %px middle:\n", vmg);
+ dump_vma(vmg->middle);
+ } else {
+ pr_warn("vmg %px middle: (NULL)\n", vmg);
+ }
+
+ if (vmg->next) {
+ pr_warn("vmg %px next:\n", vmg);
+ dump_vma(vmg->next);
+ } else {
+ pr_warn("vmg %px next: (NULL)\n", vmg);
+ }
+
+#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
+ if (vmg->vmi) {
+ pr_warn("vmg %px vmi:\n", vmg);
+ vma_iter_dump_tree(vmg->vmi);
+ } else {
+ pr_warn("vmg %px vmi: (NULL)\n", vmg);
+ }
+#endif
+}
+EXPORT_SYMBOL(dump_vmg);
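
dump_vmg() is a debugging aid for VMA merge state. A hedged sketch of a caller; the warning condition here is purely hypothetical:

static void example_check_vmg(struct vma_merge_struct *vmg)
{
	/* dump the full merge state if we somehow lost the mm */
	if (WARN_ON_ONCE(!vmg->mm))
		dump_vmg(vmg, "merge attempted without an mm");
}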
+
static bool page_init_poisoning __read_mostly = true;
static int __init setup_vm_debug(char *str)
diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c
index d46acf989dde..6a26eca546c3 100644
--- a/mm/debug_page_alloc.c
+++ b/mm/debug_page_alloc.c
@@ -23,7 +23,7 @@ static int __init debug_guardpage_minorder_setup(char *buf)
unsigned long res;
if (kstrtoul(buf, 10, &res) < 0 || res > MAX_PAGE_ORDER / 2) {
- pr_err("Bad debug_guardpage_minorder value\n");
+ pr_err("Bad debug_guardpage_minorder value: %s\n", buf);
return 0;
}
_debug_guardpage_minorder = res;
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index bc748f700a9e..7731b238b534 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -910,26 +910,18 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args)
#ifdef CONFIG_HUGETLB_PAGE
static void __init hugetlb_basic_tests(struct pgtable_debug_args *args)
{
- struct page *page;
pte_t pte;
pr_debug("Validating HugeTLB basic\n");
- /*
- * Accessing the page associated with the pfn is safe here,
- * as it was previously derived from a real kernel symbol.
- */
- page = pfn_to_page(args->fixed_pmd_pfn);
- pte = mk_huge_pte(page, args->page_prot);
+ pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot);
+ pte = arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS);
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+ WARN_ON(!pte_huge(pte));
+#endif
WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte)));
WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte))));
WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte))));
-
-#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
- pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot);
-
- WARN_ON(!pte_huge(arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS)));
-#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
}
#else /* !CONFIG_HUGETLB_PAGE */
static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { }
diff --git a/mm/dmapool.c b/mm/dmapool.c
index f0bfc6c490f4..5be8cc1c6529 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -56,6 +56,7 @@ struct dma_pool { /* the pool */
unsigned int size;
unsigned int allocation;
unsigned int boundary;
+ int node;
char name[32];
struct list_head pools;
};
@@ -199,12 +200,13 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
/**
- * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
+ * dma_pool_create_node - Creates a pool of consistent memory blocks, for dma.
* @name: name of pool, for diagnostics
* @dev: device that will be doing the DMA
* @size: size of the blocks in this pool.
* @align: alignment requirement for blocks; must be a power of two
* @boundary: returned blocks won't cross this power of two boundary
+ * @node: optional NUMA node to allocate structs 'dma_pool' and 'dma_page' on
* Context: not in_interrupt()
*
* Given one of these pools, dma_pool_alloc()
@@ -221,8 +223,8 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
* Return: a dma allocation pool with the requested characteristics, or
* %NULL if one can't be created.
*/
-struct dma_pool *dma_pool_create(const char *name, struct device *dev,
- size_t size, size_t align, size_t boundary)
+struct dma_pool *dma_pool_create_node(const char *name, struct device *dev,
+ size_t size, size_t align, size_t boundary, int node)
{
struct dma_pool *retval;
size_t allocation;
@@ -251,7 +253,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
boundary = min(boundary, allocation);
- retval = kzalloc(sizeof(*retval), GFP_KERNEL);
+ retval = kzalloc_node(sizeof(*retval), GFP_KERNEL, node);
if (!retval)
return retval;
@@ -264,6 +266,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
retval->size = size;
retval->boundary = boundary;
retval->allocation = allocation;
+ retval->node = node;
INIT_LIST_HEAD(&retval->pools);
/*
@@ -295,7 +298,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
mutex_unlock(&pools_reg_lock);
return retval;
}
-EXPORT_SYMBOL(dma_pool_create);
+EXPORT_SYMBOL(dma_pool_create_node);
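
With the _node variant, a driver can keep the pool's 'dma_pool' and 'dma_page' bookkeeping on the device's NUMA node. A hedged usage sketch; the pool name and block geometry are made up, and callers that do not care about the node presumably keep using dma_pool_create():

static struct dma_pool *example_create_pool(struct device *dev)
{
	/* 256-byte blocks, 16-byte aligned, no boundary-crossing constraint */
	return dma_pool_create_node("example-pool", dev, 256, 16, 0,
				    dev_to_node(dev));
}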
static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
{
@@ -335,7 +338,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
{
struct dma_page *page;
- page = kmalloc(sizeof(*page), mem_flags);
+ page = kmalloc_node(sizeof(*page), mem_flags, pool->node);
if (!page)
return NULL;
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index ce06b2884789..ff35b84a7b50 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -245,7 +245,10 @@ early_memremap_prot(resource_size_t phys_addr, unsigned long size,
#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
-void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
+/*
+ * Return -ENOMEM if early_memremap() fails (e.g. no empty fixmap slot is left).
+ */
+int __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
{
unsigned long slop, clen;
char *p;
@@ -256,12 +259,15 @@ void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
if (clen > MAX_MAP_CHUNK - slop)
clen = MAX_MAP_CHUNK - slop;
p = early_memremap(src & PAGE_MASK, clen + slop);
+ if (!p)
+ return -ENOMEM;
memcpy(dest, p + slop, clen);
early_memunmap(p, clen + slop);
dest += clen;
src += clen;
size -= clen;
}
+ return 0;
}
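
Because copy_from_early_mem() can now fail, callers must check its return value. A hedged sketch of the adjusted call-site pattern; the caller shown is hypothetical:

static int __init example_copy_blob(void *dst, phys_addr_t src,
				    unsigned long len)
{
	int err = copy_from_early_mem(dst, src, len);

	if (err)
		pr_err("early copy of %lu bytes failed: %d\n", len, err);
	return err;
}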
#else /* CONFIG_MMU */
diff --git a/mm/execmem.c b/mm/execmem.c
index 317b6a8d35be..2b683e7d864d 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -257,7 +257,6 @@ out_unlock:
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
- unsigned long start, end;
struct vm_struct *vm;
size_t alloc_size;
int err = -ENOMEM;
@@ -275,26 +274,18 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
/* fill memory with instructions that will trap */
execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
- start = (unsigned long)p;
- end = start + alloc_size;
-
- vunmap_range(start, end);
-
- err = execmem_set_direct_map_valid(vm, false);
- if (err)
- goto err_free_mem;
-
- err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
- PMD_SHIFT);
+ err = set_memory_rox((unsigned long)p, vm->nr_pages);
if (err)
goto err_free_mem;
err = execmem_cache_add(p, alloc_size);
if (err)
- goto err_free_mem;
+ goto err_reset_direct_map;
return 0;
+err_reset_direct_map:
+ execmem_set_direct_map_valid(vm, true);
err_free_mem:
vfree(p);
return err;
@@ -344,6 +335,28 @@ static bool execmem_cache_free(void *ptr)
return true;
}
+
+int execmem_make_temp_rw(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+ int ret;
+
+ ret = set_memory_nx(addr, nr);
+ if (ret)
+ return ret;
+
+ return set_memory_rw(addr, nr);
+}
+
+int execmem_restore_rox(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+
+ return set_memory_rox(addr, nr);
+}
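
execmem_make_temp_rw() and execmem_restore_rox() bracket a temporary write window on ROX execmem. A hedged sketch of the intended call pattern; the memcpy() stands in for whatever patching a real caller does:

static int example_patch_execmem(void *text, size_t size,
				 const void *insns, size_t len)
{
	int err = execmem_make_temp_rw(text, size);

	if (err)
		return err;
	memcpy(text, insns, len);	/* stand-in for the real patching step */
	/* put the region back to read-only + executable */
	return execmem_restore_rox(text, size);
}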
+
#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
@@ -364,6 +377,8 @@ void *execmem_alloc(enum execmem_type type, size_t size)
pgprot_t pgprot = range->pgprot;
void *p;
+ size = PAGE_ALIGN(size);
+
if (use_cache)
p = execmem_cache_alloc(range, size);
else
diff --git a/mm/filemap.c b/mm/filemap.c
index 7c76a123ba18..bada249b9fb7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -47,6 +47,7 @@
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
+#include <linux/sysctl.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -124,15 +125,6 @@
* ->private_lock (zap_pte_range->block_dirty_folio)
*/
-static void mapping_set_update(struct xa_state *xas,
- struct address_space *mapping)
-{
- if (dax_mapping(mapping) || shmem_mapping(mapping))
- return;
- xas_set_update(xas, workingset_update_node);
- xas_set_lru(xas, &shadow_nodes);
-}
-
static void page_cache_delete(struct address_space *mapping,
struct folio *folio, void *shadow)
{
@@ -150,7 +142,7 @@ static void page_cache_delete(struct address_space *mapping,
xas_init_marks(&xas);
folio->mapping = NULL;
- /* Leave page->index set: truncation lookup relies upon it */
+ /* Leave folio->index set: truncation lookup relies upon it */
mapping->nrpages -= nr;
}
@@ -235,15 +227,12 @@ void __filemap_remove_folio(struct folio *folio, void *shadow)
void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
void (*free_folio)(struct folio *);
- int refs = 1;
free_folio = mapping->a_ops->free_folio;
if (free_folio)
free_folio(folio);
- if (folio_test_large(folio))
- refs = folio_nr_pages(folio);
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, folio_nr_pages(folio));
}
/**
@@ -450,6 +439,24 @@ int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
EXPORT_SYMBOL(filemap_fdatawrite_range);
/**
+ * filemap_fdatawrite_range_kick - start writeback on a range
+ * @mapping: target address_space
+ * @start: index to start writeback on
+ * @end: last (inclusive) index for writeback
+ *
+ * This is a non-integrity writeback helper, to start writing back folios
+ * for the indicated range.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
+ loff_t end)
+{
+ return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE);
+}
+EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);
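
A hedged usage sketch for the new helper: start writeback for one byte range of an inode's page cache without waiting for it to complete (the caller is hypothetical):

static int example_kick_writeback(struct inode *inode, loff_t pos, size_t len)
{
	/* 'end' is inclusive, hence the -1 */
	return filemap_fdatawrite_range_kick(inode->i_mapping, pos,
					     pos + len - 1);
}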
+
+/**
* filemap_flush - mostly a non-blocking flush
* @mapping: target address_space
*
@@ -850,11 +857,10 @@ EXPORT_SYMBOL_GPL(replace_page_cache_folio);
noinline int __filemap_add_folio(struct address_space *mapping,
struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
- XA_STATE(xas, &mapping->i_pages, index);
- void *alloced_shadow = NULL;
- int alloced_order = 0;
+ XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
bool huge;
long nr;
+ unsigned int forder = folio_order(folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
@@ -863,7 +869,6 @@ noinline int __filemap_add_folio(struct address_space *mapping,
mapping_set_update(&xas, mapping);
VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
- xas_set_order(&xas, index, folio_order(folio));
huge = folio_test_hugetlb(folio);
nr = folio_nr_pages(folio);
@@ -873,7 +878,7 @@ noinline int __filemap_add_folio(struct address_space *mapping,
folio->index = xas.xa_index;
for (;;) {
- int order = -1, split_order = 0;
+ int order = -1;
void *entry, *old = NULL;
xas_lock_irq(&xas);
@@ -891,21 +896,25 @@ noinline int __filemap_add_folio(struct address_space *mapping,
order = xas_get_order(&xas);
}
- /* entry may have changed before we re-acquire the lock */
- if (alloced_order && (old != alloced_shadow || order != alloced_order)) {
- xas_destroy(&xas);
- alloced_order = 0;
- }
-
if (old) {
- if (order > 0 && order > folio_order(folio)) {
+ if (order > 0 && order > forder) {
+ unsigned int split_order = max(forder,
+ xas_try_split_min_order(order));
+
/* How to handle large swap entries? */
BUG_ON(shmem_mapping(mapping));
- if (!alloced_order) {
- split_order = order;
- goto unlock;
+
+ while (order > forder) {
+ xas_set_order(&xas, index, split_order);
+ xas_try_split(&xas, old, order);
+ if (xas_error(&xas))
+ goto unlock;
+ order = split_order;
+ split_order =
+ max(xas_try_split_min_order(
+ split_order),
+ forder);
}
- xas_split(&xas, old, order);
xas_reset(&xas);
}
if (shadowp)
@@ -929,17 +938,6 @@ noinline int __filemap_add_folio(struct address_space *mapping,
unlock:
xas_unlock_irq(&xas);
- /* split needed, alloc here and retry. */
- if (split_order) {
- xas_split_alloc(&xas, old, split_order, gfp);
- if (xas_error(&xas))
- goto error;
- alloced_shadow = old;
- alloced_order = split_order;
- xas_reset(&xas);
- continue;
- }
-
if (!xas_nomem(&xas, gfp))
break;
}
@@ -951,7 +949,7 @@ unlock:
return 0;
error:
folio->mapping = NULL;
- /* Leave page->index set: truncation relies upon it */
+ /* Leave folio->index set: truncation relies upon it */
folio_put_refs(folio, nr);
return xas_error(&xas);
}
@@ -1068,6 +1066,19 @@ static wait_queue_head_t *folio_waitqueue(struct folio *folio)
return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
}
+/* How many times do we accept lock stealing from under a waiter? */
+static int sysctl_page_lock_unfairness = 5;
+static const struct ctl_table filemap_sysctl_table[] = {
+ {
+ .procname = "page_lock_unfairness",
+ .data = &sysctl_page_lock_unfairness,
+ .maxlen = sizeof(sysctl_page_lock_unfairness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ }
+};
+
void __init pagecache_init(void)
{
int i;
@@ -1076,6 +1087,7 @@ void __init pagecache_init(void)
init_waitqueue_head(&folio_wait_table[i]);
page_writeback_init();
+ register_sysctl_init("vm", filemap_sysctl_table);
}
/*
@@ -1223,9 +1235,6 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
return true;
}
-/* How many times do we accept lock stealing from under a waiter? */
-int sysctl_page_lock_unfairness = 5;
-
static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
int state, enum behavior behavior)
{
@@ -1369,7 +1378,7 @@ repeat:
* @ptl: already locked ptl. This function will drop the lock.
*
* Wait for a migration entry referencing the given page to be removed. This is
- * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
+ * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except
* this can be called without taking a reference on the page. Instead this
* should be called while holding the ptl for the migration entry referencing
* the page.
@@ -1473,25 +1482,6 @@ static int folio_put_wait_locked(struct folio *folio, int state)
}
/**
- * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
- * @folio: Folio defining the wait queue of interest
- * @waiter: Waiter to add to the queue
- *
- * Add an arbitrary @waiter to the wait queue for the nominated @folio.
- */
-void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
-{
- wait_queue_head_t *q = folio_waitqueue(folio);
- unsigned long flags;
-
- spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue_entry_tail(q, waiter);
- folio_set_waiters(folio);
- spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL_GPL(folio_add_wait_queue);
-
-/**
* folio_unlock - Unlock a locked folio.
* @folio: The folio.
*
@@ -1532,7 +1522,7 @@ void folio_end_read(struct folio *folio, bool success)
/* Must be in bottom byte for x86 to work */
BUILD_BUG_ON(PG_uptodate > 7);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);
+ VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio);
if (likely(success))
mask |= 1 << PG_uptodate;
@@ -1599,6 +1589,43 @@ int folio_wait_private_2_killable(struct folio *folio)
}
EXPORT_SYMBOL(folio_wait_private_2_killable);
+static void filemap_end_dropbehind(struct folio *folio)
+{
+ struct address_space *mapping = folio->mapping;
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return;
+ if (!folio_test_clear_dropbehind(folio))
+ return;
+ if (mapping)
+ folio_unmap_invalidate(mapping, folio, 0);
+}
+
+/*
+ * If folio was marked as dropbehind, then pages should be dropped when writeback
+ * completes. Do that now. If we fail, it's likely because of a big folio -
+ * just reset dropbehind for that case and later completions should invalidate.
+ */
+static void filemap_end_dropbehind_write(struct folio *folio)
+{
+ if (!folio_test_dropbehind(folio))
+ return;
+
+ /*
+ * Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
+ * but can happen if normal writeback just happens to find dirty folios
+ * that were created as part of uncached writeback, and that writeback
+ * would otherwise not need non-IRQ handling. Just skip the
+ * invalidation in that case.
+ */
+ if (in_task() && folio_trylock(folio)) {
+ filemap_end_dropbehind(folio);
+ folio_unlock(folio);
+ }
+}
+
/**
* folio_end_writeback - End writeback against a folio.
* @folio: The folio.
@@ -1632,6 +1659,8 @@ void folio_end_writeback(struct folio *folio)
folio_get(folio);
if (__folio_end_writeback(folio))
folio_wake_bit(folio, PG_writeback);
+
+ filemap_end_dropbehind_write(folio);
acct_reclaim_writeback(folio);
folio_put(folio);
}
@@ -1955,6 +1984,8 @@ no_page:
/* Init accessed so avoid atomic mark_page_accessed later */
if (fgp_flags & FGP_ACCESSED)
__folio_set_referenced(folio);
+ if (fgp_flags & FGP_DONTCACHE)
+ __folio_set_dropbehind(folio);
err = filemap_add_folio(mapping, folio, index, gfp);
if (!err)
@@ -1965,8 +1996,19 @@ no_page:
if (err == -EEXIST)
goto repeat;
- if (err)
+ if (err) {
+ /*
+ * When NOWAIT I/O fails to allocate folios this could
+ * be due to a nonblocking memory allocation and not
+ * because the system actually is out of memory.
+ * Return -EAGAIN so that the caller retries in a
+ * blocking fashion instead of propagating -ENOMEM
+ * to the application.
+ */
+ if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM)
+ err = -EAGAIN;
return ERR_PTR(err);
+ }
/*
* filemap_add_folio locks the page, and for mmap
* we expect an unlocked page.
@@ -1977,6 +2019,9 @@ no_page:
if (!folio)
return ERR_PTR(-ENOENT);
+ /* not an uncached lookup, clear uncached if set */
+ if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))
+ folio_clear_dropbehind(folio);
return folio;
}
EXPORT_SYMBOL(__filemap_get_folio);
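
FGP_DONTCACHE marks a newly created folio dropbehind, so it is dropped from the page cache once it is clean again. A hedged sketch of a lookup that opts in; the flag combination simply follows the usual FGP_LOCK | FGP_CREAT conventions and the returned pointer may be an ERR_PTR():

static struct folio *example_get_uncached_folio(struct address_space *mapping,
						pgoff_t index)
{
	/* create (or find) a locked folio that should not linger in cache */
	return __filemap_get_folio(mapping, index,
				   FGP_LOCK | FGP_CREAT | FGP_DONTCACHE,
				   mapping_gfp_mask(mapping));
}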
@@ -2210,6 +2255,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
*start = folio->index + nr;
goto out;
}
+ xas_advance(&xas, folio_next_index(folio) - 1);
continue;
put_folio:
folio_put(folio);
@@ -2459,18 +2505,22 @@ unlock_mapping:
return error;
}
-static int filemap_create_folio(struct file *file,
- struct address_space *mapping, loff_t pos,
- struct folio_batch *fbatch)
+static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch)
{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
struct folio *folio;
int error;
unsigned int min_order = mapping_min_folio_order(mapping);
pgoff_t index;
+ if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
+ return -EAGAIN;
+
folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order);
if (!folio)
return -ENOMEM;
+ if (iocb->ki_flags & IOCB_DONTCACHE)
+ __folio_set_dropbehind(folio);
/*
* Protect against truncate / hole punch. Grabbing invalidate_lock
@@ -2486,7 +2536,7 @@ static int filemap_create_folio(struct file *file,
* well to keep locking rules simple.
*/
filemap_invalidate_lock_shared(mapping);
- index = (pos >> (PAGE_SHIFT + min_order)) << min_order;
+ index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order;
error = filemap_add_folio(mapping, folio, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error == -EEXIST)
@@ -2494,7 +2544,8 @@ static int filemap_create_folio(struct file *file,
if (error)
goto error;
- error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
+ error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
+ folio);
if (error)
goto error;
@@ -2515,6 +2566,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file,
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
+ if (iocb->ki_flags & IOCB_DONTCACHE)
+ ractl.dropbehind = 1;
page_cache_async_ra(&ractl, folio, last_index - folio->index);
return 0;
}
@@ -2524,7 +2577,6 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count,
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
- struct file_ra_state *ra = &filp->f_ra;
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
pgoff_t last_index;
struct folio *folio;
@@ -2539,20 +2591,21 @@ retry:
filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
if (!folio_batch_count(fbatch)) {
+ DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index);
+
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
if (iocb->ki_flags & IOCB_NOWAIT)
flags = memalloc_noio_save();
- page_cache_sync_readahead(mapping, ra, filp, index,
- last_index - index);
+ if (iocb->ki_flags & IOCB_DONTCACHE)
+ ractl.dropbehind = 1;
+ page_cache_sync_ra(&ractl, last_index - index);
if (iocb->ki_flags & IOCB_NOWAIT)
memalloc_noio_restore(flags);
filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
}
if (!folio_batch_count(fbatch)) {
- if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
- return -EAGAIN;
- err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch);
+ err = filemap_create_folio(iocb, fbatch);
if (err == AOP_TRUNCATED_PAGE)
goto retry;
return err;
@@ -2593,6 +2646,18 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
return (pos1 >> shift == pos2 >> shift);
}
+static void filemap_end_dropbehind_read(struct folio *folio)
+{
+ if (!folio_test_dropbehind(folio))
+ return;
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return;
+ if (folio_trylock(folio)) {
+ filemap_end_dropbehind(folio);
+ folio_unlock(folio);
+ }
+}
+
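Not part of the diff: the dropbehind teardown above only fires when the read was issued uncached in the first place. A minimal userspace sketch, assuming the RWF_DONTCACHE preadv2() flag that this series pairs with IOCB_DONTCACHE (guarded in case the installed uapi headers predate it; the input file name is made up):

/* Illustrative only: uncached buffered read via RWF_DONTCACHE. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_DONTCACHE
#define RWF_DONTCACHE 0x00000080	/* assumed value; check linux/fs.h */
#endif

int main(void)
{
	char *buf = malloc(1 << 20);
	struct iovec iov = { .iov_base = buf, .iov_len = 1 << 20 };
	int fd = open("data.bin", O_RDONLY);	/* hypothetical input file */
	ssize_t ret;

	if (!buf || fd < 0)
		return 1;

	/* Buffered read; the folios are marked dropbehind and invalidated
	 * once the read completes, instead of lingering in the page cache. */
	ret = preadv2(fd, &iov, 1, 0, RWF_DONTCACHE);
	printf("read %zd bytes\n", ret);

	close(fd);
	free(buf);
	return 0;
}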
/**
* filemap_read - Read data from the page cache.
* @iocb: The iocb to read.
@@ -2706,8 +2771,12 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
}
}
put_folios:
- for (i = 0; i < folio_batch_count(&fbatch); i++)
- folio_put(fbatch.folios[i]);
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+
+ filemap_end_dropbehind_read(folio);
+ folio_put(folio);
+ }
folio_batch_init(&fbatch);
} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
@@ -2848,8 +2917,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
size = min(size, folio_size(folio) - offset);
offset %= PAGE_SIZE;
- while (spliced < size &&
- !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ while (spliced < size && !pipe_is_full(pipe)) {
struct pipe_buffer *buf = pipe_head_buf(pipe);
size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
@@ -2906,7 +2974,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
iocb.ki_pos = *ppos;
/* Work out how much data we can actually add into the pipe */
- used = pipe_occupancy(pipe->head, pipe->tail);
+ used = pipe_buf_usage(pipe);
npages = max_t(ssize_t, pipe->max_usage - used, 0);
len = min_t(size_t, len, npages * PAGE_SIZE);
@@ -2966,7 +3034,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
total_spliced += n;
*ppos += n;
in->f_ra.prev_pos = *ppos;
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (pipe_is_full(pipe))
goto out;
}
@@ -3005,7 +3073,7 @@ static inline loff_t folio_seek_hole_data(struct xa_state *xas,
if (ops->is_partially_uptodate(folio, offset, bsz) ==
seek_data)
break;
- start = (start + bsz) & ~(bsz - 1);
+ start = (start + bsz) & ~((u64)bsz - 1);
offset += bsz;
} while (offset < folio_size(folio));
unlock:
@@ -3474,7 +3542,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
struct page *page = folio_file_page(folio, start);
- vm_fault_t ret = do_set_pmd(vmf, page);
+ vm_fault_t ret = do_set_pmd(vmf, folio, page);
if (!ret) {
/* The page is mapped successfully, reference consumed. */
folio_unlock(folio);
@@ -3501,10 +3569,10 @@ static struct folio *next_uptodate_folio(struct xa_state *xas,
continue;
if (xa_is_value(folio))
continue;
- if (folio_test_locked(folio))
- continue;
if (!folio_try_get(folio))
continue;
+ if (folio_test_locked(folio))
+ goto skip;
/* Has the page moved or been split? */
if (unlikely(folio != xas_reload(xas)))
goto skip;
@@ -4036,17 +4104,6 @@ retry:
bytes = min(chunk - offset, bytes);
balance_dirty_pages_ratelimited(mapping);
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- */
- if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
- status = -EFAULT;
- break;
- }
-
if (fatal_signal_pending(current)) {
status = -EINTR;
break;
@@ -4064,6 +4121,12 @@ retry:
if (mapping_writably_mapped(mapping))
flush_dcache_folio(folio);
+ /*
+ * Faults here on mmap()s can recurse into arbitrary
+ * filesystem code. Lots of locks are held that can
+ * deadlock. Use an atomic copy to avoid deadlocking
+ * in page fault handling.
+ */
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
flush_dcache_folio(folio);
@@ -4089,6 +4152,16 @@ retry:
bytes = copied;
goto retry;
}
+
+ /*
+ * 'folio' is now unlocked and faults on it can be
+ * handled. Ensure forward progress by trying to
+ * fault it in now.
+ */
+ if (fault_in_iov_iter_readable(i, bytes) == bytes) {
+ status = -EFAULT;
+ break;
+ }
} else {
pos += status;
written += status;
@@ -4385,6 +4458,20 @@ resched:
}
/*
+ * See mincore: reveal pagecache information only for files
+ * that the calling process has write access to, or could (if
+ * tried) open for writing.
+ */
+static inline bool can_do_cachestat(struct file *f)
+{
+ if (f->f_mode & FMODE_WRITE)
+ return true;
+ if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f)))
+ return true;
+ return file_permission(f, MAY_WRITE) == 0;
+}
+
+/*
* The cachestat(2) system call.
*
* cachestat() returns the page cache statistics of a file in the
@@ -4439,6 +4526,9 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd,
if (is_file_hugepages(fd_file(f)))
return -EOPNOTSUPP;
+ if (!can_do_cachestat(fd_file(f)))
+ return -EPERM;
+
if (flags != 0)
return -EINVAL;
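Not part of the diff: with the can_do_cachestat() gate above, cachestat(2) now fails with EPERM for files the caller could not open for writing. A hedged userspace sketch via the raw syscall (the target path is made up; __NR_cachestat and struct cachestat come from the existing uapi, not from this patch):

/* Illustrative only: probe page-cache residency with cachestat(2). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/mman.h>		/* struct cachestat, struct cachestat_range */

int main(void)
{
	struct cachestat_range range = { .off = 0, .len = 0 };	/* len == 0: whole file */
	struct cachestat cs;
	int fd = open("/var/tmp/some-file", O_RDONLY);	/* hypothetical path */

	if (fd < 0)
		return 1;

	/* Succeeds only if we hold, or could obtain, write access. */
	if (syscall(__NR_cachestat, fd, &range, &cs, 0) == 0)
		printf("cached %llu, dirty %llu\n",
		       (unsigned long long)cs.nr_cache,
		       (unsigned long long)cs.nr_dirty);
	else
		perror("cachestat");

	close(fd);
	return 0;
}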
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 1d1832e2a599..45540942d148 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -28,12 +28,6 @@ void wait_on_page_writeback(struct page *page)
}
EXPORT_SYMBOL_GPL(wait_on_page_writeback);
-void wait_for_stable_page(struct page *page)
-{
- return folio_wait_stable(page_folio(page));
-}
-EXPORT_SYMBOL_GPL(wait_for_stable_page);
-
void mark_page_accessed(struct page *page)
{
folio_mark_accessed(page_folio(page));
@@ -90,11 +84,3 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
return folio_file_page(folio, index);
}
EXPORT_SYMBOL(pagecache_get_page);
-
-struct page *grab_cache_page_write_begin(struct address_space *mapping,
- pgoff_t index)
-{
- return pagecache_get_page(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
-}
-EXPORT_SYMBOL(grab_cache_page_write_begin);
diff --git a/mm/gup.c b/mm/gup.c
index 746070a1d8bf..3c39cbbeebef 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -26,6 +26,7 @@
#include <asm/tlbflush.h>
#include "internal.h"
+#include "swap.h"
struct follow_page_context {
struct dev_pagemap *pgmap;
@@ -52,7 +53,12 @@ static inline void sanity_check_pinned_pages(struct page **pages,
*/
for (; npages; npages--, pages++) {
struct page *page = *pages;
- struct folio *folio = page_folio(page);
+ struct folio *folio;
+
+ if (!page)
+ continue;
+
+ folio = page_folio(page);
if (is_zero_page(page) ||
!folio_test_anon(folio))
@@ -91,8 +97,7 @@ retry:
* belongs to this folio.
*/
if (unlikely(page_folio(page) != folio)) {
- if (!put_devmap_managed_folio_refs(folio, refs))
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, refs);
goto retry;
}
@@ -105,14 +110,13 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
if (is_zero_folio(folio))
return;
node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
- if (folio_test_large(folio))
+ if (folio_has_pincount(folio))
atomic_sub(refs, &folio->_pincount);
else
refs *= GUP_PIN_COUNTING_BIAS;
}
- if (!put_devmap_managed_folio_refs(folio, refs))
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, refs);
}
/**
@@ -161,7 +165,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs,
* Increment the normal page refcount field at least once,
* so that the page really is pinned.
*/
- if (folio_test_large(folio)) {
+ if (folio_has_pincount(folio)) {
folio_ref_add(folio, refs);
atomic_add(refs, &folio->_pincount);
} else {
@@ -220,7 +224,7 @@ void folio_add_pin(struct folio *folio)
* page refcount field at least once, so that the page really is
* pinned.
*/
- if (folio_test_large(folio)) {
+ if (folio_has_pincount(folio)) {
WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1);
folio_ref_inc(folio);
atomic_inc(&folio->_pincount);
@@ -409,6 +413,10 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
sanity_check_pinned_pages(pages, npages);
for (i = 0; i < npages; i += nr) {
+ if (!pages[i]) {
+ nr = 1;
+ continue;
+ }
folio = gup_folio_next(pages, npages, i, &nr);
gup_put_folio(folio, nr, FOLL_PIN);
}
@@ -556,8 +564,7 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs,
*/
if (unlikely((flags & FOLL_LONGTERM) &&
!folio_is_longterm_pinnable(folio))) {
- if (!put_devmap_managed_folio_refs(folio, refs))
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, refs);
return NULL;
}
@@ -569,7 +576,7 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs,
* is pinned. That's why the refcount from the earlier
* try_get_folio() is left intact.
*/
- if (folio_test_large(folio))
+ if (folio_has_pincount(folio))
atomic_add(refs, &folio->_pincount);
else
folio_ref_add(folio,
@@ -587,6 +594,33 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs,
}
#endif /* CONFIG_HAVE_GUP_FAST */
+/* Common code for can_follow_write_* */
+static inline bool can_follow_write_common(struct page *page,
+ struct vm_area_struct *vma, unsigned int flags)
+{
+ /* Maybe FOLL_FORCE is set to override it? */
+ if (!(flags & FOLL_FORCE))
+ return false;
+
+ /* But FOLL_FORCE has no effect on shared mappings */
+ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+ return false;
+
+ /* ... or read-only private ones */
+ if (!(vma->vm_flags & VM_MAYWRITE))
+ return false;
+
+ /* ... or already writable ones that just need to take a write fault */
+ if (vma->vm_flags & VM_WRITE)
+ return false;
+
+ /*
+ * See can_change_pte_writable(): we broke COW and could map the page
+ * writable if we have an exclusive anonymous page ...
+ */
+ return page && PageAnon(page) && PageAnonExclusive(page);
+}
+
static struct page *no_page_table(struct vm_area_struct *vma,
unsigned int flags, unsigned long address)
{
@@ -613,6 +647,18 @@ static struct page *no_page_table(struct vm_area_struct *vma,
}
#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
+/* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */
+static inline bool can_follow_write_pud(pud_t pud, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int flags)
+{
+ /* If the pud is writable, we can write to the page. */
+ if (pud_write(pud))
+ return true;
+
+ return can_follow_write_common(page, vma, flags);
+}
+
static struct page *follow_huge_pud(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp,
int flags, struct follow_page_context *ctx)
@@ -625,10 +671,11 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma,
assert_spin_locked(pud_lockptr(mm, pudp));
- if ((flags & FOLL_WRITE) && !pud_write(pud))
+ if (!pud_present(pud))
return NULL;
- if (!pud_present(pud))
+ if ((flags & FOLL_WRITE) &&
+ !can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags))
return NULL;
pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
@@ -677,27 +724,7 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
if (pmd_write(pmd))
return true;
- /* Maybe FOLL_FORCE is set to override it? */
- if (!(flags & FOLL_FORCE))
- return false;
-
- /* But FOLL_FORCE has no effect on shared mappings */
- if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
- return false;
-
- /* ... or read-only private ones */
- if (!(vma->vm_flags & VM_MAYWRITE))
- return false;
-
- /* ... or already writable ones that just need to take a write fault */
- if (vma->vm_flags & VM_WRITE)
- return false;
-
- /*
- * See can_change_pte_writable(): we broke COW and could map the page
- * writable if we have an exclusive anonymous page ...
- */
- if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ if (!can_follow_write_common(page, vma, flags))
return false;
/* ... and a write-fault isn't required for other reasons. */
@@ -798,27 +825,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page,
if (pte_write(pte))
return true;
- /* Maybe FOLL_FORCE is set to override it? */
- if (!(flags & FOLL_FORCE))
- return false;
-
- /* But FOLL_FORCE has no effect on shared mappings */
- if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
- return false;
-
- /* ... or read-only private ones */
- if (!(vma->vm_flags & VM_MAYWRITE))
- return false;
-
- /* ... or already writable ones that just need to take a write fault */
- if (vma->vm_flags & VM_WRITE)
- return false;
-
- /*
- * See can_change_pte_writable(): we broke COW and could map the page
- * writable if we have an exclusive anonymous page ...
- */
- if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ if (!can_follow_write_common(page, vma, flags))
return false;
/* ... and a write-fault isn't required for other reasons. */
@@ -838,11 +845,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
pte_t *ptep, pte;
int ret;
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
- (FOLL_PIN | FOLL_GET)))
- return ERR_PTR(-EINVAL);
-
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!ptep)
return no_page_table(vma, flags, address);
@@ -1100,10 +1102,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
/* user gate pages are read-only */
if (gup_flags & FOLL_WRITE)
return -EFAULT;
- if (address > TASK_SIZE)
- pgd = pgd_offset_k(address);
- else
- pgd = pgd_offset_gate(mm, address);
+ pgd = pgd_offset(mm, address);
if (pgd_none(*pgd))
return -EFAULT;
p4d = p4d_offset(pgd, address);
@@ -1274,6 +1273,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
return -EOPNOTSUPP;
+ if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma))
+ return -EOPNOTSUPP;
+
if (vma_is_secretmem(vma))
return -EFAULT;
@@ -1285,9 +1287,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
- /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
- if (is_vm_hugetlb_page(vma))
- return -EFAULT;
/*
* We used to let the write,force case do COW in a
* VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
@@ -1426,7 +1425,11 @@ static long __get_user_pages(struct mm_struct *mm,
start = untagged_addr_remote(mm, start);
- VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
+ VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET));
do {
struct page *page;
@@ -2108,28 +2111,22 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
*/
size_t fault_in_writeable(char __user *uaddr, size_t size)
{
- char __user *start = uaddr, *end;
+ const unsigned long start = (unsigned long)uaddr;
+ const unsigned long end = start + size;
+ unsigned long cur;
if (unlikely(size == 0))
return 0;
if (!user_write_access_begin(uaddr, size))
return size;
- if (!PAGE_ALIGNED(uaddr)) {
- unsafe_put_user(0, uaddr, out);
- uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
- }
- end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
- if (unlikely(end < start))
- end = NULL;
- while (uaddr != end) {
- unsafe_put_user(0, uaddr, out);
- uaddr += PAGE_SIZE;
- }
+ /* Stop once we overflow to 0. */
+ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
+ unsafe_put_user(0, (char __user *)cur, out);
out:
user_write_access_end();
- if (size > uaddr - start)
- return size - (uaddr - start);
+ if (size > cur - start)
+ return size - (cur - start);
return 0;
}
EXPORT_SYMBOL(fault_in_writeable);
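Not part of the diff: the rewritten fault_in_writeable() (and the readable/safe-writeable variants below) all use the same idea: step through the range one page at a time, round down to a page boundary after each step, and stop if the address wraps to zero rather than pre-computing a possibly overflowing aligned end. A standalone sketch of just that stepping pattern, with PAGE_SIZE and the per-page action as stand-ins for the kernel's:

/* Illustrative only: the overflow-safe page-stepping loop used above. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_ALIGN_DOWN(x) ((x) & ~(PAGE_SIZE - 1))

/* Count how many pages the loop would touch for [start, start + size). */
static unsigned long pages_touched(uintptr_t start, size_t size)
{
	const uintptr_t end = start + size;
	uintptr_t cur;
	unsigned long n = 0;

	if (!size)
		return 0;

	/* Stop once we overflow to 0, exactly like the kernel loop. */
	for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
		n++;	/* the kernel does unsafe_put_user(0, cur, out) here */

	return n;
}

int main(void)
{
	/* An unaligned range of 3 * PAGE_SIZE bytes spans 4 pages. */
	printf("%lu\n", pages_touched(0x1003, 3 * PAGE_SIZE));
	return 0;
}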
@@ -2183,26 +2180,24 @@ EXPORT_SYMBOL(fault_in_subpage_writeable);
*/
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
{
- unsigned long start = (unsigned long)uaddr, end;
+ const unsigned long start = (unsigned long)uaddr;
+ const unsigned long end = start + size;
+ unsigned long cur;
struct mm_struct *mm = current->mm;
bool unlocked = false;
if (unlikely(size == 0))
return 0;
- end = PAGE_ALIGN(start + size);
- if (end < start)
- end = 0;
mmap_read_lock(mm);
- do {
- if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
+ /* Stop once we overflow to 0. */
+ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
+ if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked))
break;
- start = (start + PAGE_SIZE) & PAGE_MASK;
- } while (start != end);
mmap_read_unlock(mm);
- if (size > (unsigned long)uaddr - start)
- return size - ((unsigned long)uaddr - start);
+ if (size > cur - start)
+ return size - (cur - start);
return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);
@@ -2217,30 +2212,24 @@ EXPORT_SYMBOL(fault_in_safe_writeable);
*/
size_t fault_in_readable(const char __user *uaddr, size_t size)
{
- const char __user *start = uaddr, *end;
+ const unsigned long start = (unsigned long)uaddr;
+ const unsigned long end = start + size;
+ unsigned long cur;
volatile char c;
if (unlikely(size == 0))
return 0;
if (!user_read_access_begin(uaddr, size))
return size;
- if (!PAGE_ALIGNED(uaddr)) {
- unsafe_get_user(c, uaddr, out);
- uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
- }
- end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
- if (unlikely(end < start))
- end = NULL;
- while (uaddr != end) {
- unsafe_get_user(c, uaddr, out);
- uaddr += PAGE_SIZE;
- }
+ /* Stop once we overflow to 0. */
+ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
+ unsafe_get_user(c, (const char __user *)cur, out);
out:
user_read_access_end();
(void)c;
- if (size > uaddr - start)
- return size - (uaddr - start);
+ if (size > cur - start)
+ return size - (cur - start);
return 0;
}
EXPORT_SYMBOL(fault_in_readable);
@@ -2248,6 +2237,7 @@ EXPORT_SYMBOL(fault_in_readable);
/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
+ * @locked: a pointer to an int denoting whether the mmap_lock is held
*
* Returns struct page pointer of user page pinned for dump,
* to be freed afterwards by put_page().
@@ -2260,13 +2250,12 @@ EXPORT_SYMBOL(fault_in_readable);
* Called without mmap_lock (takes and releases the mmap_lock by itself).
*/
#ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
+struct page *get_dump_page(unsigned long addr, int *locked)
{
struct page *page;
- int locked = 0;
int ret;
- ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked,
+ ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked,
FOLL_FORCE | FOLL_DUMP | FOLL_GET);
return (ret == 1) ? page : NULL;
}
@@ -2338,7 +2327,7 @@ static unsigned long collect_longterm_unpinnable_folios(
continue;
if (folio_test_hugetlb(folio)) {
- isolate_hugetlb(folio, movable_folio_list);
+ folio_isolate_hugetlb(folio, movable_folio_list);
continue;
}
@@ -2757,7 +2746,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
*
* *) ptes can be read atomically by the architecture.
*
- * *) access_ok is sufficient to validate userspace address ranges.
+ * *) valid user addresses are below TASK_SIZE_MAX
*
* The last two assumptions can be relaxed by the addition of helper functions.
*
@@ -3010,11 +2999,6 @@ static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr,
break;
}
- if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- break;
- }
-
folio = try_grab_folio_fast(page, 1, flags);
if (!folio) {
gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
@@ -3178,46 +3162,6 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
return 1;
}
-static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- int refs;
- struct page *page;
- struct folio *folio;
-
- if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
- return 0;
-
- BUILD_BUG_ON(pgd_devmap(orig));
-
- page = pgd_page(orig);
- refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr);
-
- folio = try_grab_folio_fast(page, refs, flags);
- if (!folio)
- return 0;
-
- if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- if (!gup_fast_folio_allowed(folio, flags)) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- *nr += refs;
- folio_set_referenced(folio);
- return 1;
-}
-
static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
unsigned long end, unsigned int flags, struct page **pages,
int *nr)
@@ -3312,12 +3256,9 @@ static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
return;
- if (unlikely(pgd_leaf(pgd))) {
- if (!gup_fast_pgd_leaf(pgd, pgdp, addr, next, flags,
- pages, nr))
- return;
- } else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
- pages, nr))
+ BUILD_BUG_ON(pgd_leaf(pgd));
+ if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
+ pages, nr))
return;
} while (pgdp++, addr = next, addr != end);
}
@@ -3351,8 +3292,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
return 0;
if (gup_flags & FOLL_PIN) {
- seq = raw_read_seqcount(&current->mm->write_protect_seq);
- if (seq & 1)
+ if (!raw_seqcount_try_begin(&current->mm->write_protect_seq, seq))
return 0;
}
@@ -3365,7 +3305,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
* include/asm-generic/tlb.h for more details.
*
* We do not adopt an rcu_read_lock() here as we also want to block IPIs
- * that come from THPs splitting.
+ * that come from callers of tlb_remove_table_sync_one().
*/
local_irq_save(flags);
gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
@@ -3412,8 +3352,6 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
return -EOVERFLOW;
if (end > TASK_SIZE_MAX)
return -EFAULT;
- if (unlikely(!access_ok((void __user *)start, len)))
- return -EFAULT;
nr_pinned = gup_fast(start, end, gup_flags, pages);
if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
@@ -3655,7 +3593,7 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
{
unsigned int flags, nr_folios, nr_found;
unsigned int i, pgshift = PAGE_SHIFT;
- pgoff_t start_idx, end_idx, next_idx;
+ pgoff_t start_idx, end_idx;
struct folio *folio = NULL;
struct folio_batch fbatch;
struct hstate *h;
@@ -3705,20 +3643,8 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
folio = NULL;
}
- next_idx = 0;
for (i = 0; i < nr_found; i++) {
- /*
- * As there can be multiple entries for a
- * given folio in the batch returned by
- * filemap_get_folios_contig(), the below
- * check is to ensure that we pin and return a
- * unique set of folios between start and end.
- */
- if (next_idx &&
- next_idx != folio_index(fbatch.folios[i]))
- continue;
-
- folio = page_folio(&fbatch.folios[i]->page);
+ folio = fbatch.folios[i];
if (try_grab_folio(folio, 1, FOLL_PIN)) {
folio_batch_release(&fbatch);
@@ -3730,7 +3656,6 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
*offset = offset_in_folio(folio, start);
folios[nr_folios] = folio;
- next_idx = folio_next_index(folio);
if (++nr_folios == max_folios)
break;
}
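Not part of the diff: unpin_user_pages() above now skips NULL slots, so a caller can release or hand off individual pages, NULL their entries, and still tear the whole array down in one call. A hedged kernel-style sketch; the helper and its policy are hypothetical, while pin_user_pages_fast(), unpin_user_page() and FOLL_WRITE are the real interfaces:

/* Hypothetical driver helper, not part of this patch: pin a user buffer,
 * release the odd-indexed pages early, and leave NULLs behind so that a
 * later bulk unpin_user_pages() (which now skips NULL slots) still works. */
#include <linux/mm.h>

static int demo_pin_even_pages(unsigned long uaddr, struct page **pages,
			       int nr_pages)
{
	int pinned, i;

	pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
	if (pinned <= 0)
		return pinned;

	for (i = 1; i < pinned; i += 2) {
		unpin_user_page(pages[i]);
		pages[i] = NULL;	/* tolerated by unpin_user_pages() now */
	}

	/* Teardown elsewhere can simply call unpin_user_pages(pages, pinned). */
	return pinned;
}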
diff --git a/mm/hmm.c b/mm/hmm.c
index 7e0229ae4a5a..feac86196a65 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -10,6 +10,7 @@
*/
#include <linux/pagewalk.h>
#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -23,6 +24,7 @@
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
+#include <linux/pci-p2pdma.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
@@ -39,13 +41,21 @@ enum {
HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};
+enum {
+ /* These flags are carried from input-to-output */
+ HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
+ HMM_PFN_P2PDMA_BUS,
+};
+
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
struct hmm_range *range, unsigned long cpu_flags)
{
unsigned long i = (addr - range->start) >> PAGE_SHIFT;
- for (; addr < end; addr += PAGE_SIZE, i++)
- range->hmm_pfns[i] = cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= cpu_flags;
+ }
return 0;
}
@@ -202,8 +212,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
return hmm_vma_fault(addr, end, required_fault, walk);
pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -230,14 +242,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
unsigned long cpu_flags;
pte_t pte = ptep_get(ptep);
uint64_t pfn_req_flags = *hmm_pfn;
+ uint64_t new_pfn_flags = 0;
if (pte_none_mostly(pte)) {
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
goto fault;
- *hmm_pfn = 0;
- return 0;
+ goto out;
}
if (!pte_present(pte)) {
@@ -248,21 +260,19 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
* just report the PFN.
*/
if (is_device_private_entry(entry) &&
- pfn_swap_entry_to_page(entry)->pgmap->owner ==
+ page_pgmap(pfn_swap_entry_to_page(entry))->owner ==
range->dev_private_owner) {
cpu_flags = HMM_PFN_VALID;
if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
- *hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
- return 0;
+ new_pfn_flags = swp_offset_pfn(entry) | cpu_flags;
+ goto out;
}
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
- if (!required_fault) {
- *hmm_pfn = 0;
- return 0;
- }
+ if (!required_fault)
+ goto out;
if (!non_swap_entry(entry))
goto fault;
@@ -304,11 +314,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
pte_unmap(ptep);
return -EFAULT;
}
- *hmm_pfn = HMM_PFN_ERROR;
- return 0;
+ new_pfn_flags = HMM_PFN_ERROR;
+ goto out;
}
- *hmm_pfn = pte_pfn(pte) | cpu_flags;
+ new_pfn_flags = pte_pfn(pte) | cpu_flags;
+out:
+ *hmm_pfn = (*hmm_pfn & HMM_PFN_INOUT_FLAGS) | new_pfn_flags;
return 0;
fault:
@@ -448,8 +460,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
}
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- for (i = 0; i < npages; ++i, ++pfn)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; i < npages; ++i, ++pfn) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
goto out_unlock;
}
@@ -507,8 +521,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
}
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
- for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
- range->hmm_pfns[i] = pfn | cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= pfn | cpu_flags;
+ }
spin_unlock(ptl);
return 0;
@@ -607,3 +623,211 @@ int hmm_range_fault(struct hmm_range *range)
return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_dma_map_alloc - Allocate HMM map structure
+ * @dev: device to allocate structure for
+ * @map: HMM map to allocate
+ * @nr_entries: number of entries in the map
+ * @dma_entry_size: size of the DMA entry in the map
+ *
+ * Allocate the HMM map structure and all the lists it contains.
+ * Return 0 on success, -EOPNOTSUPP if the device cannot be used for this
+ * kind of mapping, or -ENOMEM on allocation failure.
+ */
+int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
+ size_t nr_entries, size_t dma_entry_size)
+{
+ bool dma_need_sync = false;
+ bool use_iova;
+
+ WARN_ON_ONCE(!(nr_entries * PAGE_SIZE / dma_entry_size));
+
+ /*
+ * The HMM API violates our normal DMA buffer ownership rules and can't
+ * transfer buffer ownership. The dma_addressing_limited() check is a
+ * best approximation to ensure no swiotlb buffering happens.
+ */
+#ifdef CONFIG_DMA_NEED_SYNC
+ dma_need_sync = !dev->dma_skip_sync;
+#endif /* CONFIG_DMA_NEED_SYNC */
+ if (dma_need_sync || dma_addressing_limited(dev))
+ return -EOPNOTSUPP;
+
+ map->dma_entry_size = dma_entry_size;
+ map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->pfn_list)
+ return -ENOMEM;
+
+ use_iova = dma_iova_try_alloc(dev, &map->state, 0,
+ nr_entries * PAGE_SIZE);
+ if (!use_iova && dma_need_unmap(dev)) {
+ map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->dma_list)
+ goto err_dma;
+ }
+ return 0;
+
+err_dma:
+ kvfree(map->pfn_list);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_alloc);
+
+/**
+ * hmm_dma_map_free - Free HMM map structure
+ * @dev: device to free structure from
+ * @map: HMM map containing the various lists and state
+ *
+ * Free the HMM map structure and all the lists it contains.
+ */
+void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map)
+{
+ if (dma_use_iova(&map->state))
+ dma_iova_free(dev, &map->state);
+ kvfree(map->pfn_list);
+ kvfree(map->dma_list);
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_free);
+
+/**
+ * hmm_dma_map_pfn - Map a physical HMM page to DMA address
+ * @dev: Device to map the page for
+ * @map: HMM map
+ * @idx: Index into the PFN and dma address arrays
+ * @p2pdma_state: PCI P2P state.
+ *
+ * Map the page backing the PFN stored at @idx in @map's PFN list to a DMA
+ * address for @dev. When an IOVA range was allocated by hmm_dma_map_alloc(),
+ * the offset into that range is derived from @idx and the map's
+ * dma_entry_size; otherwise a regular page mapping is created and, when the
+ * device needs explicit unmapping, recorded in the map's DMA address list.
+ *
+ * Returns the DMA address on success, or DMA_MAPPING_ERROR on failure.
+ */
+dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
+ size_t idx,
+ struct pci_p2pdma_map_state *p2pdma_state)
+{
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ struct page *page = hmm_pfn_to_page(pfns[idx]);
+ phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]);
+ size_t offset = idx * map->dma_entry_size;
+ unsigned long attrs = 0;
+ dma_addr_t dma_addr;
+ int ret;
+
+ if ((pfns[idx] & HMM_PFN_DMA_MAPPED) &&
+ !(pfns[idx] & HMM_PFN_P2PDMA_BUS)) {
+ /*
+ * We are in this flow when there is a need to resync flags,
+ * for example when page was already linked in prefetch call
+ * with READ flag and now we need to add WRITE flag
+ *
+ * This page was already programmed to HW and we don't want/need
+ * to unlink and link it again just to resync flags.
+ */
+ if (dma_use_iova(state))
+ return state->addr + offset;
+
+ /*
+ * Without dma_need_unmap, the dma_addrs array is NULL, thus we
+ * need to regenerate the address below even if there already
+ * was a mapping. But !dma_need_unmap implies that the
+ * mapping is stateless, so this is fine.
+ */
+ if (dma_need_unmap(dev))
+ return dma_addrs[idx];
+
+ /* Continue to remapping */
+ }
+
+ switch (pci_p2pdma_state(p2pdma_state, dev, page)) {
+ case PCI_P2PDMA_MAP_NONE:
+ break;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ pfns[idx] |= HMM_PFN_P2PDMA;
+ break;
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED;
+ return pci_p2pdma_bus_addr_map(p2pdma_state, paddr);
+ default:
+ return DMA_MAPPING_ERROR;
+ }
+
+ if (dma_use_iova(state)) {
+ ret = dma_iova_link(dev, state, paddr, offset,
+ map->dma_entry_size, DMA_BIDIRECTIONAL,
+ attrs);
+ if (ret)
+ goto error;
+
+ ret = dma_iova_sync(dev, state, offset, map->dma_entry_size);
+ if (ret) {
+ dma_iova_unlink(dev, state, offset, map->dma_entry_size,
+ DMA_BIDIRECTIONAL, attrs);
+ goto error;
+ }
+
+ dma_addr = state->addr + offset;
+ } else {
+ if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs))
+ goto error;
+
+ dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size,
+ DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(dev, dma_addr))
+ goto error;
+
+ if (dma_need_unmap(dev))
+ dma_addrs[idx] = dma_addr;
+ }
+ pfns[idx] |= HMM_PFN_DMA_MAPPED;
+ return dma_addr;
+error:
+ pfns[idx] &= ~HMM_PFN_P2PDMA;
+ return DMA_MAPPING_ERROR;
+
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_pfn);
+
+/**
+ * hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address
+ * @dev: Device to unmap the page from
+ * @map: HMM map
+ * @idx: Index of the PFN to unmap
+ *
+ * Returns true if the PFN was mapped and has been unmapped, false otherwise.
+ */
+bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx)
+{
+ const unsigned long valid_dma = HMM_PFN_VALID | HMM_PFN_DMA_MAPPED;
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ unsigned long attrs = 0;
+
+ if ((pfns[idx] & valid_dma) != valid_dma)
+ return false;
+
+ if (pfns[idx] & HMM_PFN_P2PDMA_BUS)
+ ; /* no need to unmap bus address P2P mappings */
+ else if (dma_use_iova(state)) {
+ if (pfns[idx] & HMM_PFN_P2PDMA)
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ dma_iova_unlink(dev, state, idx * map->dma_entry_size,
+ map->dma_entry_size, DMA_BIDIRECTIONAL, attrs);
+ } else if (dma_need_unmap(dev))
+ dma_unmap_page(dev, dma_addrs[idx], map->dma_entry_size,
+ DMA_BIDIRECTIONAL);
+
+ pfns[idx] &=
+ ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS);
+ return true;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn);
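Not part of the diff: the hmm_dma_* helpers added above are meant to be driven from a device driver's mirroring path. A condensed, hedged sketch of the expected call order, using only the functions introduced here; the range setup, locking/notifier retry and the device programming step are placeholders:

/* Hypothetical driver snippet: DMA-map one aligned, faulted user range. */
#include <linux/dma-mapping.h>
#include <linux/hmm.h>
#include <linux/hmm-dma.h>
#include <linux/pci-p2pdma.h>

static int demo_mirror_range(struct device *dev, struct hmm_range *range,
			     struct hmm_dma_map *map)
{
	struct pci_p2pdma_map_state p2pdma_state = {};
	size_t nr = (range->end - range->start) >> PAGE_SHIFT;
	size_t i;
	int ret;

	/* One PFN/dma slot per PAGE_SIZE of the range. */
	ret = hmm_dma_map_alloc(dev, map, nr, PAGE_SIZE);
	if (ret)
		return ret;

	range->hmm_pfns = map->pfn_list;
	ret = hmm_range_fault(range);	/* under mmap_read_lock + notifier retry */
	if (ret)
		goto out_free;

	for (i = 0; i < nr; i++) {
		dma_addr_t dma = hmm_dma_map_pfn(dev, map, i, &p2pdma_state);

		if (dma_mapping_error(dev, dma)) {
			ret = -EIO;
			goto out_unmap;
		}
		/* program 'dma' into the device page tables here */
	}
	return 0;

out_unmap:
	while (i--)
		hmm_dma_unmap_pfn(dev, map, i);
out_free:
	hmm_dma_map_free(dev, map);
	return ret;
}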
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ee335d96fc39..d3e66136e41a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -617,6 +617,8 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
+DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
+DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
#ifdef CONFIG_SHMEM
@@ -637,6 +639,8 @@ static struct attribute *anon_stats_attrs[] = {
#ifndef CONFIG_SHMEM
&zswpout_attr.attr,
&swpin_attr.attr,
+ &swpin_fallback_attr.attr,
+ &swpin_fallback_charge_attr.attr,
&swpout_attr.attr,
&swpout_fallback_attr.attr,
#endif
@@ -669,6 +673,8 @@ static struct attribute *any_stats_attrs[] = {
#ifdef CONFIG_SHMEM
&zswpout_attr.attr,
&swpin_attr.attr,
+ &swpin_fallback_attr.attr,
+ &swpin_fallback_charge_attr.attr,
&swpout_attr.attr,
&swpout_fallback_attr.attr,
#endif
@@ -1176,11 +1182,12 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
folio_throttle_swaprate(folio, gfp);
/*
- * When a folio is not zeroed during allocation (__GFP_ZERO not used),
- * folio_zero_user() is used to make sure that the page corresponding
- * to the faulting address will be hot in the cache after zeroing.
+ * When a folio is not zeroed during allocation (__GFP_ZERO not used)
+ * or user folios require special handling, folio_zero_user() is used to
+ * make sure that the page corresponding to the faulting address will be
+ * hot in the cache after zeroing.
*/
- if (!alloc_zeroed())
+ if (user_alloc_needs_zeroing())
folio_zero_user(folio, addr);
/*
* The memory barrier inside __folio_mark_uptodate makes sure that
@@ -1196,7 +1203,7 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
{
pmd_t entry;
- entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+ entry = folio_mk_pmd(folio, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
folio_add_lru_vma(folio, vma);
@@ -1302,10 +1309,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
struct folio *zero_folio)
{
pmd_t entry;
- if (!pmd_none(*pmd))
- return;
- entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
- entry = pmd_mkhuge(entry);
+ entry = folio_mk_pmd(zero_folio, vma->vm_page_prot);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
@@ -1368,20 +1372,20 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
return __do_huge_pmd_anonymous_page(vmf);
}
-static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
pgtable_t pgtable)
{
struct mm_struct *mm = vma->vm_mm;
pmd_t entry;
- spinlock_t *ptl;
- ptl = pmd_lock(mm, pmd);
+ lockdep_assert_held(pmd_lockptr(mm, pmd));
+
if (!pmd_none(*pmd)) {
if (write) {
if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
- goto out_unlock;
+ return -EEXIST;
}
entry = pmd_mkyoung(*pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1389,7 +1393,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
update_mmu_cache_pmd(vma, addr, pmd);
}
- goto out_unlock;
+ return -EEXIST;
}
entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
@@ -1405,16 +1409,11 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
if (pgtable) {
pgtable_trans_huge_deposit(mm, pmd, pgtable);
mm_inc_nr_ptes(mm);
- pgtable = NULL;
}
set_pmd_at(mm, addr, pmd, entry);
update_mmu_cache_pmd(vma, addr, pmd);
-
-out_unlock:
- spin_unlock(ptl);
- if (pgtable)
- pte_free(mm, pgtable);
+ return 0;
}
/**
@@ -1433,6 +1432,8 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
pgtable_t pgtable = NULL;
+ spinlock_t *ptl;
+ int error;
/*
* If we had pmd_special, we could avoid all these restrictions,
@@ -1454,13 +1455,58 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
return VM_FAULT_OOM;
}
- track_pfn_insert(vma, &pgprot, pfn);
+ pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot);
+
+ ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ error = insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write,
+ pgtable);
+ spin_unlock(ptl);
+ if (error && pgtable)
+ pte_free(vma->vm_mm, pgtable);
- insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
+vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
+ bool write)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long addr = vmf->address & PMD_MASK;
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+ pgtable_t pgtable = NULL;
+ int error;
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
+ return VM_FAULT_SIGBUS;
+
+ if (arch_needs_pgtable_deposit()) {
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (!pgtable)
+ return VM_FAULT_OOM;
+ }
+
+ ptl = pmd_lock(mm, vmf->pmd);
+ if (pmd_none(*vmf->pmd)) {
+ folio_get(folio);
+ folio_add_file_rmap_pmd(folio, &folio->page, vma);
+ add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
+ }
+ error = insert_pfn_pmd(vma, addr, vmf->pmd,
+ pfn_to_pfn_t(folio_pfn(folio)), vma->vm_page_prot,
+ write, pgtable);
+ spin_unlock(ptl);
+ if (error && pgtable)
+ pte_free(mm, pgtable);
+
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);
+
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
@@ -1475,19 +1521,17 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
struct mm_struct *mm = vma->vm_mm;
pgprot_t prot = vma->vm_page_prot;
pud_t entry;
- spinlock_t *ptl;
- ptl = pud_lock(mm, pud);
if (!pud_none(*pud)) {
if (write) {
if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn)))
- goto out_unlock;
+ return;
entry = pud_mkyoung(*pud);
entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
if (pudp_set_access_flags(vma, addr, pud, entry, 1))
update_mmu_cache_pud(vma, addr, pud);
}
- goto out_unlock;
+ return;
}
entry = pud_mkhuge(pfn_t_pud(pfn, prot));
@@ -1501,9 +1545,6 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
}
set_pud_at(mm, addr, pud, entry);
update_mmu_cache_pud(vma, addr, pud);
-
-out_unlock:
- spin_unlock(ptl);
}
/**
@@ -1521,6 +1562,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
unsigned long addr = vmf->address & PUD_MASK;
struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
+ spinlock_t *ptl;
/*
* If we had pud_special, we could avoid all these restrictions,
@@ -1536,12 +1578,59 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
- track_pfn_insert(vma, &pgprot, pfn);
+ pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot);
+ ptl = pud_lock(vma->vm_mm, vmf->pud);
insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
+ spin_unlock(ptl);
+
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
+
+/**
+ * vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry
+ * @vmf: Structure describing the fault
+ * @folio: folio to insert
+ * @write: whether it's a write fault
+ *
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
+ bool write)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long addr = vmf->address & PUD_MASK;
+ pud_t *pud = vmf->pud;
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
+ return VM_FAULT_SIGBUS;
+
+ ptl = pud_lock(mm, pud);
+
+ /*
+ * If there is already an entry present we assume the folio is
+ * already mapped, hence no need to take another reference. We
+ * still call insert_pfn_pud() though in case the mapping needs
+ * upgrading to writeable.
+ */
+ if (pud_none(*vmf->pud)) {
+ folio_get(folio);
+ folio_add_file_rmap_pud(folio, &folio->page, vma);
+ add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
+ }
+ insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)),
+ write);
+ spin_unlock(ptl);
+
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
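Not part of the diff: vmf_insert_folio_pmd() and vmf_insert_folio_pud() give huge-fault handlers a folio-based alternative to the pfn_t insert helpers. A hedged sketch of how a ->huge_fault implementation might dispatch on the fault order; the folio lookup is a placeholder for whatever mapping the driver or filesystem already holds:

/* Hypothetical ->huge_fault handler, not from this patch. */
#include <linux/mm.h>
#include <linux/huge_mm.h>

static struct folio *demo_lookup_pinned_folio(struct vm_fault *vmf,
					      unsigned int order);	/* placeholder */

static vm_fault_t demo_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	struct folio *folio = demo_lookup_pinned_folio(vmf, order);

	if (!folio)
		return VM_FAULT_FALLBACK;

	if (order == PMD_ORDER)
		return vmf_insert_folio_pmd(vmf, folio, write);
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
	if (order == PUD_ORDER)
		return vmf_insert_folio_pud(vmf, folio, write);
#endif
	return VM_FAULT_FALLBACK;
}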
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1691,13 +1780,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
src_folio = page_folio(src_page);
folio_get(src_folio);
- if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) {
+ if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) {
/* Page maybe pinned: split and retry the fault on PTEs. */
folio_put(src_folio);
pte_free(dst_mm, pgtable);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
- __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
+ __split_huge_pmd(src_vma, src_pmd, addr, false);
return -EAGAIN;
}
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -1919,7 +2008,7 @@ unlock_fallback:
folio_unlock(folio);
spin_unlock(vmf->ptl);
fallback:
- __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false);
return VM_FAULT_FALLBACK;
}
@@ -2002,7 +2091,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
spin_unlock(vmf->ptl);
writable = false;
- if (!migrate_misplaced_folio(folio, vma, target_nid)) {
+ if (!migrate_misplaced_folio(folio, target_nid)) {
flags |= TNF_MIGRATED;
nid = target_nid;
task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
@@ -2064,7 +2153,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* If other processes are mapping this folio, we couldn't discard
* the folio unless they all do MADV_FREE so let's skip the folio.
*/
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
goto out;
if (!folio_trylock(folio))
@@ -2134,12 +2223,13 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb->fullmm);
arch_check_zapped_pmd(vma, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- if (vma_is_special_huge(vma)) {
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
} else if (is_huge_zero_pmd(orig_pmd)) {
- zap_deposited_table(tlb->mm, pmd);
+ if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
+ zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
} else {
struct folio *folio = NULL;
@@ -2170,6 +2260,14 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, mm_counter_file(folio),
-HPAGE_PMD_NR);
+
+ /*
+ * Use flush_needed to indicate whether the PMD entry
+ * is present, instead of checking pmd_present() again.
+ */
+ if (flush_needed && pmd_young(orig_pmd) &&
+ likely(vma_has_recency(vma)))
+ folio_mark_accessed(folio);
}
spin_unlock(ptl);
@@ -2205,6 +2303,16 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd)
return pmd;
}
+static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
+{
+ if (pmd_present(pmd))
+ pmd = pmd_clear_uffd_wp(pmd);
+ else if (is_swap_pmd(pmd))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
+
+ return pmd;
+}
+
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
@@ -2243,6 +2351,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
}
pmd = move_soft_dirty_pmd(pmd);
+ if (vma_has_uffd_without_event_remap(vma))
+ pmd = clear_uffd_wp_pmd(pmd);
set_pmd_at(mm, new_addr, new_pmd, pmd);
if (force_flush)
flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
@@ -2551,12 +2661,12 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
folio_move_anon_rmap(src_folio, dst_vma);
src_folio->index = linear_page_index(dst_vma, dst_addr);
- _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
+ _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
/* Follow mremap() behavior and treat the entry dirty after the move */
_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
} else {
src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
- _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
+ _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
}
set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
@@ -2627,12 +2737,24 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
arch_check_zapped_pud(vma, orig_pud);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
- if (vma_is_special_huge(vma)) {
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
spin_unlock(ptl);
/* No zero page support yet */
} else {
- /* No support for anonymous PUD pages yet */
- BUG();
+ struct page *page = NULL;
+ struct folio *folio;
+
+ /* No support for anonymous PUD pages or migration yet */
+ VM_WARN_ON_ONCE(vma_is_anonymous(vma) ||
+ !pud_present(orig_pud));
+
+ page = pud_page(orig_pud);
+ folio = page_folio(page);
+ folio_remove_rmap_pud(folio, page, vma);
+ add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);
+
+ spin_unlock(ptl);
+ tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
}
return 1;
}
@@ -2640,6 +2762,10 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
unsigned long haddr)
{
+ struct folio *folio;
+ struct page *page;
+ pud_t old_pud;
+
VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
@@ -2647,7 +2773,22 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
count_vm_event(THP_SPLIT_PUD);
- pudp_huge_clear_flush(vma, haddr, pud);
+ old_pud = pudp_huge_clear_flush(vma, haddr, pud);
+
+ if (!vma_is_dax(vma))
+ return;
+
+ page = pud_page(old_pud);
+ folio = page_folio(page);
+
+ if (!folio_test_dirty(folio) && pud_dirty(old_pud))
+ folio_mark_dirty(folio);
+ if (!folio_test_referenced(folio) && pud_young(old_pud))
+ folio_set_referenced(folio);
+ folio_remove_rmap_pud(folio, page, vma);
+ folio_put(folio);
+ add_mm_counter(vma->vm_mm, mm_counter_file(folio),
+ -HPAGE_PUD_NR);
}
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
@@ -2747,13 +2888,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (arch_needs_pgtable_deposit())
zap_deposited_table(mm, pmd);
- if (vma_is_special_huge(vma))
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma))
return;
if (unlikely(is_pmd_migration_entry(old_pmd))) {
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
folio = pfn_swap_entry_folio(entry);
+ } else if (is_huge_zero_pmd(old_pmd)) {
+ return;
} else {
page = pmd_page(old_pmd);
folio = page_folio(page);
@@ -2938,28 +3081,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, bool freeze, struct folio *folio)
+ pmd_t *pmd, bool freeze)
{
- VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
- VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
- VM_BUG_ON(freeze && !folio);
-
- /*
- * When the caller requests to set up a migration entry, we
- * require a folio to check the PMD against. Otherwise, there
- * is a risk of replacing the wrong folio.
- */
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
- is_pmd_migration_entry(*pmd)) {
- if (folio && folio != pmd_folio(*pmd))
- return;
+ is_pmd_migration_entry(*pmd))
__split_huge_pmd_locked(vma, pmd, address, freeze);
- }
}
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long address, bool freeze, struct folio *folio)
+ unsigned long address, bool freeze)
{
spinlock_t *ptl;
struct mmu_notifier_range range;
@@ -2969,20 +3100,20 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pmd_lock(vma->vm_mm, pmd);
- split_huge_pmd_locked(vma, range.start, pmd, freeze, folio);
+ split_huge_pmd_locked(vma, range.start, pmd, freeze);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
}
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
- bool freeze, struct folio *folio)
+ bool freeze)
{
pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
if (!pmd)
return;
- __split_huge_pmd(vma, pmd, address, freeze, folio);
+ __split_huge_pmd(vma, pmd, address, freeze);
}
static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
@@ -2994,13 +3125,13 @@ static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned
if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
ALIGN(address, HPAGE_PMD_SIZE)))
- split_huge_pmd_address(vma, address, false, NULL);
+ split_huge_pmd_address(vma, address, false);
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- long adjust_next)
+ unsigned long start,
+ unsigned long end,
+ struct vm_area_struct *next)
{
/* Check if we need to split start first. */
split_huge_pmd_if_needed(vma, start);
@@ -3008,16 +3139,9 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
/* Check if we need to split end next. */
split_huge_pmd_if_needed(vma, end);
- /*
- * If we're also updating the next vma vm_start,
- * check if we need to split it.
- */
- if (adjust_next > 0) {
- struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
- unsigned long nstart = next->vm_start;
- nstart += adjust_next;
- split_huge_pmd_if_needed(next, nstart);
- }
+ /* If we're incrementing next->vm_start, we might need to split it. */
+ if (next)
+ split_huge_pmd_if_needed(next, end);
}
static void unmap_folio(struct folio *folio)
@@ -3051,8 +3175,12 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
int ref_count, map_count;
pmd_t orig_pmd = *pmdp;
- if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
+ if (pmd_dirty(orig_pmd))
+ folio_set_dirty(folio);
+ if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
+ folio_set_swapbacked(folio);
return false;
+ }
orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
@@ -3079,8 +3207,15 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
*
* The only folio refs must be one from isolation plus the rmap(s).
*/
- if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
- ref_count != map_count + 1) {
+ if (pmd_dirty(orig_pmd))
+ folio_set_dirty(folio);
+ if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
+ folio_set_swapbacked(folio);
+ set_pmd_at(mm, addr, pmdp, orig_pmd);
+ return false;
+ }
+
+ if (ref_count != map_count + 1) {
set_pmd_at(mm, addr, pmdp, orig_pmd);
return false;
}
@@ -3100,12 +3235,11 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
{
VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+ VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
- if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
- return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
-
- return false;
+ return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
}
static void remap_page(struct folio *folio, unsigned long nr, int flags)
@@ -3124,225 +3258,378 @@ static void remap_page(struct folio *folio, unsigned long nr, int flags)
}
}
-static void lru_add_page_tail(struct folio *folio, struct page *tail,
+static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
struct lruvec *lruvec, struct list_head *list)
{
- VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
- VM_BUG_ON_FOLIO(PageLRU(tail), folio);
+ VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
lockdep_assert_held(&lruvec->lru_lock);
if (list) {
/* page reclaim is reclaiming a huge page */
VM_WARN_ON(folio_test_lru(folio));
- get_page(tail);
- list_add_tail(&tail->lru, list);
+ folio_get(new_folio);
+ list_add_tail(&new_folio->lru, list);
} else {
/* head is still on lru (and we have it frozen) */
VM_WARN_ON(!folio_test_lru(folio));
if (folio_test_unevictable(folio))
- tail->mlock_count = 0;
+ new_folio->mlock_count = 0;
else
- list_add_tail(&tail->lru, &folio->lru);
- SetPageLRU(tail);
+ list_add_tail(&new_folio->lru, &folio->lru);
+ folio_set_lru(new_folio);
}
}
-static void __split_huge_page_tail(struct folio *folio, int tail,
- struct lruvec *lruvec, struct list_head *list,
- unsigned int new_order)
+/* Racy check whether the huge page can be split */
+bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
{
- struct page *head = &folio->page;
- struct page *page_tail = head + tail;
- /*
- * Careful: new_folio is not a "real" folio before we cleared PageTail.
- * Don't pass it around before clear_compound_head().
- */
- struct folio *new_folio = (struct folio *)page_tail;
+ int extra_pins;
- VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
+ /* Additional pins from page cache */
+ if (folio_test_anon(folio))
+ extra_pins = folio_test_swapcache(folio) ?
+ folio_nr_pages(folio) : 0;
+ else
+ extra_pins = folio_nr_pages(folio);
+ if (pextra_pins)
+ *pextra_pins = extra_pins;
+ return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
+ caller_pins;
+}
+
+/*
+ * Split @folio into folios of order @new_order and copy the @folio metadata
+ * to all the resulting folios.
+ */
+static void __split_folio_to_order(struct folio *folio, int old_order,
+ int new_order)
+{
+ long new_nr_pages = 1 << new_order;
+ long nr_pages = 1 << old_order;
+ long i;
/*
- * Clone page flags before unfreezing refcount.
- *
- * After successful get_page_unless_zero() might follow flags change,
- * for example lock_page() which set PG_waiters.
- *
- * Note that for mapped sub-pages of an anonymous THP,
- * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
- * the migration entry instead from where remap_page() will restore it.
- * We can still have PG_anon_exclusive set on effectively unmapped and
- * unreferenced sub-pages of an anonymous THP: we can simply drop
- * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
+	 * Skip the first new_nr_pages, since the new folio made from them is
+	 * the original folio and already has all the flags.
*/
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- page_tail->flags |= (head->flags &
- ((1L << PG_referenced) |
- (1L << PG_swapbacked) |
- (1L << PG_swapcache) |
- (1L << PG_mlocked) |
- (1L << PG_uptodate) |
- (1L << PG_active) |
- (1L << PG_workingset) |
- (1L << PG_locked) |
- (1L << PG_unevictable) |
+ for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) {
+ struct page *new_head = &folio->page + i;
+
+ /*
+ * Careful: new_folio is not a "real" folio before we cleared PageTail.
+ * Don't pass it around before clear_compound_head().
+ */
+ struct folio *new_folio = (struct folio *)new_head;
+
+ VM_BUG_ON_PAGE(atomic_read(&new_folio->_mapcount) != -1, new_head);
+
+ /*
+ * Clone page flags before unfreezing refcount.
+ *
+ * After successful get_page_unless_zero() might follow flags change,
+ * for example lock_page() which set PG_waiters.
+ *
+ * Note that for mapped sub-pages of an anonymous THP,
+ * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
+ * the migration entry instead from where remap_page() will restore it.
+ * We can still have PG_anon_exclusive set on effectively unmapped and
+ * unreferenced sub-pages of an anonymous THP: we can simply drop
+ * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
+ */
+ new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ new_folio->flags |= (folio->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_swapcache) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate) |
+ (1L << PG_active) |
+ (1L << PG_workingset) |
+ (1L << PG_locked) |
+ (1L << PG_unevictable) |
#ifdef CONFIG_ARCH_USES_PG_ARCH_2
- (1L << PG_arch_2) |
+ (1L << PG_arch_2) |
#endif
#ifdef CONFIG_ARCH_USES_PG_ARCH_3
- (1L << PG_arch_3) |
+ (1L << PG_arch_3) |
#endif
- (1L << PG_dirty) |
- LRU_GEN_MASK | LRU_REFS_MASK));
+ (1L << PG_dirty) |
+ LRU_GEN_MASK | LRU_REFS_MASK));
- /* ->mapping in first and second tail page is replaced by other uses */
- VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
- page_tail);
- new_folio->mapping = folio->mapping;
- new_folio->index = folio->index + tail;
+ new_folio->mapping = folio->mapping;
+ new_folio->index = folio->index + i;
- /*
- * page->private should not be set in tail pages. Fix up and warn once
- * if private is unexpectedly set.
- */
- if (unlikely(page_tail->private)) {
- VM_WARN_ON_ONCE_PAGE(true, page_tail);
- page_tail->private = 0;
- }
- if (folio_test_swapcache(folio))
- new_folio->swap.val = folio->swap.val + tail;
+ /*
+ * page->private should not be set in tail pages. Fix up and warn once
+ * if private is unexpectedly set.
+ */
+ if (unlikely(new_folio->private)) {
+ VM_WARN_ON_ONCE_PAGE(true, new_head);
+ new_folio->private = NULL;
+ }
- /* Page flags must be visible before we make the page non-compound. */
- smp_wmb();
+ if (folio_test_swapcache(folio))
+ new_folio->swap.val = folio->swap.val + i;
- /*
- * Clear PageTail before unfreezing page refcount.
- *
- * After successful get_page_unless_zero() might follow put_page()
- * which needs correct compound_head().
- */
- clear_compound_head(page_tail);
- if (new_order) {
- prep_compound_page(page_tail, new_order);
- folio_set_large_rmappable(new_folio);
- }
+ /* Page flags must be visible before we make the page non-compound. */
+ smp_wmb();
- /* Finally unfreeze refcount. Additional reference from page cache. */
- page_ref_unfreeze(page_tail,
- 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ?
- folio_nr_pages(new_folio) : 0));
+ /*
+ * Clear PageTail before unfreezing page refcount.
+ *
+ * After successful get_page_unless_zero() might follow put_page()
+ * which needs correct compound_head().
+ */
+ clear_compound_head(new_head);
+ if (new_order) {
+ prep_compound_page(new_head, new_order);
+ folio_set_large_rmappable(new_folio);
+ }
- if (folio_test_young(folio))
- folio_set_young(new_folio);
- if (folio_test_idle(folio))
- folio_set_idle(new_folio);
+ if (folio_test_young(folio))
+ folio_set_young(new_folio);
+ if (folio_test_idle(folio))
+ folio_set_idle(new_folio);
+#ifdef CONFIG_MEMCG
+ new_folio->memcg_data = folio->memcg_data;
+#endif
- folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
+ folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
+ }
- /*
- * always add to the tail because some iterators expect new
- * pages to show after the currently processed elements - e.g.
- * migrate_pages
- */
- lru_add_page_tail(folio, page_tail, lruvec, list);
+ if (new_order)
+ folio_set_order(folio, new_order);
+ else
+ ClearPageCompound(&folio->page);
}
-static void __split_huge_page(struct page *page, struct list_head *list,
- pgoff_t end, unsigned int new_order)
+/*
+ * It splits an unmapped @folio to lower order smaller folios in two ways.
+ * @folio: the to-be-split folio
+ * @new_order: the smallest order of the after split folios (since buddy
+ * allocator like split generates folios with orders from @folio's
+ * order - 1 to new_order).
+ * @split_at: in buddy allocator like split, the folio containing @split_at
+ * will be split until its order becomes @new_order.
+ * @lock_at: the folio containing @lock_at is left locked for caller.
+ * @list: the after split folios will be added to @list if it is not NULL,
+ * otherwise to LRU lists.
+ * @end: the end of the file @folio maps to. -1 if @folio is anonymous memory.
+ * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller
+ * @mapping: @folio->mapping
+ * @uniform_split: if the split is uniform or not (buddy allocator like split)
+ *
+ * 1. uniform split: the given @folio is split into multiple @new_order small
+ *    folios, where all small folios have the same order. This is done when
+ *    uniform_split is true.
+ * 2. buddy allocator like (non-uniform) split: the given @folio is split into
+ *    half and the half containing @split_at is split into half again until the
+ *    folio containing @split_at reaches @new_order. This is done when
+ *    uniform_split is false.
+ *
+ * The high level flow for these two methods are:
+ * 1. uniform split: a single __split_folio_to_order() is called to split the
+ * @folio into @new_order, then we traverse all the resulting folios one by
+ * one in PFN ascending order and perform stats, unfreeze, adding to list,
+ * and file mapping index operations.
+ * 2. non-uniform split: in general, folio_order - @new_order calls to
+ *    __split_folio_to_order() are made in a for loop to split the @folio
+ *    to one lower order at a time. The resulting small folios are processed
+ *    like in the traversal in 1, except the one containing @split_at, which
+ *    is split again in the next iteration.
+ *
+ * After splitting, the caller's folio reference will be transferred to the
+ * folio containing @split_at. The other folios may be freed if they are not
+ * mapped.
+ *
+ * In terms of locking, after splitting,
+ * 1. uniform split leaves @lock_at (or the folio containing it) locked;
+ * 2. buddy allocator like (non-uniform) split leaves @folio locked.
+ *
+ * For !uniform_split, when -ENOMEM is returned, the original folio might be
+ * split. The caller needs to check the input folio.
+ */
+static int __split_unmapped_folio(struct folio *folio, int new_order,
+ struct page *split_at, struct page *lock_at,
+ struct list_head *list, pgoff_t end,
+ struct xa_state *xas, struct address_space *mapping,
+ bool uniform_split)
{
- struct folio *folio = page_folio(page);
- struct page *head = &folio->page;
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
- unsigned long offset = 0;
- int i, nr_dropped = 0;
- unsigned int new_nr = 1 << new_order;
+ struct folio *origin_folio = folio;
+ struct folio *next_folio = folio_next(folio);
+ struct folio *new_folio;
+ struct folio *next;
int order = folio_order(folio);
- unsigned int nr = 1 << order;
+ int split_order;
+ int start_order = uniform_split ? new_order : order - 1;
+ int nr_dropped = 0;
+ int ret = 0;
+ bool stop_split = false;
- /* complete memcg works before add pages to LRU */
- split_page_memcg(head, order, new_order);
+ if (folio_test_swapcache(folio)) {
+ VM_BUG_ON(mapping);
+
+ /* a swapcache folio can only be uniformly split to order-0 */
+ if (!uniform_split || new_order != 0)
+ return -EINVAL;
- if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
- offset = swap_cache_index(folio->swap);
swap_cache = swap_address_space(folio->swap);
xa_lock(&swap_cache->i_pages);
}
+ if (folio_test_anon(folio))
+ mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
+
/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
lruvec = folio_lruvec_lock(folio);
- ClearPageHasHWPoisoned(head);
-
- for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
- struct folio *tail;
- __split_huge_page_tail(folio, i, lruvec, list, new_order);
- tail = page_folio(head + i);
- /* Some pages can be beyond EOF: drop them from page cache */
- if (tail->index >= end) {
- if (shmem_mapping(folio->mapping))
- nr_dropped++;
- else if (folio_test_clear_dirty(tail))
- folio_account_cleaned(tail,
- inode_to_wb(folio->mapping->host));
- __filemap_remove_folio(tail, NULL);
- folio_put(tail);
- } else if (!folio_test_anon(folio)) {
- __xa_store(&folio->mapping->i_pages, tail->index,
- tail, 0);
- } else if (swap_cache) {
- __xa_store(&swap_cache->i_pages, offset + i,
- tail, 0);
+ folio_clear_has_hwpoisoned(folio);
+
+ /*
+ * split to new_order one order at a time. For uniform split,
+ * folio is split to new_order directly.
+ */
+ for (split_order = start_order;
+ split_order >= new_order && !stop_split;
+ split_order--) {
+ int old_order = folio_order(folio);
+ struct folio *release;
+ struct folio *end_folio = folio_next(folio);
+
+ /* order-1 anonymous folio is not supported */
+ if (folio_test_anon(folio) && split_order == 1)
+ continue;
+ if (uniform_split && split_order != new_order)
+ continue;
+
+ if (mapping) {
+ /*
+ * uniform split has xas_split_alloc() called before
+ * irq is disabled to allocate enough memory, whereas
+ * non-uniform split can handle ENOMEM.
+ */
+ if (uniform_split)
+ xas_split(xas, folio, old_order);
+ else {
+ xas_set_order(xas, folio->index, split_order);
+ xas_try_split(xas, folio, old_order);
+ if (xas_error(xas)) {
+ ret = xas_error(xas);
+ stop_split = true;
+ goto after_split;
+ }
+ }
}
- }
- if (!new_order)
- ClearPageCompound(head);
- else {
- struct folio *new_folio = (struct folio *)head;
+ folio_split_memcg_refs(folio, old_order, split_order);
+ split_page_owner(&folio->page, old_order, split_order);
+ pgalloc_tag_split(folio, old_order, split_order);
- folio_set_order(new_folio, new_order);
- }
- unlock_page_lruvec(lruvec);
- /* Caller disabled irqs, so they are still disabled here */
+ __split_folio_to_order(folio, old_order, split_order);
- split_page_owner(head, order, new_order);
- pgalloc_tag_split(folio, order, new_order);
+after_split:
+ /*
+ * Iterate through after-split folios and perform related
+ * operations. But in buddy allocator like split, the folio
+ * containing the specified page is skipped until its order
+ * is new_order, since the folio will be worked on in next
+ * iteration.
+ */
+ for (release = folio; release != end_folio; release = next) {
+ next = folio_next(release);
+ /*
+ * for buddy allocator like split, the folio containing
+ * page will be split next and should not be released,
+ * until the folio's order is new_order or stop_split
+ * is set to true by the above xas_split() failure.
+ */
+ if (release == page_folio(split_at)) {
+ folio = release;
+ if (split_order != new_order && !stop_split)
+ continue;
+ }
+ if (folio_test_anon(release)) {
+ mod_mthp_stat(folio_order(release),
+ MTHP_STAT_NR_ANON, 1);
+ }
- /* See comment in __split_huge_page_tail() */
- if (folio_test_anon(folio)) {
- /* Additional pin to swap cache */
- if (folio_test_swapcache(folio)) {
- folio_ref_add(folio, 1 + new_nr);
- xa_unlock(&swap_cache->i_pages);
- } else {
- folio_ref_inc(folio);
+ /*
+			 * origin_folio should be kept frozen until page cache
+ * entries are updated with all the other after-split
+ * folios to prevent others seeing stale page cache
+ * entries.
+ */
+ if (release == origin_folio)
+ continue;
+
+ folio_ref_unfreeze(release, 1 +
+ ((mapping || swap_cache) ?
+ folio_nr_pages(release) : 0));
+
+ lru_add_split_folio(origin_folio, release, lruvec,
+ list);
+
+ /* Some pages can be beyond EOF: drop them from cache */
+ if (release->index >= end) {
+ if (shmem_mapping(mapping))
+ nr_dropped += folio_nr_pages(release);
+ else if (folio_test_clear_dirty(release))
+ folio_account_cleaned(release,
+ inode_to_wb(mapping->host));
+ __filemap_remove_folio(release, NULL);
+ folio_put_refs(release, folio_nr_pages(release));
+ } else if (mapping) {
+ __xa_store(&mapping->i_pages,
+ release->index, release, 0);
+ } else if (swap_cache) {
+ __xa_store(&swap_cache->i_pages,
+ swap_cache_index(release->swap),
+ release, 0);
+ }
}
- } else {
- /* Additional pin to page cache */
- folio_ref_add(folio, 1 + new_nr);
- xa_unlock(&folio->mapping->i_pages);
}
+
+ /*
+ * Unfreeze origin_folio only after all page cache entries, which used
+ * to point to it, have been updated with new folios. Otherwise,
+ * a parallel folio_try_get() can grab origin_folio and its caller can
+ * see stale page cache entries.
+ */
+ folio_ref_unfreeze(origin_folio, 1 +
+ ((mapping || swap_cache) ? folio_nr_pages(origin_folio) : 0));
+
+ unlock_page_lruvec(lruvec);
+
+ if (swap_cache)
+ xa_unlock(&swap_cache->i_pages);
+ if (mapping)
+ xa_unlock(&mapping->i_pages);
+
+ /* Caller disabled irqs, so they are still disabled here */
local_irq_enable();
if (nr_dropped)
- shmem_uncharge(folio->mapping->host, nr_dropped);
- remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0);
+ shmem_uncharge(mapping->host, nr_dropped);
+
+ remap_page(origin_folio, 1 << order,
+ folio_test_anon(origin_folio) ?
+ RMP_USE_SHARED_ZEROPAGE : 0);
/*
- * set page to its compound_head when split to non order-0 pages, so
- * we can skip unlocking it below, since PG_locked is transferred to
- * the compound_head of the page and the caller will unlock it.
+ * At this point, folio should contain the specified page.
+ * For uniform split, it is left for caller to unlock.
+ * For buddy allocator like split, the first after-split folio is left
+ * for caller to unlock.
*/
- if (new_order)
- page = compound_head(page);
-
- for (i = 0; i < nr; i += new_nr) {
- struct page *subpage = head + i;
- struct folio *new_folio = page_folio(subpage);
- if (subpage == page)
+ for (new_folio = origin_folio; new_folio != next_folio; new_folio = next) {
+ next = folio_next(new_folio);
+ if (new_folio == page_folio(lock_at))
continue;
- folio_unlock(new_folio);
+ folio_unlock(new_folio);
/*
* Subpages may be freed if there wasn't any mapping
* like if add_to_swap() is running on a lru page that
@@ -3350,81 +3637,90 @@ static void __split_huge_page(struct page *page, struct list_head *list,
* requires taking the lru_lock so we do the put_page
* of the tail pages after the split is complete.
*/
- free_page_and_swap_cache(subpage);
+ free_folio_and_swap_cache(new_folio);
}
+ return ret;
}
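
Within __split_unmapped_folio() above, every after-split folio is unfrozen to one base reference plus, when the folio sits in the page cache or swap cache, one reference per page for the cache entries that now point at it. A trivial standalone sketch of that arithmetic, with a hypothetical helper name:

/*
 * Illustrative sketch of the refcount each after-split folio is unfrozen
 * to: one base reference, plus one per page for page-cache or swap-cache
 * entries when the folio lives in such a cache.
 */
#include <stdio.h>

static long unfrozen_refs(long nr_pages, int in_cache)
{
	return 1 + (in_cache ? nr_pages : 0);
}

int main(void)
{
	printf("order-3 file folio : %ld refs\n", unfrozen_refs(8, 1));	/* 9 */
	printf("order-3 anon folio : %ld refs\n", unfrozen_refs(8, 0));	/* 1 */
	return 0;
}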
-/* Racy check whether the huge page can be split */
-bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
+bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
+ bool warns)
{
- int extra_pins;
+ if (folio_test_anon(folio)) {
+ /* order-1 is not supported for anonymous THP. */
+ VM_WARN_ONCE(warns && new_order == 1,
+ "Cannot split to order-1 folio");
+ return new_order != 1;
+ } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ !mapping_large_folio_support(folio->mapping)) {
+ /*
+ * No split if the file system does not support large folio.
+ * Note that we might still have THPs in such mappings due to
+ * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
+ * does not actually support large folios properly.
+ */
+ VM_WARN_ONCE(warns,
+ "Cannot split file folio to non-0 order");
+ return false;
+ }
- /* Additional pins from page cache */
- if (folio_test_anon(folio))
- extra_pins = folio_test_swapcache(folio) ?
- folio_nr_pages(folio) : 0;
- else
- extra_pins = folio_nr_pages(folio);
- if (pextra_pins)
- *pextra_pins = extra_pins;
- return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
- caller_pins;
+ /* Only swapping a whole PMD-mapped folio is supported */
+ if (folio_test_swapcache(folio)) {
+ VM_WARN_ONCE(warns,
+ "Cannot split swapcache folio to non-0 order");
+ return false;
+ }
+
+ return true;
+}
+
+/* See comments in non_uniform_split_supported() */
+bool uniform_split_supported(struct folio *folio, unsigned int new_order,
+ bool warns)
+{
+ if (folio_test_anon(folio)) {
+ VM_WARN_ONCE(warns && new_order == 1,
+ "Cannot split to order-1 folio");
+ return new_order != 1;
+ } else if (new_order) {
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ !mapping_large_folio_support(folio->mapping)) {
+ VM_WARN_ONCE(warns,
+ "Cannot split file folio to non-0 order");
+ return false;
+ }
+ }
+
+ if (new_order && folio_test_swapcache(folio)) {
+ VM_WARN_ONCE(warns,
+ "Cannot split swapcache folio to non-0 order");
+ return false;
+ }
+
+ return true;
}
/*
- * This function splits a large folio into smaller folios of order @new_order.
- * @page can point to any page of the large folio to split. The split operation
- * does not change the position of @page.
+ * __folio_split: split a folio at @split_at to a @new_order folio
+ * @folio: folio to split
+ * @new_order: the order of the new folio
+ * @split_at: a page within the new folio
+ * @lock_at: a page within @folio to be left locked to caller
+ * @list: after-split folios will be put on it if non NULL
+ * @uniform_split: perform uniform split or not (non-uniform split)
*
- * Prerequisites:
- *
- * 1) The caller must hold a reference on the @page's owning folio, also known
- * as the large folio.
+ * It calls __split_unmapped_folio() to perform uniform and non-uniform split.
+ * It is in charge of checking whether the split is supported or not and
+ * preparing @folio for __split_unmapped_folio().
*
- * 2) The large folio must be locked.
- *
- * 3) The folio must not be pinned. Any unexpected folio references, including
- * GUP pins, will result in the folio not getting split; instead, the caller
- * will receive an -EAGAIN.
- *
- * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
- * supported for non-file-backed folios, because folio->_deferred_list, which
- * is used by partially mapped folios, is stored in subpage 2, but an order-1
- * folio only has subpages 0 and 1. File-backed order-1 folios are supported,
- * since they do not use _deferred_list.
- *
- * After splitting, the caller's folio reference will be transferred to @page,
- * resulting in a raised refcount of @page after this call. The other pages may
- * be freed if they are not mapped.
- *
- * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
- *
- * Pages in @new_order will inherit the mapping, flags, and so on from the
- * huge page.
- *
- * Returns 0 if the huge page was split successfully.
- *
- * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
- * the folio was concurrently removed from the page cache.
- *
- * Returns -EBUSY when trying to split the huge zeropage, if the folio is
- * under writeback, if fs-specific folio metadata cannot currently be
- * released, or if some unexpected race happened (e.g., anon VMA disappeared,
- * truncation).
- *
- * Callers should ensure that the order respects the address space mapping
- * min-order if one is set for non-anonymous folios.
- *
- * Returns -EINVAL when trying to split to an order that is incompatible
- * with the folio. Splitting to order 0 is compatible with all folios.
+ * return: 0 on success, <0 on failure (if -ENOMEM is returned, @folio might
+ * have been split, but not to @new_order; the caller needs to check)
*/
-int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
- unsigned int new_order)
+static int __folio_split(struct folio *folio, unsigned int new_order,
+ struct page *split_at, struct page *lock_at,
+ struct list_head *list, bool uniform_split)
{
- struct folio *folio = page_folio(page);
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
- /* reset xarray order to new order after split */
- XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
+ XA_STATE(xas, &folio->mapping->i_pages, folio->index);
bool is_anon = folio_test_anon(folio);
struct address_space *mapping = NULL;
struct anon_vma *anon_vma = NULL;
@@ -3436,38 +3732,17 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ if (folio != page_folio(split_at) || folio != page_folio(lock_at))
+ return -EINVAL;
+
if (new_order >= folio_order(folio))
return -EINVAL;
- if (is_anon) {
- /* order-1 is not supported for anonymous THP. */
- if (new_order == 1) {
- VM_WARN_ONCE(1, "Cannot split to order-1 folio");
- return -EINVAL;
- }
- } else if (new_order) {
- /* Split shmem folio to non-zero order not supported */
- if (shmem_mapping(folio->mapping)) {
- VM_WARN_ONCE(1,
- "Cannot split shmem folio to non-0 order");
- return -EINVAL;
- }
- /*
- * No split if the file system does not support large folio.
- * Note that we might still have THPs in such mappings due to
- * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
- * does not actually support large folios properly.
- */
- if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
- !mapping_large_folio_support(folio->mapping)) {
- VM_WARN_ONCE(1,
- "Cannot split file folio to non-0 order");
- return -EINVAL;
- }
- }
+ if (uniform_split && !uniform_split_supported(folio, new_order, true))
+ return -EINVAL;
- /* Only swapping a whole PMD-mapped folio is supported */
- if (folio_test_swapcache(folio) && new_order)
+ if (!uniform_split &&
+ !non_uniform_split_supported(folio, new_order, true))
return -EINVAL;
is_hzp = is_huge_zero_folio(folio);
@@ -3503,6 +3778,11 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
mapping = folio->mapping;
/* Truncated ? */
+ /*
+ * TODO: add support for large shmem folio in swap cache.
+ * When shmem is in swap cache, mapping is NULL and
+ * folio_test_swapcache() is true.
+ */
if (!mapping) {
ret = -EBUSY;
goto out;
@@ -3524,21 +3804,24 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
goto out;
}
- xas_split_alloc(&xas, folio, folio_order(folio), gfp);
- if (xas_error(&xas)) {
- ret = xas_error(&xas);
- goto out;
+ if (uniform_split) {
+ xas_set_order(&xas, folio->index, new_order);
+ xas_split_alloc(&xas, folio, folio_order(folio), gfp);
+ if (xas_error(&xas)) {
+ ret = xas_error(&xas);
+ goto out;
+ }
}
anon_vma = NULL;
i_mmap_lock_read(mapping);
/*
- *__split_huge_page() may need to trim off pages beyond EOF:
- * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
- * which cannot be nested inside the page tree lock. So note
- * end now: i_size itself may be changed at any moment, but
- * folio lock is good enough to serialize the trimming.
+ *__split_unmapped_folio() may need to trim off pages beyond
+ * EOF: but on 32-bit, i_size_read() takes an irq-unsafe
+ * seqlock, which cannot be nested inside the page tree lock.
+ * So note end now: i_size itself may be changed at any moment,
+ * but folio lock is good enough to serialize the trimming.
*/
end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
if (shmem_mapping(mapping))
@@ -3576,7 +3859,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
if (folio_test_partially_mapped(folio)) {
- __folio_clear_partially_mapped(folio);
+ folio_clear_partially_mapped(folio);
mod_mthp_stat(folio_order(folio),
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
@@ -3592,7 +3875,6 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
if (mapping) {
int nr = folio_nr_pages(folio);
- xas_split(&xas, folio, folio_order(folio));
if (folio_test_pmd_mappable(folio) &&
new_order < HPAGE_PMD_ORDER) {
if (folio_test_swapbacked(folio)) {
@@ -3606,12 +3888,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
}
}
- if (is_anon) {
- mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
- mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order));
- }
- __split_huge_page(page, list, end, new_order);
- ret = 0;
+ ret = __split_unmapped_folio(folio, new_order,
+ split_at, lock_at, list, end, &xas, mapping,
+ uniform_split);
} else {
spin_unlock(&ds_queue->split_queue_lock);
fail:
@@ -3637,6 +3916,90 @@ out:
return ret;
}
+/*
+ * This function splits a large folio into smaller folios of order @new_order.
+ * @page can point to any page of the large folio to split. The split operation
+ * does not change the position of @page.
+ *
+ * Prerequisites:
+ *
+ * 1) The caller must hold a reference on the @page's owning folio, also known
+ * as the large folio.
+ *
+ * 2) The large folio must be locked.
+ *
+ * 3) The folio must not be pinned. Any unexpected folio references, including
+ * GUP pins, will result in the folio not getting split; instead, the caller
+ * will receive an -EAGAIN.
+ *
+ * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
+ * supported for non-file-backed folios, because folio->_deferred_list, which
+ * is used by partially mapped folios, is stored in subpage 2, but an order-1
+ * folio only has subpages 0 and 1. File-backed order-1 folios are supported,
+ * since they do not use _deferred_list.
+ *
+ * After splitting, the caller's folio reference will be transferred to @page,
+ * resulting in a raised refcount of @page after this call. The other pages may
+ * be freed if they are not mapped.
+ *
+ * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
+ *
+ * Pages in @new_order will inherit the mapping, flags, and so on from the
+ * huge page.
+ *
+ * Returns 0 if the huge page was split successfully.
+ *
+ * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
+ * the folio was concurrently removed from the page cache.
+ *
+ * Returns -EBUSY when trying to split the huge zeropage, if the folio is
+ * under writeback, if fs-specific folio metadata cannot currently be
+ * released, or if some unexpected race happened (e.g., anon VMA disappeared,
+ * truncation).
+ *
+ * Callers should ensure that the order respects the address space mapping
+ * min-order if one is set for non-anonymous folios.
+ *
+ * Returns -EINVAL when trying to split to an order that is incompatible
+ * with the folio. Splitting to order 0 is compatible with all folios.
+ */
+int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+ unsigned int new_order)
+{
+ struct folio *folio = page_folio(page);
+
+ return __folio_split(folio, new_order, &folio->page, page, list, true);
+}
+
+/*
+ * folio_split: split a folio at @split_at to a @new_order folio
+ * @folio: folio to split
+ * @new_order: the order of the new folio
+ * @split_at: a page within the new folio
+ *
+ * return: 0 on success, <0 on failure (if -ENOMEM is returned, @folio might
+ * have been split, but not to @new_order; the caller needs to check)
+ *
+ * It has the same prerequisites and returns as
+ * split_huge_page_to_list_to_order().
+ *
+ * Split a folio at @split_at to a new_order folio, leaving the
+ * remaining subpages of the original folio as large as possible. For example,
+ * when splitting an order-9 folio at its third order-3 subpage to
+ * an order-3 folio, there are 2^(9-3)=64 order-3 subpages in the order-9 folio.
+ * After the split, there will be a group of folios with different orders and
+ * the new folio containing @split_at is marked in bracket:
+ * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8].
+ *
+ * After split, folio is left locked for caller.
+ */
+int folio_split(struct folio *folio, unsigned int new_order,
+ struct page *split_at, struct list_head *list)
+{
+ return __folio_split(folio, new_order, split_at, &folio->page, list,
+ false);
+}
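
The worked example in the folio_split() comment can be reproduced with a little arithmetic: keep halving the chunk that contains the target page until it reaches @new_order. The standalone sketch below (not kernel code) prints each finished chunk as the loop peels it off; sorted by page offset, the result matches the list [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8] and still sums to 512 pages.

/*
 * Illustrative userspace sketch reproducing the folio_split() example:
 * halve the chunk containing the target page until it reaches new_order.
 * Chunks are printed in the order the loop finishes them, not by offset.
 */
#include <stdio.h>

int main(void)
{
	int old_order = 9, new_order = 3;
	long target = 2 * (1L << new_order);	/* first page of the 3rd order-3 chunk */
	long start = 0, total = 0;
	int order;

	for (order = old_order - 1; order >= new_order; order--) {
		long half = 1L << order;

		/* the half without the target page is finished at this order */
		if (target < start + half) {
			printf("pages %4ld-%4ld: order-%d\n",
			       start + half, start + 2 * half - 1, order);
		} else {
			printf("pages %4ld-%4ld: order-%d\n",
			       start, start + half - 1, order);
			start += half;
		}
		total += half;
	}
	/* the remaining chunk holds the target page at new_order */
	printf("pages %4ld-%4ld: order-%d (contains target)\n",
	       start, start + (1L << new_order) - 1, new_order);
	total += 1L << new_order;
	printf("total pages: %ld\n", total);	/* 512 == 1 << old_order */
	return 0;
}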
+
int min_order_for_split(struct folio *folio)
{
if (folio_test_anon(folio))
@@ -3688,7 +4051,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
if (!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
if (folio_test_partially_mapped(folio)) {
- __folio_clear_partially_mapped(folio);
+ folio_clear_partially_mapped(folio);
mod_mthp_stat(folio_order(folio),
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
@@ -3721,7 +4084,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
/*
* Exclude swapcache: originally to avoid a corrupt deferred split
- * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
+ * queue. Nowadays that is fully prevented by memcg1_swapout();
* but if page reclaim is already handling the same folio, it is
* unnecessary to handle it again in the shrinker, so excluding
* swapcache here may still be a useful optimization.
@@ -3732,7 +4095,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (partially_mapped) {
if (!folio_test_partially_mapped(folio)) {
- __folio_set_partially_mapped(folio);
+ folio_set_partially_mapped(folio);
if (folio_test_pmd_mappable(folio))
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
@@ -3825,7 +4188,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
} else {
/* We lost race with folio_put() */
if (folio_test_partially_mapped(folio)) {
- __folio_clear_partially_mapped(folio);
+ folio_clear_partially_mapped(folio);
mod_mthp_stat(folio_order(folio),
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
@@ -3956,7 +4319,8 @@ static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
}
static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
- unsigned long vaddr_end, unsigned int new_order)
+ unsigned long vaddr_end, unsigned int new_order,
+ long in_folio_offset)
{
int ret = 0;
struct task_struct *task;
@@ -4040,8 +4404,16 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
if (!folio_test_anon(folio) && folio->mapping != mapping)
goto unlock;
- if (!split_folio_to_order(folio, target_order))
- split++;
+ if (in_folio_offset < 0 ||
+ in_folio_offset >= folio_nr_pages(folio)) {
+ if (!split_folio_to_order(folio, target_order))
+ split++;
+ } else {
+ struct page *split_at = folio_page(folio,
+ in_folio_offset);
+ if (!folio_split(folio, target_order, split_at, NULL))
+ split++;
+ }
unlock:
@@ -4064,7 +4436,8 @@ out:
}
static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
- pgoff_t off_end, unsigned int new_order)
+ pgoff_t off_end, unsigned int new_order,
+ long in_folio_offset)
{
struct filename *file;
struct file *candidate;
@@ -4113,8 +4486,15 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
if (folio->mapping != mapping)
goto unlock;
- if (!split_folio_to_order(folio, target_order))
- split++;
+ if (in_folio_offset < 0 || in_folio_offset >= nr_pages) {
+ if (!split_folio_to_order(folio, target_order))
+ split++;
+ } else {
+ struct page *split_at = folio_page(folio,
+ in_folio_offset);
+ if (!folio_split(folio, target_order, split_at, NULL))
+ split++;
+ }
unlock:
folio_unlock(folio);
@@ -4147,6 +4527,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
int pid;
unsigned long vaddr_start, vaddr_end;
unsigned int new_order = 0;
+ long in_folio_offset = -1;
ret = mutex_lock_interruptible(&split_debug_mutex);
if (ret)
@@ -4162,42 +4543,46 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
if (input_buf[0] == '/') {
char *tok;
- char *buf = input_buf;
+ char *tok_buf = input_buf;
char file_path[MAX_INPUT_BUF_SZ];
pgoff_t off_start = 0, off_end = 0;
size_t input_len = strlen(input_buf);
- tok = strsep(&buf, ",");
- if (tok) {
+ tok = strsep(&tok_buf, ",");
+ if (tok && tok_buf) {
strscpy(file_path, tok);
} else {
ret = -EINVAL;
goto out;
}
- ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order);
- if (ret != 2 && ret != 3) {
+ ret = sscanf(tok_buf, "0x%lx,0x%lx,%d,%ld", &off_start, &off_end,
+ &new_order, &in_folio_offset);
+ if (ret != 2 && ret != 3 && ret != 4) {
ret = -EINVAL;
goto out;
}
- ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order);
+ ret = split_huge_pages_in_file(file_path, off_start, off_end,
+ new_order, in_folio_offset);
if (!ret)
ret = input_len;
goto out;
}
- ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order);
+ ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d,%ld", &pid, &vaddr_start,
+ &vaddr_end, &new_order, &in_folio_offset);
if (ret == 1 && pid == 1) {
split_huge_pages_all();
ret = strlen(input_buf);
goto out;
- } else if (ret != 3 && ret != 4) {
+ } else if (ret != 3 && ret != 4 && ret != 5) {
ret = -EINVAL;
goto out;
}
- ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order);
+ ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order,
+ in_folio_offset);
if (!ret)
ret = strlen(input_buf);
out:
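
The debugfs interface above now accepts an optional trailing in-folio offset (4th field for the file form, 5th for the pid form); when it is present and lies inside the folio, folio_split() is used instead of a uniform split. Below is a standalone sketch of how the extended sscanf() format degrades gracefully on old-style input; the sample strings are invented for illustration.

/*
 * Illustrative sketch of parsing the extended split_huge_pages input:
 * the same format string accepts the old input and the new optional
 * in-folio offset, which selects folio_split() instead of a uniform split.
 */
#include <stdio.h>

int main(void)
{
	const char *inputs[] = {
		"1234,0x7f0000000000,0x7f0000200000,0",	  /* old: uniform split to order 0 */
		"1234,0x7f0000000000,0x7f0000200000,3,2", /* new: split at in-folio offset 2 */
	};

	for (int i = 0; i < 2; i++) {
		int pid, new_order = 0;
		unsigned long start, end;
		long in_folio_offset = -1;
		int n = sscanf(inputs[i], "%d,0x%lx,0x%lx,%d,%ld",
			       &pid, &start, &end, &new_order, &in_folio_offset);

		printf("matched %d fields, new_order=%d, in_folio_offset=%ld -> %s\n",
		       n, new_order, in_folio_offset,
		       in_folio_offset < 0 ? "uniform split" : "folio_split()");
	}
	return 0;
}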
@@ -4286,7 +4671,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
entry = pmd_to_swp_entry(*pvmw->pmd);
folio_get(folio);
- pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
+ pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
if (is_writable_migration_entry(entry))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ea2ed8e301ef..9dc95eac558c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -14,9 +14,11 @@
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
+#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
+#include <linux/minmax.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
@@ -40,6 +42,7 @@
#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
+#include <asm/setup.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
@@ -48,18 +51,34 @@
#include <linux/page_owner.h>
#include "internal.h"
#include "hugetlb_vmemmap.h"
+#include "hugetlb_cma.h"
+#include <linux/page-isolation.h>
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
-#ifdef CONFIG_CMA
-static struct cma *hugetlb_cma[MAX_NUMNODES];
-static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
-#endif
-static unsigned long hugetlb_cma_size __initdata;
-
+__initdata nodemask_t hugetlb_bootmem_nodes;
__initdata struct list_head huge_boot_pages[MAX_NUMNODES];
+static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata;
+
+/*
+ * Due to ordering constraints across the init code for various
+ * architectures, hugetlb hstate cmdline parameters can't simply
+ * be early_param. early_param might call the setup function
+ * before valid hugetlb page sizes are determined, leading to
+ * incorrect rejection of valid hugepagesz= options.
+ *
+ * So, record the parameters early and consume them whenever the
+ * init code is ready for them, by calling hugetlb_parse_params().
+ */
+
+/* one (hugepagesz=,hugepages=) pair per hstate, one default_hugepagesz */
+#define HUGE_MAX_CMDLINE_ARGS (2 * HUGE_MAX_HSTATE + 1)
+struct hugetlb_cmdline {
+ char *val;
+ int (*setup)(char *val);
+};
/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
@@ -67,6 +86,21 @@ static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;
static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
+static unsigned long hugepage_allocation_threads __initdata;
+
+static char hstate_cmdline_buf[COMMAND_LINE_SIZE] __initdata;
+static int hstate_cmdline_index __initdata;
+static struct hugetlb_cmdline hugetlb_params[HUGE_MAX_CMDLINE_ARGS] __initdata;
+static int hugetlb_param_index __initdata;
+static __init int hugetlb_add_param(char *s, int (*setup)(char *val));
+static __init void hugetlb_parse_params(void);
+
+#define hugetlb_early_param(str, func) \
+static __init int func##args(char *s) \
+{ \
+ return hugetlb_add_param(s, func); \
+} \
+early_param(str, func##args)
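
hugetlb_early_param() above only records the parameter string and its handler at early_param() time; hugetlb_parse_params() runs the handlers later, once the architecture has reported its valid huge page sizes. A minimal userspace sketch of that record-now, parse-later pattern follows; all names in it are hypothetical stand-ins, not the kernel interfaces.

/*
 * Illustrative sketch of deferred command-line parsing: "early" time only
 * records {value, handler}; the handlers run later when it is safe.
 */
#include <stdio.h>

struct cmdline_param {
	const char *val;
	int (*setup)(const char *val);
};

#define MAX_PARAMS 8
static struct cmdline_param params[MAX_PARAMS];
static int nr_params;

static int add_param(const char *s, int (*setup)(const char *))
{
	if (nr_params >= MAX_PARAMS)
		return -1;
	params[nr_params].val = s;
	params[nr_params].setup = setup;
	nr_params++;
	return 0;
}

static void parse_params(void)
{
	for (int i = 0; i < nr_params; i++)
		params[i].setup(params[i].val);
}

/* stand-in for a hugepagesz= handler that is only safe to run late */
static int hugepagesz_setup(const char *val)
{
	printf("late setup for hugepagesz=%s\n", val);
	return 0;
}

int main(void)
{
	add_param("1G", hugepagesz_setup);	/* "early param" time: record only */
	add_param("2M", hugepagesz_setup);
	parse_params();				/* later: arch init is done, consume */
	return 0;
}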
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -87,17 +121,16 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end, bool take_locks);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
static void hugetlb_free_folio(struct folio *folio)
{
-#ifdef CONFIG_CMA
- int nid = folio_nid(folio);
-
- if (cma_free_folio(hugetlb_cma[nid], folio))
+ if (folio_test_hugetlb_cma(folio)) {
+ hugetlb_cma_free_folio(folio);
return;
-#endif
+ }
+
folio_put(folio);
}
@@ -1218,7 +1251,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
/*
* Reset and decrement one ref on hugepage private reservation.
* Called with mm->mmap_lock writer semaphore held.
- * This function should be only used by move_vma() and operate on
+ * This function should be only used by mremap and operate on
* same sized vma. It should never come here with last ref on the
* reservation.
*/
@@ -1246,69 +1279,6 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
hugetlb_dup_vma_private(vma);
}
-/* Returns true if the VMA has associated reserve pages */
-static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
-{
- if (vma->vm_flags & VM_NORESERVE) {
- /*
- * This address is already reserved by other process(chg == 0),
- * so, we should decrement reserved count. Without decrementing,
- * reserve count remains after releasing inode, because this
- * allocated page will go into page cache and is regarded as
- * coming from reserved pool in releasing step. Currently, we
- * don't have any other solution to deal with this situation
- * properly, so add work-around here.
- */
- if (vma->vm_flags & VM_MAYSHARE && chg == 0)
- return true;
- else
- return false;
- }
-
- /* Shared mappings always use reserves */
- if (vma->vm_flags & VM_MAYSHARE) {
- /*
- * We know VM_NORESERVE is not set. Therefore, there SHOULD
- * be a region map for all pages. The only situation where
- * there is no region map is if a hole was punched via
- * fallocate. In this case, there really are no reserves to
- * use. This situation is indicated if chg != 0.
- */
- if (chg)
- return false;
- else
- return true;
- }
-
- /*
- * Only the process that called mmap() has reserves for
- * private mappings.
- */
- if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
- /*
- * Like the shared case above, a hole punch or truncate
- * could have been performed on the private mapping.
- * Examine the value of chg to determine if reserves
- * actually exist or were previously consumed.
- * Very Subtle - The value of chg comes from a previous
- * call to vma_needs_reserves(). The reserve map for
- * private mappings has different (opposite) semantics
- * than that of shared mappings. vma_needs_reserves()
- * has already taken this difference in semantics into
- * account. Therefore, the meaning of chg is the same
- * as in the shared case above. Code could easily be
- * combined, but keeping it separate draws attention to
- * subtle differences.
- */
- if (chg)
- return false;
- else
- return true;
- }
-
- return false;
-}
-
static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
{
int nid = folio_nid(folio);
@@ -1336,6 +1306,9 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
if (folio_test_hwpoison(folio))
continue;
+ if (is_migrate_isolate_page(&folio->page))
+ continue;
+
list_move(&folio->lru, &h->hugepage_activelist);
folio_ref_unfreeze(folio, 1);
folio_clear_hugetlb_freed(folio);
@@ -1394,8 +1367,7 @@ static unsigned long available_huge_pages(struct hstate *h)
static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
struct vm_area_struct *vma,
- unsigned long address, int avoid_reserve,
- long chg)
+ unsigned long address, long gbl_chg)
{
struct folio *folio = NULL;
struct mempolicy *mpol;
@@ -1404,15 +1376,10 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
int nid;
/*
- * A child process with MAP_PRIVATE mappings created by their parent
- * have no page reserves. This check ensures that reservations are
- * not "stolen". The child may still get SIGKILLed
+ * gbl_chg==1 means the allocation requires a new page that was not
+	 * reserved before. Make sure there's at least one free page.
*/
- if (!vma_has_reserves(vma, chg) && !available_huge_pages(h))
- goto err;
-
- /* If reserves cannot be used, ensure enough pages are in the pool */
- if (avoid_reserve && !available_huge_pages(h))
+ if (gbl_chg && !available_huge_pages(h))
goto err;
gfp_mask = htlb_alloc_mask(h);
@@ -1430,11 +1397,6 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
nid, nodemask);
- if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) {
- folio_set_hugetlb_restore_reserve(folio);
- h->resv_huge_pages--;
- }
-
mpol_cond_put(mpol);
return folio;
@@ -1525,27 +1487,11 @@ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
retry:
- folio = NULL;
-#ifdef CONFIG_CMA
- {
- int node;
-
- if (hugetlb_cma[nid])
- folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask);
-
- if (!folio && !(gfp_mask & __GFP_THISNODE)) {
- for_each_node_mask(node, *nodemask) {
- if (node == nid || !hugetlb_cma[node])
- continue;
-
- folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask);
- if (folio)
- break;
- }
- }
- }
-#endif
+ folio = hugetlb_cma_alloc_folio(h, gfp_mask, nid, nodemask);
if (!folio) {
+ if (hugetlb_cma_exclusive_alloc())
+ return NULL;
+
folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask);
if (!folio)
return NULL;
@@ -1704,7 +1650,6 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
folio_ref_unfreeze(folio, 1);
- INIT_LIST_HEAD(&folio->_deferred_list);
hugetlb_free_folio(folio);
}
@@ -2006,7 +1951,6 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
int order = huge_page_order(h);
struct folio *folio;
bool alloc_try_hard = true;
- bool retry = true;
/*
* By default we always try hard to allocate the folio with
@@ -2021,22 +1965,8 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
-retry:
- folio = __folio_alloc(gfp_mask, order, nid, nmask);
- /* Ensure hugetlb folio won't have large_rmappable flag set. */
- if (folio)
- folio_clear_large_rmappable(folio);
- if (folio && !folio_ref_freeze(folio, 1)) {
- folio_put(folio);
- if (retry) { /* retry once */
- retry = false;
- goto retry;
- }
- /* WOW! twice in a row. */
- pr_warn("HugeTLB unexpected inflated folio ref count\n");
- folio = NULL;
- }
+ folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask);
/*
* If we did not specify __GFP_RETRY_MAYFAIL, but still got a
@@ -2205,6 +2135,8 @@ retry:
if (!folio_ref_count(folio)) {
struct hstate *h = folio_hstate(folio);
+ bool adjust_surplus = false;
+
if (!available_huge_pages(h))
goto out;
@@ -2227,7 +2159,9 @@ retry:
goto retry;
}
- remove_hugetlb_folio(h, folio, false);
+ if (h->surplus_huge_pages_node[folio_nid(folio)])
+ adjust_surplus = true;
+ remove_hugetlb_folio(h, folio, adjust_surplus);
h->max_huge_pages--;
spin_unlock_irq(&hugetlb_lock);
@@ -2247,7 +2181,7 @@ retry:
rc = hugetlb_vmemmap_restore_folio(h, folio);
if (rc) {
spin_lock_irq(&hugetlb_lock);
- add_hugetlb_folio(h, folio, false);
+ add_hugetlb_folio(h, folio, adjust_surplus);
h->max_huge_pages++;
goto out;
}
@@ -2311,12 +2245,21 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
goto out_unlock;
spin_unlock_irq(&hugetlb_lock);
- folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);
+ folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
if (!folio)
return NULL;
+ hugetlb_vmemmap_optimize_folio(h, folio);
+
spin_lock_irq(&hugetlb_lock);
/*
+ * nr_huge_pages needs to be adjusted within the same lock cycle
+ * as surplus_pages, otherwise it might confuse
+ * persistent_huge_pages() momentarily.
+ */
+ __prep_account_new_huge_page(h, folio_nid(folio));
+
+ /*
* We could have raced with the pool size change.
* Double check that and simply deallocate the new page
* if we would end up overcommiting the surpluses. Abuse
@@ -2462,8 +2405,13 @@ static int gather_surplus_pages(struct hstate *h, long delta)
long i;
long needed, allocated;
bool alloc_ok = true;
- int node;
- nodemask_t *mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
+ nodemask_t *mbind_nodemask, alloc_nodemask;
+
+ mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
+ if (mbind_nodemask)
+ nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed);
+ else
+ alloc_nodemask = cpuset_current_mems_allowed;
lockdep_assert_held(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
@@ -2479,14 +2427,13 @@ retry:
spin_unlock_irq(&hugetlb_lock);
for (i = 0; i < needed; i++) {
folio = NULL;
- for_each_node_mask(node, cpuset_current_mems_allowed) {
- if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) {
- folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
- node, NULL);
- if (folio)
- break;
- }
- }
+
+ /*
+ * It is okay to use NUMA_NO_NODE because we use numa_mem_id()
+ * down the road to pick the current node if that is the case.
+ */
+ folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
+ NUMA_NO_NODE, &alloc_nodemask);
if (!folio) {
alloc_ok = false;
break;
@@ -2840,20 +2787,24 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
/*
* alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
* the old one
- * @h: struct hstate old page belongs to
* @old_folio: Old folio to dissolve
* @list: List to isolate the page in case we need to
* Returns 0 on success, otherwise negated error.
*/
-static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
- struct folio *old_folio, struct list_head *list)
+static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio,
+ struct list_head *list)
{
- gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ gfp_t gfp_mask;
+ struct hstate *h;
int nid = folio_nid(old_folio);
struct folio *new_folio = NULL;
int ret = 0;
retry:
+ /*
+ * The old_folio might have been dissolved from under our feet, so make sure
+ * to carefully check the state under the lock.
+ */
spin_lock_irq(&hugetlb_lock);
if (!folio_test_hugetlb(old_folio)) {
/*
@@ -2868,7 +2819,7 @@ retry:
* Fail with -EBUSY if not possible.
*/
spin_unlock_irq(&hugetlb_lock);
- isolated = isolate_hugetlb(old_folio, list);
+ isolated = folio_isolate_hugetlb(old_folio, list);
ret = isolated ? 0 : -EBUSY;
spin_lock_irq(&hugetlb_lock);
goto free_new;
@@ -2882,8 +2833,10 @@ retry:
cond_resched();
goto retry;
} else {
+ h = folio_hstate(old_folio);
if (!new_folio) {
spin_unlock_irq(&hugetlb_lock);
+ gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid,
NULL, NULL);
if (!new_folio)
@@ -2925,105 +2878,154 @@ free_new:
return ret;
}
-int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
+int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list)
{
- struct hstate *h;
- struct folio *folio = page_folio(page);
int ret = -EBUSY;
- /*
- * The page might have been dissolved from under our feet, so make sure
- * to carefully check the state under the lock.
- * Return success when racing as if we dissolved the page ourselves.
- */
- spin_lock_irq(&hugetlb_lock);
- if (folio_test_hugetlb(folio)) {
- h = folio_hstate(folio);
- } else {
- spin_unlock_irq(&hugetlb_lock);
+	/* Avoid disturbing the normal path by needlessly taking hugetlb_lock */
+ if (!folio_test_hugetlb(folio))
return 0;
- }
- spin_unlock_irq(&hugetlb_lock);
/*
* Fence off gigantic pages as there is a cyclic dependency between
* alloc_contig_range and them. Return -ENOMEM as this has the effect
* of bailing out right away without further retrying.
*/
- if (hstate_is_gigantic(h))
+ if (folio_order(folio) > MAX_PAGE_ORDER)
return -ENOMEM;
- if (folio_ref_count(folio) && isolate_hugetlb(folio, list))
+ if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list))
ret = 0;
else if (!folio_ref_count(folio))
- ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
+ ret = alloc_and_dissolve_hugetlb_folio(folio, list);
return ret;
}
+/*
+ * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn
+ * range with new folios.
+ * @start_pfn: start pfn of the given pfn range
+ * @end_pfn: end pfn of the given pfn range
+ * Returns 0 on success, otherwise negated error.
+ */
+int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
+{
+ struct folio *folio;
+ int ret = 0;
+
+ LIST_HEAD(isolate_list);
+
+ while (start_pfn < end_pfn) {
+ folio = pfn_folio(start_pfn);
+
+		/* Avoid disturbing the normal path by needlessly taking hugetlb_lock */
+ if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) {
+ ret = alloc_and_dissolve_hugetlb_folio(folio, &isolate_list);
+ if (ret)
+ break;
+
+ putback_movable_pages(&isolate_list);
+ }
+ start_pfn++;
+ }
+
+ return ret;
+}
+
+void wait_for_freed_hugetlb_folios(void)
+{
+ if (llist_empty(&hpage_freelist))
+ return;
+
+ flush_work(&free_hpage_work);
+}
+
+typedef enum {
+ /*
+ * For either 0/1: we checked the per-vma resv map, and one resv
+ * count either can be reused (0), or an extra needed (1).
+ */
+ MAP_CHG_REUSE = 0,
+ MAP_CHG_NEEDED = 1,
+ /*
+	 * The per-vma resv count cannot be used, hence a new resv
+	 * count is enforced.
+	 *
+	 * NOTE: This is mostly identical to MAP_CHG_NEEDED, except
+	 * that currently vma_needs_reservation() has an unwanted side
+	 * effect of requiring either end() or commit() to complete the
+	 * transaction. Hence it needs to be differentiated from NEEDED.
+ */
+ MAP_CHG_ENFORCED = 2,
+} map_chg_state;
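
The three map_chg_state values steer the rest of alloc_hugetlb_folio(): MAP_CHG_REUSE skips the subpool and reservation-cgroup charges, MAP_CHG_NEEDED and MAP_CHG_ENFORCED take both, and only MAP_CHG_ENFORCED skips the vma_commit_reservation()/vma_end_reservation() bookkeeping. The standalone sketch below simply prints that decision table as a reading aid; it is not the kernel logic itself.

/*
 * Illustrative decision table for map_chg_state, derived from the
 * "if (map_chg)" and "if (map_chg != MAP_CHG_ENFORCED)" tests in
 * alloc_hugetlb_folio().
 */
#include <stdio.h>

typedef enum { MAP_CHG_REUSE = 0, MAP_CHG_NEEDED = 1, MAP_CHG_ENFORCED = 2 } map_chg_state;

int main(void)
{
	static const char *names[] = { "REUSE", "NEEDED", "ENFORCED" };

	printf("%-9s %-15s %-19s %s\n",
	       "state", "subpool charge", "rsvd cgroup charge", "vma commit/end");
	for (int s = MAP_CHG_REUSE; s <= MAP_CHG_ENFORCED; s++) {
		int charges = (s != MAP_CHG_REUSE);	   /* if (map_chg) ... */
		int commits = (s != MAP_CHG_ENFORCED);	   /* if (map_chg != MAP_CHG_ENFORCED) */

		printf("%-9s %-15s %-19s %s\n", names[s],
		       charges ? "yes" : "no",
		       charges ? "yes" : "no",
		       commits ? "yes" : "no");
	}
	return 0;
}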
+
+/*
+ * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW
+ * faults of hugetlb private mappings on top of a non-page-cache folio (in
+ * which case even if there's a private vma resv map it won't cover such
+ * allocation). New call sites should (probably) never set it to true!!
+ * When it's set, the allocation will bypass all vma level reservations.
+ */
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
- unsigned long addr, int avoid_reserve)
+ unsigned long addr, bool cow_from_owner)
{
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct folio *folio;
- long map_chg, map_commit, nr_pages = pages_per_huge_page(h);
- long gbl_chg;
- int memcg_charge_ret, ret, idx;
+ long retval, gbl_chg, gbl_reserve;
+ map_chg_state map_chg;
+ int ret, idx;
struct hugetlb_cgroup *h_cg = NULL;
- struct mem_cgroup *memcg;
- bool deferred_reserve;
gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
- memcg = get_mem_cgroup_from_current();
- memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages);
- if (memcg_charge_ret == -ENOMEM) {
- mem_cgroup_put(memcg);
- return ERR_PTR(-ENOMEM);
- }
-
idx = hstate_index(h);
- /*
- * Examine the region/reserve map to determine if the process
- * has a reservation for the page to be allocated. A return
- * code of zero indicates a reservation exists (no change).
- */
- map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
- if (map_chg < 0) {
- if (!memcg_charge_ret)
- mem_cgroup_cancel_charge(memcg, nr_pages);
- mem_cgroup_put(memcg);
- return ERR_PTR(-ENOMEM);
+
+ /* Whether we need a separate per-vma reservation? */
+ if (cow_from_owner) {
+ /*
+ * Special case! Since it's a CoW on top of a reserved
+ * page, the private resv map doesn't count. So it cannot
+ * consume the per-vma resv map even if it's reserved.
+ */
+ map_chg = MAP_CHG_ENFORCED;
+ } else {
+ /*
+ * Examine the region/reserve map to determine if the process
+ * has a reservation for the page to be allocated. A return
+ * code of zero indicates a reservation exists (no change).
+ */
+ retval = vma_needs_reservation(h, vma, addr);
+ if (retval < 0)
+ return ERR_PTR(-ENOMEM);
+ map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE;
}
/*
+ * Whether we need a separate global reservation?
+ *
* Processes that did not create the mapping will have no
* reserves as indicated by the region/reserve map. Check
* that the allocation will not exceed the subpool limit.
- * Allocations for MAP_NORESERVE mappings also need to be
- * checked against any subpool limit.
+ * Or if it can get one from the pool reservation directly.
*/
- if (map_chg || avoid_reserve) {
+ if (map_chg) {
gbl_chg = hugepage_subpool_get_pages(spool, 1);
if (gbl_chg < 0)
goto out_end_reservation;
-
+ } else {
/*
- * Even though there was no reservation in the region/reserve
- * map, there could be reservations associated with the
- * subpool that can be used. This would be indicated if the
- * return value of hugepage_subpool_get_pages() is zero.
- * However, if avoid_reserve is specified we still avoid even
- * the subpool reservations.
+ * If we have the vma reservation ready, no need for extra
+ * global reservation.
*/
- if (avoid_reserve)
- gbl_chg = 1;
+ gbl_chg = 0;
}
- /* If this allocation is not consuming a reservation, charge it now.
+ /*
+ * If this allocation is not consuming a per-vma reservation,
+ * charge the hugetlb cgroup now.
*/
- deferred_reserve = map_chg || avoid_reserve;
- if (deferred_reserve) {
+ if (map_chg) {
ret = hugetlb_cgroup_charge_cgroup_rsvd(
idx, pages_per_huge_page(h), &h_cg);
if (ret)
@@ -3040,27 +3042,32 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
* from the global free pool (global change). gbl_chg == 0 indicates
* a reservation exists for the allocation.
*/
- folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg);
+ folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg);
if (!folio) {
spin_unlock_irq(&hugetlb_lock);
folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr);
if (!folio)
goto out_uncharge_cgroup;
spin_lock_irq(&hugetlb_lock);
- if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
- folio_set_hugetlb_restore_reserve(folio);
- h->resv_huge_pages--;
- }
list_add(&folio->lru, &h->hugepage_activelist);
folio_ref_unfreeze(folio, 1);
/* Fall through */
}
+ /*
+ * Either dequeued or buddy-allocated folio needs to add special
+ * mark to the folio when it consumes a global reservation.
+ */
+ if (!gbl_chg) {
+ folio_set_hugetlb_restore_reserve(folio);
+ h->resv_huge_pages--;
+ }
+
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio);
/* If allocation is not consuming a reservation, also store the
* hugetlb_cgroup pointer on the page.
*/
- if (deferred_reserve) {
+ if (map_chg) {
hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
h_cg, folio);
}
@@ -3069,53 +3076,122 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
hugetlb_set_folio_subpool(folio, spool);
- map_commit = vma_commit_reservation(h, vma, addr);
- if (unlikely(map_chg > map_commit)) {
+ if (map_chg != MAP_CHG_ENFORCED) {
+ /* commit() is only needed if the map_chg is not enforced */
+ retval = vma_commit_reservation(h, vma, addr);
/*
+ * Check for possible race conditions. When it happens..
* The page was added to the reservation map between
* vma_needs_reservation and vma_commit_reservation.
* This indicates a race with hugetlb_reserve_pages.
* Adjust for the subpool count incremented above AND
- * in hugetlb_reserve_pages for the same page. Also,
+ * in hugetlb_reserve_pages for the same page. Also,
* the reservation count added in hugetlb_reserve_pages
* no longer applies.
*/
- long rsv_adjust;
+ if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) {
+ long rsv_adjust;
- rsv_adjust = hugepage_subpool_put_pages(spool, 1);
- hugetlb_acct_memory(h, -rsv_adjust);
- if (deferred_reserve) {
- spin_lock_irq(&hugetlb_lock);
- hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
- pages_per_huge_page(h), folio);
- spin_unlock_irq(&hugetlb_lock);
+ rsv_adjust = hugepage_subpool_put_pages(spool, 1);
+ hugetlb_acct_memory(h, -rsv_adjust);
+ if (map_chg) {
+ spin_lock_irq(&hugetlb_lock);
+ hugetlb_cgroup_uncharge_folio_rsvd(
+ hstate_index(h), pages_per_huge_page(h),
+ folio);
+ spin_unlock_irq(&hugetlb_lock);
+ }
}
}
- if (!memcg_charge_ret)
- mem_cgroup_commit_charge(folio, memcg);
+ ret = mem_cgroup_charge_hugetlb(folio, gfp);
+ /*
+ * Unconditionally increment NR_HUGETLB here. If it turns out that
+ * mem_cgroup_charge_hugetlb failed, then immediately free the page and
+ * decrement NR_HUGETLB.
+ */
lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h));
- mem_cgroup_put(memcg);
+
+ if (ret == -ENOMEM) {
+ free_huge_folio(folio);
+ return ERR_PTR(-ENOMEM);
+ }
return folio;
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
out_uncharge_cgroup_reservation:
- if (deferred_reserve)
+ if (map_chg)
hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
h_cg);
out_subpool_put:
- if (map_chg || avoid_reserve)
- hugepage_subpool_put_pages(spool, 1);
+ /*
+	 * Put the page back to the subpool only if the subpool's rsv_hpages
+	 * quota was consumed by hugepage_subpool_get_pages() above.
+ */
+ if (map_chg && !gbl_chg) {
+ gbl_reserve = hugepage_subpool_put_pages(spool, 1);
+ hugetlb_acct_memory(h, -gbl_reserve);
+ }
+
out_end_reservation:
- vma_end_reservation(h, vma, addr);
- if (!memcg_charge_ret)
- mem_cgroup_cancel_charge(memcg, nr_pages);
- mem_cgroup_put(memcg);
+ if (map_chg != MAP_CHG_ENFORCED)
+ vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
}
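For reference, the tri-state driving the reservation handling above can be summarized as follows. This is a sketch: MAP_CHG_* is defined earlier in hugetlb.c, outside this hunk, and the values shown are inferred from how map_chg is tested in the code, not copied from the header:

	enum {
		MAP_CHG_REUSE = 0,	/* a per-VMA reservation exists and is consumed */
		MAP_CHG_NEEDED = 1,	/* no per-VMA reservation; subpool/global charging needed */
		MAP_CHG_ENFORCED = 2,	/* cow_from_owner: bypass the resv map, always charge */
	};

Only MAP_CHG_REUSE skips the subpool and cgroup reservation charges, and the race fix-up after vma_commit_reservation() only applies to MAP_CHG_NEEDED.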
+static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact)
+{
+ struct huge_bootmem_page *m;
+ int listnode = nid;
+
+ if (hugetlb_early_cma(h))
+ m = hugetlb_cma_alloc_bootmem(h, &listnode, node_exact);
+ else {
+ if (node_exact)
+ m = memblock_alloc_exact_nid_raw(huge_page_size(h),
+ huge_page_size(h), 0,
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ else {
+ m = memblock_alloc_try_nid_raw(huge_page_size(h),
+ huge_page_size(h), 0,
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ /*
+ * For pre-HVO to work correctly, pages need to be on
+ * the list for the node they were actually allocated
+ * from. That node may be different in the case of
+ * fallback by memblock_alloc_try_nid_raw. So,
+ * extract the actual node first.
+ */
+ if (m)
+ listnode = early_pfn_to_nid(PHYS_PFN(virt_to_phys(m)));
+ }
+
+ if (m) {
+ m->flags = 0;
+ m->cma = NULL;
+ }
+ }
+
+ if (m) {
+ /*
+ * Use the beginning of the huge page to store the
+ * huge_bootmem_page struct (until gather_bootmem
+ * puts them into the mem_map).
+ *
+ * Put them into a private list first because mem_map
+ * is not up yet.
+ */
+ INIT_LIST_HEAD(&m->list);
+ list_add(&m->list, &huge_boot_pages[listnode]);
+ m->hstate = h;
+ }
+
+ return m;
+}
+
int alloc_bootmem_huge_page(struct hstate *h, int nid)
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
int __alloc_bootmem_huge_page(struct hstate *h, int nid)
@@ -3125,22 +3201,16 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
/* do node specific alloc */
if (nid != NUMA_NO_NODE) {
- m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
- 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ m = alloc_bootmem(h, node, true);
if (!m)
return 0;
goto found;
}
+
/* allocate from next node when distributing huge pages */
- for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_MEMORY]) {
- m = memblock_alloc_try_nid_raw(
- huge_page_size(h), huge_page_size(h),
- 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
- /*
- * Use the beginning of the huge page to store the
- * huge_bootmem_page struct (until gather_bootmem
- * puts them into the mem_map).
- */
+ for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node,
+ &hugetlb_bootmem_nodes) {
+ m = alloc_bootmem(h, node, false);
if (!m)
return 0;
goto found;
@@ -3157,10 +3227,7 @@ found:
*/
memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE),
huge_page_size(h) - PAGE_SIZE);
- /* Put them into a private list first because mem_map is not up yet */
- INIT_LIST_HEAD(&m->list);
- list_add(&m->list, &huge_boot_pages[node]);
- m->hstate = h;
+
return 1;
}
@@ -3178,7 +3245,6 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) {
struct page *page = pfn_to_page(pfn);
- __ClearPageReserved(folio_page(folio, pfn - head_pfn));
__init_single_page(page, pfn, zone, nid);
prep_compound_tail((struct page *)folio, pfn - head_pfn);
ret = page_ref_freeze(page, 1);
@@ -3202,6 +3268,42 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
prep_compound_head((struct page *)folio, huge_page_order(h));
}
+static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m)
+{
+ return m->flags & HUGE_BOOTMEM_HVO;
+}
+
+static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m)
+{
+ return m->flags & HUGE_BOOTMEM_CMA;
+}
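For orientation, the descriptor these helpers inspect is declared in include/linux/hugetlb.h and is not part of this hunk. A minimal sketch consistent with the fields used in this patch (list, hstate, flags, cma) would be:

	struct huge_bootmem_page {
		struct list_head list;	/* linked into huge_boot_pages[node] */
		struct hstate *hstate;
		unsigned long flags;	/* HUGE_BOOTMEM_HVO / _ZONES_VALID / _CMA */
		struct cma *cma;	/* set when reserved from an early CMA area */
	};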
+
+/*
+ * memblock-allocated pageblocks might not have the migrate type set
+ * if marked with the 'noinit' flag. Set it to the default (MIGRATE_MOVABLE)
+ * here, or MIGRATE_CMA if this was a page allocated through an early CMA
+ * reservation.
+ *
+ * In case of vmemmap optimized folios, the tail vmemmap pages are mapped
+ * read-only, but that's ok - for sparse vmemmap this does not write to
+ * the page structure.
+ */
+static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
+ struct hstate *h)
+{
+ unsigned long nr_pages = pages_per_huge_page(h), i;
+
+ WARN_ON_ONCE(!pageblock_aligned(folio_pfn(folio)));
+
+ for (i = 0; i < nr_pages; i += pageblock_nr_pages) {
+ if (folio_test_hugetlb_cma(folio))
+ init_cma_pageblock(folio_page(folio, i));
+ else
+ set_pageblock_migratetype(folio_page(folio, i),
+ MIGRATE_MOVABLE);
+ }
+}
+
static void __init prep_and_add_bootmem_folios(struct hstate *h,
struct list_head *folio_list)
{
@@ -3209,7 +3311,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
struct folio *folio, *tmp_f;
/* Send list for bulk vmemmap optimization processing */
- hugetlb_vmemmap_optimize_folios(h, folio_list);
+ hugetlb_vmemmap_optimize_bootmem_folios(h, folio_list);
list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
@@ -3223,6 +3325,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
HUGETLB_VMEMMAP_RESERVE_PAGES,
pages_per_huge_page(h));
}
+ hugetlb_bootmem_init_migratetype(folio, h);
/* Subdivide locks to achieve better parallel performance */
spin_lock_irqsave(&hugetlb_lock, flags);
__prep_account_new_huge_page(h, folio_nid(folio));
@@ -3231,6 +3334,57 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
}
}
+bool __init hugetlb_bootmem_page_zones_valid(int nid,
+ struct huge_bootmem_page *m)
+{
+ unsigned long start_pfn;
+ bool valid;
+
+ if (m->flags & HUGE_BOOTMEM_ZONES_VALID) {
+ /*
+ * Already validated, skip check.
+ */
+ return true;
+ }
+
+ if (hugetlb_bootmem_page_earlycma(m)) {
+ valid = cma_validate_zones(m->cma);
+ goto out;
+ }
+
+ start_pfn = virt_to_phys(m) >> PAGE_SHIFT;
+
+ valid = !pfn_range_intersects_zones(nid, start_pfn,
+ pages_per_huge_page(m->hstate));
+out:
+ if (!valid)
+ hstate_boot_nrinvalid[hstate_index(m->hstate)]++;
+
+ return valid;
+}
+
+/*
+ * Free a bootmem page that was found to be invalid (intersecting with
+ * multiple zones).
+ *
+ * Since it intersects with multiple zones, we can't just do a free
+ * operation on all pages at once, but instead have to walk all
+ * pages, freeing them one by one.
+ */
+static void __init hugetlb_bootmem_free_invalid_page(int nid, struct page *page,
+ struct hstate *h)
+{
+ unsigned long npages = pages_per_huge_page(h);
+ unsigned long pfn;
+
+ while (npages--) {
+ pfn = page_to_pfn(page);
+ __init_page_from_nid(pfn, nid);
+ free_reserved_page(page);
+ page++;
+ }
+}
+
/*
* Put bootmem huge pages into the standard lists after mem_map is up.
* Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages.
@@ -3238,14 +3392,25 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
static void __init gather_bootmem_prealloc_node(unsigned long nid)
{
LIST_HEAD(folio_list);
- struct huge_bootmem_page *m;
+ struct huge_bootmem_page *m, *tm;
struct hstate *h = NULL, *prev_h = NULL;
- list_for_each_entry(m, &huge_boot_pages[nid], list) {
+ list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
struct page *page = virt_to_page(m);
struct folio *folio = (void *)page;
h = m->hstate;
+ if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
+ /*
+ * Can't use this page. Initialize the
+ * page structures if that hasn't already
+ * been done, and give them to the page
+ * allocator.
+ */
+ hugetlb_bootmem_free_invalid_page(nid, page, h);
+ continue;
+ }
+
/*
* It is possible to have multiple huge page sizes (hstates)
* in this list. If so, process each size separately.
@@ -3260,14 +3425,30 @@ static void __init gather_bootmem_prealloc_node(unsigned long nid)
hugetlb_folio_init_vmemmap(folio, h,
HUGETLB_VMEMMAP_RESERVE_PAGES);
init_new_hugetlb_folio(h, folio);
+
+ if (hugetlb_bootmem_page_prehvo(m))
+ /*
+ * If pre-HVO was done, just set the
+ * flag, the HVO code will then skip
+ * this folio.
+ */
+ folio_set_hugetlb_vmemmap_optimized(folio);
+
+ if (hugetlb_bootmem_page_earlycma(m))
+ folio_set_hugetlb_cma(folio);
+
list_add(&folio->lru, &folio_list);
/*
* We need to restore the 'stolen' pages to totalram_pages
* in order to fix confusing memory reports from free(1) and
* other side-effects, like CommitLimit going negative.
+ *
+ * For CMA pages, this is done in init_cma_pageblock
+ * (via hugetlb_bootmem_init_migratetype), so skip it here.
*/
- adjust_managed_page_count(page, pages_per_huge_page(h));
+ if (!folio_test_hugetlb_cma(folio))
+ adjust_managed_page_count(page, pages_per_huge_page(h));
cond_resched();
}
@@ -3289,7 +3470,7 @@ static void __init gather_bootmem_prealloc(void)
.thread_fn = gather_bootmem_prealloc_parallel,
.fn_arg = NULL,
.start = 0,
- .size = num_node_state(N_MEMORY),
+ .size = nr_node_ids,
.align = 1,
.min_chunk = 1,
.max_threads = num_node_state(N_MEMORY),
@@ -3407,32 +3588,44 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
.numa_aware = true
};
+ unsigned long jiffies_start;
+ unsigned long jiffies_end;
+
job.thread_fn = hugetlb_pages_alloc_boot_node;
job.start = 0;
job.size = h->max_huge_pages;
/*
- * job.max_threads is twice the num_node_state(N_MEMORY),
+ * job.max_threads is 25% of the available cpu threads by default.
*
- * Tests below indicate that a multiplier of 2 significantly improves
- * performance, and although larger values also provide improvements,
- * the gains are marginal.
+ * On large servers with terabytes of memory, huge page allocation
+	 * can consume a considerable amount of time.
*
- * Therefore, choosing 2 as the multiplier strikes a good balance between
- * enhancing parallel processing capabilities and maintaining efficient
- * resource management.
+	 * Tests below show how long it takes to allocate 1 TiB of memory with
+ * 2MiB huge pages. Using more threads can significantly improve allocation time.
*
- * +------------+-------+-------+-------+-------+-------+
- * | multiplier | 1 | 2 | 3 | 4 | 5 |
- * +------------+-------+-------+-------+-------+-------+
- * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms |
- * | 2T 4node | 979ms | 679ms | 543ms | 489ms | 481ms |
- * | 50G 2node | 71ms | 44ms | 37ms | 30ms | 31ms |
- * +------------+-------+-------+-------+-------+-------+
+ * +-----------------------+-------+-------+-------+-------+-------+
+ * | threads | 8 | 16 | 32 | 64 | 128 |
+ * +-----------------------+-------+-------+-------+-------+-------+
+ * | skylake 144 cpus | 44s | 22s | 16s | 19s | 20s |
+ * | cascade lake 192 cpus | 39s | 20s | 11s | 10s | 9s |
+ * +-----------------------+-------+-------+-------+-------+-------+
*/
- job.max_threads = num_node_state(N_MEMORY) * 2;
- job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2;
+ if (hugepage_allocation_threads == 0) {
+ hugepage_allocation_threads = num_online_cpus() / 4;
+ hugepage_allocation_threads = max(hugepage_allocation_threads, 1);
+ }
+
+ job.max_threads = hugepage_allocation_threads;
+ job.min_chunk = h->max_huge_pages / hugepage_allocation_threads;
+
+ jiffies_start = jiffies;
padata_do_multithreaded(&job);
+ jiffies_end = jiffies;
+
+ pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n",
+ jiffies_to_msecs(jiffies_end - jiffies_start),
+ hugepage_allocation_threads);
return h->nr_huge_pages;
}
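As a worked example of the default above: on a machine with 144 online CPUs and no hugepage_alloc_threads= override, hugepage_allocation_threads becomes max(144 / 4, 1) = 36, min_chunk becomes max_huge_pages / 36, and padata_do_multithreaded() fans the allocation loop out over those 36 threads.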
@@ -3451,23 +3644,17 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long allocated;
- static bool initialized __initdata;
- /* skip gigantic hugepages allocation if hugetlb_cma enabled */
- if (hstate_is_gigantic(h) && hugetlb_cma_size) {
+ /*
+	 * Skip boot-time allocation of gigantic pages when hugetlb_cma is
+	 * enabled but early CMA allocation is not being used for this hstate.
+ */
+ if (hstate_is_gigantic(h) && hugetlb_cma_total_size() &&
+ !hugetlb_early_cma(h)) {
pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
return;
}
- /* hugetlb_hstate_alloc_pages will be called many times, initialize huge_boot_pages once */
- if (!initialized) {
- int i = 0;
-
- for (i = 0; i < MAX_NUMNODES; i++)
- INIT_LIST_HEAD(&huge_boot_pages[i]);
- initialized = true;
- }
-
/* do node specific alloc */
if (hugetlb_hstate_alloc_pages_specific_nodes(h))
return;
@@ -3486,6 +3673,15 @@ static void __init hugetlb_init_hstates(void)
struct hstate *h, *h2;
for_each_hstate(h) {
+ /*
+ * Always reset to first_memory_node here, even if
+ * next_nid_to_alloc was set before - we can't
+ * reference hugetlb_bootmem_nodes after init, and
+ * first_memory_node is right for all further allocations.
+ */
+ h->next_nid_to_alloc = first_memory_node;
+ h->next_nid_to_free = first_memory_node;
+
/* oversize hugepages were init'ed in early boot */
if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
@@ -3500,7 +3696,7 @@ static void __init hugetlb_init_hstates(void)
*/
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
continue;
- if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
+ if (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER)
continue;
for_each_hstate(h2) {
if (h2 == h)
@@ -3515,13 +3711,20 @@ static void __init hugetlb_init_hstates(void)
static void __init report_hugepages(void)
{
struct hstate *h;
+ unsigned long nrinvalid;
for_each_hstate(h) {
char buf[32];
+ nrinvalid = hstate_boot_nrinvalid[hstate_index(h)];
+ h->max_huge_pages -= nrinvalid;
+
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
- buf, h->free_huge_pages);
+ buf, h->nr_huge_pages);
+ if (nrinvalid)
+ pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n",
+ buf, nrinvalid, nrinvalid > 1 ? "s" : "");
pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
}
@@ -3603,6 +3806,7 @@ found:
static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
nodemask_t *nodes_allowed)
{
+ unsigned long persistent_free_count;
unsigned long min_count;
unsigned long allocated;
struct folio *folio;
@@ -3737,8 +3941,24 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
+ *
+ * min_count is the expected number of persistent pages, we
+ * shouldn't calculate min_count by using
+ * resv_huge_pages + persistent_huge_pages() - free_huge_pages,
+	 * because free surplus huge pages may exist, and that would lead to
+	 * subtracting them twice. Free surplus huge pages come from HVO
+ * failing to restore vmemmap, see comments in the callers of
+ * hugetlb_vmemmap_restore_folio(). Thus, we should calculate
+ * persistent free count first.
*/
- min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
+ persistent_free_count = h->free_huge_pages;
+ if (h->free_huge_pages > persistent_huge_pages(h)) {
+ if (h->free_huge_pages > h->surplus_huge_pages)
+ persistent_free_count -= h->surplus_huge_pages;
+ else
+ persistent_free_count = 0;
+ }
+ min_count = h->resv_huge_pages + persistent_huge_pages(h) - persistent_free_count;
min_count = max(count, min_count);
try_to_free_low(h, min_count, nodes_allowed);
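A concrete example of the case the comment above describes: with nr_huge_pages = 10, surplus_huge_pages = 2 (both surplus pages free, left over from a failed vmemmap restore), free_huge_pages = 9 and resv_huge_pages = 1, the naive resv + persistent - free would give 1 + 8 - 9 = 0, undercounting because the two surplus pages are subtracted twice (once by excluding them from persistent_huge_pages() and once as part of free_huge_pages). The code instead computes persistent_free_count = 9 - 2 = 7 and min_count = 1 + 8 - 7 = 2.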
@@ -3795,10 +4015,13 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
list_for_each_entry_safe(folio, next, src_list, lru) {
int i;
+ bool cma;
if (folio_test_hugetlb_vmemmap_optimized(folio))
continue;
+ cma = folio_test_hugetlb_cma(folio);
+
list_del(&folio->lru);
split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst));
@@ -3806,13 +4029,18 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) {
struct page *page = folio_page(folio, i);
+ /* Careful: see __split_huge_page_tail() */
+ struct folio *new_folio = (struct folio *)page;
- page->mapping = NULL;
clear_compound_head(page);
prep_compound_page(page, dst->order);
- init_new_hugetlb_folio(dst, page_folio(page));
- list_add(&page->lru, &dst_list);
+ new_folio->mapping = NULL;
+ init_new_hugetlb_folio(dst, new_folio);
+ /* Copy the CMA flag so that it is freed correctly */
+ if (cma)
+ folio_set_hugetlb_cma(new_folio);
+ list_add(&new_folio->lru, &dst_list);
}
}
@@ -4393,14 +4621,6 @@ static void hugetlb_register_all_nodes(void) { }
#endif
-#ifdef CONFIG_CMA
-static void __init hugetlb_cma_check(void);
-#else
-static inline __init void hugetlb_cma_check(void)
-{
-}
-#endif
-
static void __init hugetlb_sysfs_init(void)
{
struct hstate *h;
@@ -4414,7 +4634,7 @@ static void __init hugetlb_sysfs_init(void)
err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
hstate_kobjs, &hstate_attr_group);
if (err)
- pr_err("HugeTLB: Unable to add hstate %s", h->name);
+ pr_err("HugeTLB: Unable to add hstate %s\n", h->name);
}
#ifdef CONFIG_NUMA
@@ -4525,8 +4745,6 @@ void __init hugetlb_add_hstate(unsigned int order)
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
- h->next_nid_to_alloc = first_memory_node;
- h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/SZ_1K);
@@ -4551,6 +4769,44 @@ static void __init hugepages_clear_pages_in_node(void)
}
}
+static __init int hugetlb_add_param(char *s, int (*setup)(char *))
+{
+ size_t len;
+ char *p;
+
+ if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS)
+ return -EINVAL;
+
+ len = strlen(s) + 1;
+ if (len + hstate_cmdline_index > sizeof(hstate_cmdline_buf))
+ return -EINVAL;
+
+ p = &hstate_cmdline_buf[hstate_cmdline_index];
+ memcpy(p, s, len);
+ hstate_cmdline_index += len;
+
+ hugetlb_params[hugetlb_param_index].val = p;
+ hugetlb_params[hugetlb_param_index].setup = setup;
+
+ hugetlb_param_index++;
+
+ return 0;
+}
+
+static __init void hugetlb_parse_params(void)
+{
+ int i;
+ struct hugetlb_cmdline *hcp;
+
+ for (i = 0; i < hugetlb_param_index; i++) {
+ hcp = &hugetlb_params[i];
+
+ hcp->setup(hcp->val);
+ }
+
+ hugetlb_cma_validate_params();
+}
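The hugetlb_early_param() wrapper used by the converted __setup() handlers below is defined elsewhere in hugetlb.c and does not appear in this hunk. Judging only from the call sites, a plausible shape for it is a macro that registers an early_param() stub which queues the raw argument via hugetlb_add_param() (illustrative, not the verbatim definition):

	#define hugetlb_early_param(str, func)				\
	static __init int func##args(char *s)				\
	{								\
		return hugetlb_add_param(s, func);			\
	}								\
	early_param(str, func##args)

The queued handlers then run later, in the order they were queued, from hugetlb_parse_params(), which hugetlb_bootmem_alloc() calls before doing the gigantic-page bootmem allocations.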
+
/*
* hugepages command line processing
 * hugepages normally follows a valid hugepagesz or default_hugepagesz
@@ -4570,7 +4826,7 @@ static int __init hugepages_setup(char *s)
if (!parsed_valid_hugepagesz) {
pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
parsed_valid_hugepagesz = true;
- return 1;
+ return -EINVAL;
}
/*
@@ -4624,24 +4880,16 @@ static int __init hugepages_setup(char *s)
}
}
- /*
- * Global state is always initialized later in hugetlb_init.
- * But we need to allocate gigantic hstates here early to still
- * use the bootmem allocator.
- */
- if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
- hugetlb_hstate_alloc_pages(parsed_hstate);
-
last_mhp = mhp;
- return 1;
+ return 0;
invalid:
pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
hugepages_clear_pages_in_node();
- return 1;
+ return -EINVAL;
}
-__setup("hugepages=", hugepages_setup);
+hugetlb_early_param("hugepages", hugepages_setup);
/*
* hugepagesz command line processing
@@ -4660,7 +4908,7 @@ static int __init hugepagesz_setup(char *s)
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
- return 1;
+ return -EINVAL;
}
h = size_to_hstate(size);
@@ -4675,7 +4923,7 @@ static int __init hugepagesz_setup(char *s)
if (!parsed_default_hugepagesz || h != &default_hstate ||
default_hstate.max_huge_pages) {
pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
- return 1;
+ return -EINVAL;
}
/*
@@ -4685,14 +4933,14 @@ static int __init hugepagesz_setup(char *s)
*/
parsed_hstate = h;
parsed_valid_hugepagesz = true;
- return 1;
+ return 0;
}
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
parsed_valid_hugepagesz = true;
- return 1;
+ return 0;
}
-__setup("hugepagesz=", hugepagesz_setup);
+hugetlb_early_param("hugepagesz", hugepagesz_setup);
/*
* default_hugepagesz command line input
@@ -4706,14 +4954,14 @@ static int __init default_hugepagesz_setup(char *s)
parsed_valid_hugepagesz = false;
if (parsed_default_hugepagesz) {
pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
- return 1;
+ return -EINVAL;
}
size = (unsigned long)memparse(s, NULL);
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
- return 1;
+ return -EINVAL;
}
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
@@ -4730,17 +4978,89 @@ static int __init default_hugepagesz_setup(char *s)
*/
if (default_hstate_max_huge_pages) {
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
- for_each_online_node(i)
- default_hstate.max_huge_pages_node[i] =
- default_hugepages_in_node[i];
- if (hstate_is_gigantic(&default_hstate))
- hugetlb_hstate_alloc_pages(&default_hstate);
+ /*
+ * Since this is an early parameter, we can't check
+ * NUMA node state yet, so loop through MAX_NUMNODES.
+ */
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (default_hugepages_in_node[i] != 0)
+ default_hstate.max_huge_pages_node[i] =
+ default_hugepages_in_node[i];
+ }
default_hstate_max_huge_pages = 0;
}
+ return 0;
+}
+hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup);
+
+void __init hugetlb_bootmem_set_nodes(void)
+{
+ int i, nid;
+ unsigned long start_pfn, end_pfn;
+
+ if (!nodes_empty(hugetlb_bootmem_nodes))
+ return;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ if (end_pfn > start_pfn)
+ node_set(nid, hugetlb_bootmem_nodes);
+ }
+}
+
+static bool __hugetlb_bootmem_allocated __initdata;
+
+bool __init hugetlb_bootmem_allocated(void)
+{
+ return __hugetlb_bootmem_allocated;
+}
+
+void __init hugetlb_bootmem_alloc(void)
+{
+ struct hstate *h;
+ int i;
+
+ if (__hugetlb_bootmem_allocated)
+ return;
+
+ hugetlb_bootmem_set_nodes();
+
+ for (i = 0; i < MAX_NUMNODES; i++)
+ INIT_LIST_HEAD(&huge_boot_pages[i]);
+
+ hugetlb_parse_params();
+
+ for_each_hstate(h) {
+ h->next_nid_to_alloc = first_online_node;
+
+ if (hstate_is_gigantic(h))
+ hugetlb_hstate_alloc_pages(h);
+ }
+
+ __hugetlb_bootmem_allocated = true;
+}
+
+/*
+ * hugepage_alloc_threads command line parsing.
+ *
+ * When set, use this specific number of threads for the boot
+ * allocation of hugepages.
+ */
+static int __init hugepage_alloc_threads_setup(char *s)
+{
+ unsigned long allocation_threads;
+
+ if (kstrtoul(s, 0, &allocation_threads) != 0)
+ return 1;
+
+ if (allocation_threads == 0)
+ return 1;
+
+ hugepage_allocation_threads = allocation_threads;
+
return 1;
}
-__setup("default_hugepagesz=", default_hugepagesz_setup);
+__setup("hugepage_alloc_threads=", hugepage_alloc_threads_setup);
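Example usage (illustrative values): booting with "hugepage_alloc_threads=16 hugepagesz=2M hugepages=262144" allocates the 512 GiB pool of 2 MiB pages on 16 padata threads instead of the 25%-of-CPUs default. A missing, malformed or zero value leaves the default in place.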
static unsigned int allowed_mems_nr(struct hstate *h)
{
@@ -4845,7 +5165,7 @@ out:
return ret;
}
-static struct ctl_table hugetlb_table[] = {
+static const struct ctl_table hugetlb_table[] = {
{
.procname = "nr_hugepages",
.data = NULL,
@@ -4878,7 +5198,7 @@ static struct ctl_table hugetlb_table[] = {
},
};
-static void hugetlb_sysctl_init(void)
+static void __init hugetlb_sysctl_init(void)
{
register_sysctl_init("vm", hugetlb_table);
}
@@ -5086,26 +5406,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
+ return 0;
+}
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
/*
* PMD sharing is only possible for PUD_SIZE-aligned address ranges
* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+ * This function is called in the middle of a VMA split operation, with
+ * MM, VMA and rmap all write-locked to prevent concurrent page table
+ * walks (except hardware and gup_fast()).
*/
+ vma_assert_write_locked(vma);
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
if (addr & ~PUD_MASK) {
- /*
- * hugetlb_vm_op_split is called right before we attempt to
- * split the VMA. We will need to unshare PMDs in the old and
- * new VMAs, so let's unshare before we split.
- */
unsigned long floor = addr & PUD_MASK;
unsigned long ceil = floor + PUD_SIZE;
- if (floor >= vma->vm_start && ceil <= vma->vm_end)
- hugetlb_unshare_pmds(vma, floor, ceil);
+ if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+ /*
+ * Locking:
+ * Use take_locks=false here.
+ * The file rmap lock is already held.
+ * The hugetlb VMA lock can't be taken when we already
+ * hold the file rmap lock, and we don't need it because
+ * its purpose is to synchronize against concurrent page
+ * table walks, which are not possible thanks to the
+ * locks held by our caller.
+ */
+ hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+ }
}
-
- return 0;
}
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -5140,18 +5474,16 @@ const struct vm_operations_struct hugetlb_vm_ops = {
.pagesize = hugetlb_vm_op_pagesize,
};
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
- int writable)
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio,
+ bool try_mkwrite)
{
- pte_t entry;
+ pte_t entry = folio_mk_pte(folio, vma->vm_page_prot);
unsigned int shift = huge_page_shift(hstate_vma(vma));
- if (writable) {
- entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
- vma->vm_page_prot)));
+ if (try_mkwrite && (vma->vm_flags & VM_WRITE)) {
+ entry = pte_mkwrite_novma(pte_mkdirty(entry));
} else {
- entry = huge_pte_wrprotect(mk_huge_pte(page,
- vma->vm_page_prot));
+ entry = pte_wrprotect(entry);
}
entry = pte_mkyoung(entry);
entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
@@ -5169,6 +5501,13 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
update_mmu_cache(vma, address, ptep);
}
+static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ if (vma->vm_flags & VM_WRITE)
+ set_huge_ptep_writable(vma, address, ptep);
+}
+
bool is_hugetlb_entry_migration(pte_t pte)
{
swp_entry_t swp;
@@ -5199,7 +5538,7 @@ static void
hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
struct folio *new_folio, pte_t old, unsigned long sz)
{
- pte_t newpte = make_huge_pte(vma, &new_folio->page, 1);
+ pte_t newpte = make_huge_pte(vma, new_folio, true);
__folio_mark_uptodate(new_folio);
hugetlb_add_new_anon_rmap(new_folio, vma, addr);
@@ -5333,14 +5672,14 @@ again:
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
		/* Do not use reserve as it's privately owned */
- new_folio = alloc_hugetlb_folio(dst_vma, addr, 1);
+ new_folio = alloc_hugetlb_folio(dst_vma, addr, false);
if (IS_ERR(new_folio)) {
folio_put(pte_folio);
ret = PTR_ERR(new_folio);
break;
}
ret = copy_user_large_folio(new_folio, pte_folio,
- ALIGN_DOWN(addr, sz), dst_vma);
+ addr, dst_vma);
folio_put(pte_folio);
if (ret) {
folio_put(new_folio);
@@ -5402,6 +5741,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte,
unsigned long sz)
{
+ bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct hstate *h = hstate_vma(vma);
struct mm_struct *mm = vma->vm_mm;
spinlock_t *src_ptl, *dst_ptl;
@@ -5417,8 +5757,19 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
if (src_ptl != dst_ptl)
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
- pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
- set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+ pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz);
+
+ if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ huge_pte_clear(mm, new_addr, dst_pte, sz);
+ else {
+ if (need_clear_uffd_wp) {
+ if (pte_present(pte))
+ pte = huge_pte_clear_uffd_wp(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
+ }
+ set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+ }
if (src_ptl != dst_ptl)
spin_unlock(src_ptl);
@@ -5491,14 +5842,14 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- struct page *ref_page, zap_flags_t zap_flags)
+ struct folio *folio, zap_flags_t zap_flags)
{
struct mm_struct *mm = vma->vm_mm;
+ const bool folio_provided = !!folio;
unsigned long address;
pte_t *ptep;
pte_t pte;
spinlock_t *ptl;
- struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
bool adjust_reservation = false;
@@ -5562,14 +5913,13 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
continue;
}
- page = pte_page(pte);
/*
- * If a reference page is supplied, it is because a specific
- * page is being unmapped, not a range. Ensure the page we
- * are about to unmap is the actual page of interest.
+ * If a folio is supplied, it is because a specific
+ * folio is being unmapped, not a range. Ensure the folio we
+ * are about to unmap is the actual folio of interest.
*/
- if (ref_page) {
- if (page != ref_page) {
+ if (folio_provided) {
+ if (folio != page_folio(pte_page(pte))) {
spin_unlock(ptl);
continue;
}
@@ -5579,12 +5929,14 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* looking like data was lost
*/
set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+ } else {
+ folio = page_folio(pte_page(pte));
}
- pte = huge_ptep_get_and_clear(mm, address, ptep);
+ pte = huge_ptep_get_and_clear(mm, address, ptep, sz);
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
- set_page_dirty(page);
+ folio_mark_dirty(folio);
/* Leave a uffd-wp pte marker if needed */
if (huge_pte_uffd_wp(pte) &&
!(zap_flags & ZAP_FLAG_DROP_MARKER))
@@ -5592,7 +5944,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
make_pte_marker(PTE_MARKER_UFFD_WP),
sz);
hugetlb_count_sub(pages_per_huge_page(h), mm);
- hugetlb_remove_rmap(page_folio(page));
+ hugetlb_remove_rmap(folio);
/*
* Restore the reservation for anonymous page, otherwise the
@@ -5601,8 +5953,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* reservation bit.
*/
if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
- folio_test_anon(page_folio(page))) {
- folio_set_hugetlb_restore_reserve(page_folio(page));
+ folio_test_anon(folio)) {
+ folio_set_hugetlb_restore_reserve(folio);
/* Reservation to be adjusted after the spin lock */
adjust_reservation = true;
}
@@ -5626,16 +5978,17 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* count will not be incremented by free_huge_folio.
* Act as if we consumed the reservation.
*/
- folio_clear_hugetlb_restore_reserve(page_folio(page));
+ folio_clear_hugetlb_restore_reserve(folio);
else if (rc)
vma_add_reservation(h, vma, address);
}
- tlb_remove_page_size(tlb, page, huge_page_size(h));
+ tlb_remove_page_size(tlb, folio_page(folio, 0),
+ folio_size(folio));
/*
- * Bail out after unmapping reference page if supplied
+ * If we were instructed to unmap a specific folio, we're done.
*/
- if (ref_page)
+ if (folio_provided)
break;
}
tlb_end_vma(tlb, vma);
@@ -5697,7 +6050,7 @@ void __hugetlb_zap_end(struct vm_area_struct *vma,
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page,
+ unsigned long end, struct folio *folio,
zap_flags_t zap_flags)
{
struct mmu_notifier_range range;
@@ -5709,7 +6062,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
mmu_notifier_invalidate_range_start(&range);
tlb_gather_mmu(&tlb, vma->vm_mm);
- __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
+ __unmap_hugepage_range(&tlb, vma, start, end,
+ folio, zap_flags);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
@@ -5722,7 +6076,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
* same region.
*/
static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page *page, unsigned long address)
+ struct folio *folio, unsigned long address)
{
struct hstate *h = hstate_vma(vma);
struct vm_area_struct *iter_vma;
@@ -5766,7 +6120,8 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
unmap_hugepage_range(iter_vma, address,
- address + huge_page_size(h), page, 0);
+ address + huge_page_size(h),
+ folio, 0);
}
i_mmap_unlock_write(mapping);
}
@@ -5787,7 +6142,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
struct hstate *h = hstate_vma(vma);
struct folio *old_folio;
struct folio *new_folio;
- int outside_reserve = 0;
+	bool cow_from_owner = false;
vm_fault_t ret = 0;
struct mmu_notifier_range range;
@@ -5802,13 +6157,6 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
if (!unshare && huge_pte_uffd_wp(pte))
return 0;
- /*
- * hugetlb does not support FOLL_FORCE-style write faults that keep the
- * PTE mapped R/O such as maybe_mkwrite() would do.
- */
- if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
- return VM_FAULT_SIGSEGV;
-
/* Let's take out MAP_SHARED mappings first. */
if (vma->vm_flags & VM_MAYSHARE) {
set_huge_ptep_writable(vma, vmf->address, vmf->pte);
@@ -5837,7 +6185,8 @@ retry_avoidcopy:
SetPageAnonExclusive(&old_folio->page);
}
if (likely(!unshare))
- set_huge_ptep_writable(vma, vmf->address, vmf->pte);
+ set_huge_ptep_maybe_writable(vma, vmf->address,
+ vmf->pte);
delayacct_wpcopy_end();
return 0;
@@ -5856,7 +6205,7 @@ retry_avoidcopy:
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
old_folio != pagecache_folio)
- outside_reserve = 1;
+ cow_from_owner = true;
folio_get(old_folio);
@@ -5865,7 +6214,7 @@ retry_avoidcopy:
* be acquired again before returning to the caller, as expected.
*/
spin_unlock(vmf->ptl);
- new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve);
+ new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner);
if (IS_ERR(new_folio)) {
/*
@@ -5875,7 +6224,7 @@ retry_avoidcopy:
* reliability, unmap the page from child processes. The child
* may get SIGKILLed if it later faults.
*/
- if (outside_reserve) {
+ if (cow_from_owner) {
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx;
u32 hash;
@@ -5895,8 +6244,7 @@ retry_avoidcopy:
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- unmap_ref_private(mm, vma, &old_folio->page,
- vmf->address);
+ unmap_ref_private(mm, vma, old_folio, vmf->address);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vma_lock_read(vma);
@@ -5943,7 +6291,7 @@ retry_avoidcopy:
spin_lock(vmf->ptl);
vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) {
- pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
+ pte_t newpte = make_huge_pte(vma, new_folio, !unshare);
/* Break COW or unshare */
huge_ptep_clear_flush(vma, vmf->address, vmf->pte);
@@ -6126,7 +6474,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
goto out;
}
- folio = alloc_hugetlb_folio(vma, vmf->address, 0);
+ folio = alloc_hugetlb_folio(vma, vmf->address, false);
if (IS_ERR(folio)) {
/*
* Returning error will result in faulting task being
@@ -6223,8 +6571,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
else
hugetlb_add_file_rmap(folio);
- new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE)
- && (vma->vm_flags & VM_SHARED)));
+ new_pte = make_huge_pte(vma, folio, vma->vm_flags & VM_SHARED);
/*
* If this pte was previously wr-protected, keep it wr-protected even
* if populated.
@@ -6556,7 +6903,6 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
spinlock_t *ptl;
int ret = -ENOMEM;
struct folio *folio;
- int writable;
bool folio_in_pagecache = false;
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
@@ -6594,7 +6940,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
goto out;
}
- folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
+ folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
if (IS_ERR(folio)) {
ret = -ENOMEM;
goto out;
@@ -6636,15 +6982,14 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
goto out;
}
- folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
+ folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
if (IS_ERR(folio)) {
folio_put(*foliop);
ret = -ENOMEM;
*foliop = NULL;
goto out;
}
- ret = copy_user_large_folio(folio, *foliop,
- ALIGN_DOWN(dst_addr, size), dst_vma);
+ ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma);
folio_put(*foliop);
*foliop = NULL;
if (ret) {
@@ -6711,12 +7056,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
* with wp flag set, don't set pte write bit.
*/
- if (wp_enabled || (is_continue && !vm_shared))
- writable = 0;
- else
- writable = dst_vma->vm_flags & VM_WRITE;
-
- _dst_pte = make_huge_pte(dst_vma, &folio->page, writable);
+ _dst_pte = make_huge_pte(dst_vma, folio,
+ !wp_enabled && !(is_continue && !vm_shared));
/*
* Always mark UFFDIO_COPY page dirty; note that this may not be
* extremely important for hugetlbfs for now since swapping is not
@@ -6909,7 +7250,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long chg = -1, add = -1;
+ long chg = -1, add = -1, spool_resv, gbl_resv;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
@@ -7044,8 +7385,16 @@ bool hugetlb_reserve_pages(struct inode *inode,
return true;
out_put_pages:
- /* put back original number of pages, chg */
- (void)hugepage_subpool_put_pages(spool, chg);
+ spool_resv = chg - gbl_reserve;
+ if (spool_resv) {
+		/* put the subpool's reservation back, chg - gbl_reserve */
+ gbl_resv = hugepage_subpool_put_pages(spool, spool_resv);
+ /*
+ * subpool's reserved pages can not be put back due to race,
+		 * Reserved pages that could not be returned to the subpool
+		 * (e.g. due to a race) are returned to the hstate instead.
+ hugetlb_acct_memory(h, -gbl_resv);
+ }
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
@@ -7212,7 +7561,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
spte = hugetlb_walk(svma, saddr,
vma_mmu_pagesize(svma));
if (spte) {
- get_page(virt_to_page(spte));
+ ptdesc_pmd_pts_inc(virt_to_ptdesc(spte));
break;
}
}
@@ -7227,7 +7576,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
mm_inc_nr_pmds(mm);
} else {
- put_page(virt_to_page(spte));
+ ptdesc_pmd_pts_dec(virt_to_ptdesc(spte));
}
spin_unlock(&mm->page_table_lock);
out:
@@ -7239,10 +7588,6 @@ out:
/*
* unmap huge page backed by shared pte.
*
- * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
- * indicated by page_count > 1, unmap is achieved by clearing pud and
- * decrementing the ref count. If count == 1, the pte page is not shared.
- *
* Called with page table lock held.
*
* returns: 1 successfully unmapped a shared pte page
@@ -7251,18 +7596,27 @@ out:
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
+ unsigned long sz = huge_page_size(hstate_vma(vma));
pgd_t *pgd = pgd_offset(mm, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
hugetlb_vma_assert_locked(vma);
- BUG_ON(page_count(virt_to_page(ptep)) == 0);
- if (page_count(virt_to_page(ptep)) == 1)
+ if (sz != PMD_SIZE)
+ return 0;
+ if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep)))
return 0;
pud_clear(pud);
- put_page(virt_to_page(ptep));
+ /*
+ * Once our caller drops the rmap lock, some other process might be
+ * using this page table as a normal, non-hugetlb page table.
+ * Wait for pending gup_fast() in other threads to finish before letting
+ * that happen.
+ */
+ tlb_remove_table_sync_one();
+ ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
mm_dec_nr_pmds(mm);
return 1;
}
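The ptdesc_pmd_pts_*() helpers used above replace the old page_count()-based share tracking and are defined in include/linux/mm.h, outside this diff. A minimal sketch, assuming an atomic share counter embedded in struct ptdesc (the field name is illustrative):

	static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc)
	{
		atomic_inc(&ptdesc->pt_share_count);
	}

	static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc)
	{
		atomic_dec(&ptdesc->pt_share_count);
	}

	static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc)
	{
		return atomic_read(&ptdesc->pt_share_count);
	}

As used here, the count is the number of additional sharers: zero means the PMD page table is private to a single mapping, so huge_pmd_unshare() returns without unsharing.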
@@ -7397,7 +7751,24 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
-bool isolate_hugetlb(struct folio *folio, struct list_head *list)
+/**
+ * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio
+ * @folio: the folio to isolate
+ * @list: the list to add the folio to on success
+ *
+ * Isolate an allocated (refcount > 0) hugetlb folio, marking it as
+ * isolated/non-migratable, and moving it from the active list to the
+ * given list.
+ *
+ * Isolation will fail if @folio is not an allocated hugetlb folio, or if
+ * it is already isolated/non-migratable.
+ *
+ * On success, an additional folio reference is taken that must be dropped
+ * using folio_putback_hugetlb() to undo the isolation.
+ *
+ * Return: True if isolation worked, otherwise False.
+ */
+bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list)
{
bool ret = true;
@@ -7445,7 +7816,18 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
return ret;
}
-void folio_putback_active_hugetlb(struct folio *folio)
+/**
+ * folio_putback_hugetlb - unisolate a hugetlb folio
+ * @folio: the isolated hugetlb folio
+ *
+ * Putback/un-isolate the hugetlb folio that was previously isolated using
+ * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it
+ * back onto the active list.
+ *
+ * Will drop the additional folio reference obtained through
+ * folio_isolate_hugetlb().
+ */
+void folio_putback_hugetlb(struct folio *folio)
{
spin_lock_irq(&hugetlb_lock);
folio_set_hugetlb_migratable(folio);
@@ -7492,11 +7874,28 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
}
spin_unlock_irq(&hugetlb_lock);
}
+
+ /*
+ * Our old folio is isolated and has "migratable" cleared until it
+	 * is put back. As migration succeeded, set the new folio "migratable"
+ * and add it to the active list.
+ */
+ spin_lock_irq(&hugetlb_lock);
+ folio_set_hugetlb_migratable(new_folio);
+ list_move_tail(&new_folio->lru, &(folio_hstate(new_folio))->hugepage_activelist);
+ spin_unlock_irq(&hugetlb_lock);
}
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool take_locks)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
@@ -7520,8 +7919,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
- hugetlb_vma_lock_write(vma);
- i_mmap_lock_write(vma->vm_file->f_mapping);
+ if (take_locks) {
+ hugetlb_vma_lock_write(vma);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ }
for (address = start; address < end; address += PUD_SIZE) {
ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
@@ -7531,8 +7934,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
- hugetlb_vma_unlock_write(vma);
+ if (take_locks) {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
* Documentation/mm/mmu_notifier.rst.
@@ -7547,165 +7952,20 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
- ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+ /* take_locks = */ true);
}
-#ifdef CONFIG_CMA
-static bool cma_reserve_called __initdata;
-
-static int __init cmdline_parse_hugetlb_cma(char *p)
-{
- int nid, count = 0;
- unsigned long tmp;
- char *s = p;
-
- while (*s) {
- if (sscanf(s, "%lu%n", &tmp, &count) != 1)
- break;
-
- if (s[count] == ':') {
- if (tmp >= MAX_NUMNODES)
- break;
- nid = array_index_nospec(tmp, MAX_NUMNODES);
-
- s += count + 1;
- tmp = memparse(s, &s);
- hugetlb_cma_size_in_node[nid] = tmp;
- hugetlb_cma_size += tmp;
-
- /*
- * Skip the separator if have one, otherwise
- * break the parsing.
- */
- if (*s == ',')
- s++;
- else
- break;
- } else {
- hugetlb_cma_size = memparse(p, &p);
- break;
- }
- }
-
- return 0;
-}
-
-early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
-
-void __init hugetlb_cma_reserve(int order)
-{
- unsigned long size, reserved, per_node;
- bool node_specific_cma_alloc = false;
- int nid;
-
- /*
- * HugeTLB CMA reservation is required for gigantic
- * huge pages which could not be allocated via the
- * page allocator. Just warn if there is any change
- * breaking this assumption.
- */
- VM_WARN_ON(order <= MAX_PAGE_ORDER);
- cma_reserve_called = true;
-
- if (!hugetlb_cma_size)
- return;
-
- for (nid = 0; nid < MAX_NUMNODES; nid++) {
- if (hugetlb_cma_size_in_node[nid] == 0)
- continue;
-
- if (!node_online(nid)) {
- pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
- hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
- hugetlb_cma_size_in_node[nid] = 0;
- continue;
- }
-
- if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
- pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
- nid, (PAGE_SIZE << order) / SZ_1M);
- hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
- hugetlb_cma_size_in_node[nid] = 0;
- } else {
- node_specific_cma_alloc = true;
- }
- }
-
- /* Validate the CMA size again in case some invalid nodes specified. */
- if (!hugetlb_cma_size)
- return;
-
- if (hugetlb_cma_size < (PAGE_SIZE << order)) {
- pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
- (PAGE_SIZE << order) / SZ_1M);
- hugetlb_cma_size = 0;
- return;
- }
-
- if (!node_specific_cma_alloc) {
- /*
- * If 3 GB area is requested on a machine with 4 numa nodes,
- * let's allocate 1 GB on first three nodes and ignore the last one.
- */
- per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
- pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
- hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
- }
-
- reserved = 0;
- for_each_online_node(nid) {
- int res;
- char name[CMA_MAX_NAME];
-
- if (node_specific_cma_alloc) {
- if (hugetlb_cma_size_in_node[nid] == 0)
- continue;
-
- size = hugetlb_cma_size_in_node[nid];
- } else {
- size = min(per_node, hugetlb_cma_size - reserved);
- }
-
- size = round_up(size, PAGE_SIZE << order);
-
- snprintf(name, sizeof(name), "hugetlb%d", nid);
- /*
- * Note that 'order per bit' is based on smallest size that
- * may be returned to CMA allocator in the case of
- * huge page demotion.
- */
- res = cma_declare_contiguous_nid(0, size, 0,
- PAGE_SIZE << order,
- HUGETLB_PAGE_ORDER, false, name,
- &hugetlb_cma[nid], nid);
- if (res) {
- pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
- res, nid);
- continue;
- }
-
- reserved += size;
- pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
- size / SZ_1M, nid);
-
- if (reserved >= hugetlb_cma_size)
- break;
- }
-
- if (!reserved)
- /*
- * hugetlb_cma_size is used to determine if allocations from
- * cma are possible. Set to zero if no cma regions are set up.
- */
- hugetlb_cma_size = 0;
-}
-
-static void __init hugetlb_cma_check(void)
+/*
+ * For hugetlb, mremap() is an odd edge case - while the VMA copying is
+ * performed, we permit both the old and new VMAs to reference the same
+ * reservation.
+ *
+ * We fix this up after the operation succeeds, or if a newly allocated VMA
+ * is closed as a result of a failure to allocate memory.
+ */
+void fixup_hugetlb_reservations(struct vm_area_struct *vma)
{
- if (!hugetlb_cma_size || cma_reserve_called)
- return;
-
- pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
+ if (is_vm_hugetlb_page(vma))
+ clear_vma_resv_huge_pages(vma);
}
-
-#endif /* CONFIG_CMA */
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index e716c4671a15..58e895f3899a 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -101,10 +101,9 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
int idx;
for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
- struct page_counter *fault_parent = NULL;
- struct page_counter *rsvd_parent = NULL;
+ struct page_counter *fault, *fault_parent = NULL;
+ struct page_counter *rsvd, *rsvd_parent = NULL;
unsigned long limit;
- int ret;
if (parent_h_cgroup) {
fault_parent = hugetlb_cgroup_counter_from_cgroup(
@@ -112,24 +111,22 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
parent_h_cgroup, idx);
}
- page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup,
- idx),
- fault_parent, false);
- page_counter_init(
- hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
- rsvd_parent, false);
+ fault = hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx);
+ rsvd = hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx);
+
+ page_counter_init(fault, fault_parent, false);
+ page_counter_init(rsvd, rsvd_parent, false);
+
+ if (!cgroup_subsys_on_dfl(hugetlb_cgrp_subsys)) {
+ fault->track_failcnt = true;
+ rsvd->track_failcnt = true;
+ }
limit = round_down(PAGE_COUNTER_MAX,
pages_per_huge_page(&hstates[idx]));
- ret = page_counter_set_max(
- hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx),
- limit);
- VM_BUG_ON(ret);
- ret = page_counter_set_max(
- hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
- limit);
- VM_BUG_ON(ret);
+ VM_BUG_ON(page_counter_set_max(fault, limit));
+ VM_BUG_ON(page_counter_set_max(rsvd, limit));
}
}
@@ -195,24 +192,23 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
* cannot fail.
*/
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
- struct page *page)
+ struct folio *folio)
{
unsigned int nr_pages;
struct page_counter *counter;
- struct hugetlb_cgroup *page_hcg;
+ struct hugetlb_cgroup *hcg;
struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
- struct folio *folio = page_folio(page);
- page_hcg = hugetlb_cgroup_from_folio(folio);
+ hcg = hugetlb_cgroup_from_folio(folio);
/*
* We can have pages in active list without any cgroup
* ie, hugepage with less than 3 pages. We can safely
* ignore those pages.
*/
- if (!page_hcg || page_hcg != h_cg)
+ if (!hcg || hcg != h_cg)
goto out;
- nr_pages = compound_nr(page);
+ nr_pages = folio_nr_pages(folio);
if (!parent) {
parent = root_h_cgroup;
/* root has no limit */
@@ -235,13 +231,13 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
struct hstate *h;
- struct page *page;
+ struct folio *folio;
do {
for_each_hstate(h) {
spin_lock_irq(&hugetlb_lock);
- list_for_each_entry(page, &h->hugepage_activelist, lru)
- hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page);
+ list_for_each_entry(folio, &h->hugepage_activelist, lru)
+ hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio);
spin_unlock_irq(&hugetlb_lock);
}
@@ -917,7 +913,6 @@ void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio)
set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd);
list_move(&new_folio->lru, &h->hugepage_activelist);
spin_unlock_irq(&hugetlb_lock);
- return;
}
static struct cftype hugetlb_files[] = {
diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c
new file mode 100644
index 000000000000..f58ef4969e7a
--- /dev/null
+++ b/mm/hugetlb_cma.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/mm.h>
+#include <linux/cma.h>
+#include <linux/compiler.h>
+#include <linux/mm_inline.h>
+
+#include <asm/page.h>
+#include <asm/setup.h>
+
+#include <linux/hugetlb.h>
+#include "internal.h"
+#include "hugetlb_cma.h"
+
+
+static struct cma *hugetlb_cma[MAX_NUMNODES];
+static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
+static bool hugetlb_cma_only;
+static unsigned long hugetlb_cma_size __initdata;
+
+void hugetlb_cma_free_folio(struct folio *folio)
+{
+ int nid = folio_nid(folio);
+
+ WARN_ON_ONCE(!cma_free_folio(hugetlb_cma[nid], folio));
+}
+
+
+struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
+{
+ int node;
+ int order = huge_page_order(h);
+ struct folio *folio = NULL;
+
+ if (hugetlb_cma[nid])
+ folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask);
+
+ if (!folio && !(gfp_mask & __GFP_THISNODE)) {
+ for_each_node_mask(node, *nodemask) {
+ if (node == nid || !hugetlb_cma[node])
+ continue;
+
+ folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask);
+ if (folio)
+ break;
+ }
+ }
+
+ if (folio)
+ folio_set_hugetlb_cma(folio);
+
+ return folio;
+}
+
+struct huge_bootmem_page * __init
+hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact)
+{
+ struct cma *cma;
+ struct huge_bootmem_page *m;
+ int node = *nid;
+
+ cma = hugetlb_cma[*nid];
+ m = cma_reserve_early(cma, huge_page_size(h));
+ if (!m) {
+ if (node_exact)
+ return NULL;
+
+ for_each_node_mask(node, hugetlb_bootmem_nodes) {
+ cma = hugetlb_cma[node];
+ if (!cma || node == *nid)
+ continue;
+ m = cma_reserve_early(cma, huge_page_size(h));
+ if (m) {
+ *nid = node;
+ break;
+ }
+ }
+ }
+
+ if (m) {
+ m->flags = HUGE_BOOTMEM_CMA;
+ m->cma = cma;
+ }
+
+ return m;
+}
+
+
+static bool cma_reserve_called __initdata;
+
+static int __init cmdline_parse_hugetlb_cma(char *p)
+{
+ int nid, count = 0;
+ unsigned long tmp;
+ char *s = p;
+
+ while (*s) {
+ if (sscanf(s, "%lu%n", &tmp, &count) != 1)
+ break;
+
+ if (s[count] == ':') {
+ if (tmp >= MAX_NUMNODES)
+ break;
+ nid = array_index_nospec(tmp, MAX_NUMNODES);
+
+ s += count + 1;
+ tmp = memparse(s, &s);
+ hugetlb_cma_size_in_node[nid] = tmp;
+ hugetlb_cma_size += tmp;
+
+ /*
+			 * Skip the separator if there is one, otherwise
+			 * stop parsing.
+ */
+ if (*s == ',')
+ s++;
+ else
+ break;
+ } else {
+ hugetlb_cma_size = memparse(p, &p);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
+
+static int __init cmdline_parse_hugetlb_cma_only(char *p)
+{
+ return kstrtobool(p, &hugetlb_cma_only);
+}
+
+early_param("hugetlb_cma_only", cmdline_parse_hugetlb_cma_only);
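Examples of the syntax accepted by the two parsers above (values are illustrative): "hugetlb_cma=4G" asks for a total of 4 GiB spread across the nodes in hugetlb_bootmem_nodes, while "hugetlb_cma=0:2G,2:2G" reserves 2 GiB on node 0 and 2 GiB on node 2 only. "hugetlb_cma_only=on" then makes gigantic page allocation use only those CMA areas; hugetlb_cma_validate_params() clears it again if no hugetlb_cma size was actually given.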
+
+void __init hugetlb_cma_reserve(int order)
+{
+ unsigned long size, reserved, per_node;
+ bool node_specific_cma_alloc = false;
+ int nid;
+
+ /*
+ * HugeTLB CMA reservation is required for gigantic
+ * huge pages which could not be allocated via the
+ * page allocator. Just warn if there is any change
+ * breaking this assumption.
+ */
+ VM_WARN_ON(order <= MAX_PAGE_ORDER);
+ cma_reserve_called = true;
+
+ if (!hugetlb_cma_size)
+ return;
+
+ hugetlb_bootmem_set_nodes();
+
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ if (hugetlb_cma_size_in_node[nid] == 0)
+ continue;
+
+ if (!node_isset(nid, hugetlb_bootmem_nodes)) {
+ pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
+ hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
+ hugetlb_cma_size_in_node[nid] = 0;
+ continue;
+ }
+
+ if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
+ pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
+ nid, (PAGE_SIZE << order) / SZ_1M);
+ hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
+ hugetlb_cma_size_in_node[nid] = 0;
+ } else {
+ node_specific_cma_alloc = true;
+ }
+ }
+
+ /* Validate the CMA size again in case some invalid nodes were specified. */
+ if (!hugetlb_cma_size)
+ return;
+
+ if (hugetlb_cma_size < (PAGE_SIZE << order)) {
+ pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
+ (PAGE_SIZE << order) / SZ_1M);
+ hugetlb_cma_size = 0;
+ return;
+ }
+
+ if (!node_specific_cma_alloc) {
+ /*
+ * If a 3 GB area is requested on a machine with 4 NUMA nodes,
+ * let's allocate 1 GB on the first three nodes and ignore the last one.
+ */
+ per_node = DIV_ROUND_UP(hugetlb_cma_size,
+ nodes_weight(hugetlb_bootmem_nodes));
+ pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
+ hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
+ }
+
+ reserved = 0;
+ for_each_node_mask(nid, hugetlb_bootmem_nodes) {
+ int res;
+ char name[CMA_MAX_NAME];
+
+ if (node_specific_cma_alloc) {
+ if (hugetlb_cma_size_in_node[nid] == 0)
+ continue;
+
+ size = hugetlb_cma_size_in_node[nid];
+ } else {
+ size = min(per_node, hugetlb_cma_size - reserved);
+ }
+
+ size = round_up(size, PAGE_SIZE << order);
+
+ snprintf(name, sizeof(name), "hugetlb%d", nid);
+ /*
+ * Note that 'order per bit' is based on the smallest size that
+ * may be returned to the CMA allocator in the case of
+ * huge page demotion.
+ */
+ res = cma_declare_contiguous_multi(size, PAGE_SIZE << order,
+ HUGETLB_PAGE_ORDER, name,
+ &hugetlb_cma[nid], nid);
+ if (res) {
+ pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
+ res, nid);
+ continue;
+ }
+
+ reserved += size;
+ pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
+ size / SZ_1M, nid);
+
+ if (reserved >= hugetlb_cma_size)
+ break;
+ }
+
+ if (!reserved)
+ /*
+ * hugetlb_cma_size is used to determine if allocations from
+ * cma are possible. Set to zero if no cma regions are set up.
+ */
+ hugetlb_cma_size = 0;
+}
+
+void __init hugetlb_cma_check(void)
+{
+ if (!hugetlb_cma_size || cma_reserve_called)
+ return;
+
+ pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
+}
+
+bool hugetlb_cma_exclusive_alloc(void)
+{
+ return hugetlb_cma_only;
+}
+
+unsigned long __init hugetlb_cma_total_size(void)
+{
+ return hugetlb_cma_size;
+}
+
+void __init hugetlb_cma_validate_params(void)
+{
+ if (!hugetlb_cma_size)
+ hugetlb_cma_only = false;
+}
+
+bool __init hugetlb_early_cma(struct hstate *h)
+{
+ if (arch_has_huge_bootmem_alloc())
+ return false;
+
+ return hstate_is_gigantic(h) && hugetlb_cma_only;
+}
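To make the accepted formats of the new "hugetlb_cma=" parameter concrete, here is a minimal userspace sketch of the parsing loop in cmdline_parse_hugetlb_cma() above: it accepts either a single global size or a comma-separated list of <node>:<size> pairs. memparse_simple() is a hypothetical, simplified stand-in for the kernel's memparse() (K/M/G suffixes only); it is not part of this patch.

#include <stdio.h>
#include <stdlib.h>

#define MAX_NUMNODES 8

/* Simplified memparse(): a number with an optional K/M/G suffix. */
static unsigned long memparse_simple(const char *s, char **end)
{
	unsigned long val = strtoul(s, end, 0);

	switch (**end) {
	case 'G': case 'g': val <<= 30; (*end)++; break;
	case 'M': case 'm': val <<= 20; (*end)++; break;
	case 'K': case 'k': val <<= 10; (*end)++; break;
	}
	return val;
}

int main(void)
{
	char arg[] = "0:1G,2:512M";	/* e.g. hugetlb_cma=0:1G,2:512M */
	unsigned long size_in_node[MAX_NUMNODES] = { 0 };
	unsigned long total = 0, tmp;
	char *s = arg;
	int nid, count;

	while (*s) {
		if (sscanf(s, "%lu%n", &tmp, &count) != 1)
			break;
		if (s[count] == ':') {
			/* <node>:<size> form: record a per-node size. */
			if (tmp >= MAX_NUMNODES)
				break;
			nid = (int)tmp;
			s += count + 1;
			tmp = memparse_simple(s, &s);
			size_in_node[nid] = tmp;
			total += tmp;
			if (*s == ',')
				s++;
			else
				break;
		} else {
			/* Plain "<size>" form: one global size. */
			total = memparse_simple(arg, &s);
			break;
		}
	}

	for (nid = 0; nid < MAX_NUMNODES; nid++)
		if (size_in_node[nid])
			printf("node %d: %lu bytes\n", nid, size_in_node[nid]);
	printf("total: %lu bytes\n", total);
	return 0;
}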
diff --git a/mm/hugetlb_cma.h b/mm/hugetlb_cma.h
new file mode 100644
index 000000000000..f7d7fb9880a2
--- /dev/null
+++ b/mm/hugetlb_cma.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_HUGETLB_CMA_H
+#define _LINUX_HUGETLB_CMA_H
+
+#ifdef CONFIG_CMA
+void hugetlb_cma_free_folio(struct folio *folio);
+struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask);
+struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid,
+ bool node_exact);
+void hugetlb_cma_check(void);
+bool hugetlb_cma_exclusive_alloc(void);
+unsigned long hugetlb_cma_total_size(void);
+void hugetlb_cma_validate_params(void);
+bool hugetlb_early_cma(struct hstate *h);
+#else
+static inline void hugetlb_cma_free_folio(struct folio *folio)
+{
+}
+
+static inline struct folio *hugetlb_cma_alloc_folio(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nodemask)
+{
+ return NULL;
+}
+
+static inline
+struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid,
+ bool node_exact)
+{
+ return NULL;
+}
+
+static inline void hugetlb_cma_check(void)
+{
+}
+
+static inline bool hugetlb_cma_exclusive_alloc(void)
+{
+ return false;
+}
+
+static inline unsigned long hugetlb_cma_total_size(void)
+{
+ return 0;
+}
+
+static inline void hugetlb_cma_validate_params(void)
+{
+}
+
+static inline bool hugetlb_early_cma(struct hstate *h)
+{
+ return false;
+}
+#endif
+#endif
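When no per-node sizes are given, hugetlb_cma_reserve() in hugetlb_cma.c above splits the request evenly and rounds each reservation up to the gigantic-page size. A userspace sketch of that arithmetic, using the 3 GiB / 4-node figures from the comment in that function (DIV_ROUND_UP and ROUND_UP here are local helpers, not the kernel macros):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define ROUND_UP(x, a)		(DIV_ROUND_UP(x, a) * (a))

int main(void)
{
	unsigned long total = 3UL << 30;	/* hugetlb_cma=3G */
	unsigned long gigantic = 1UL << 30;	/* 1 GiB gigantic page */
	unsigned long per_node, reserved = 0, size;
	int nr_nodes = 4, nid;

	per_node = DIV_ROUND_UP(total, nr_nodes);	/* 768 MiB */
	for (nid = 0; nid < nr_nodes && reserved < total; nid++) {
		/* Reserve the per-node share, capped by what is still owed... */
		size = per_node < total - reserved ? per_node : total - reserved;
		/* ...and rounded up to a whole gigantic page. */
		size = ROUND_UP(size, gigantic);
		reserved += size;
		printf("node %d: reserve %lu MiB\n", nid, size >> 20);
	}
	/* Prints 1024 MiB on nodes 0-2; node 3 is never reached. */
	printf("reserved %lu MiB of %lu MiB requested\n",
	       reserved >> 20, total >> 20);
	return 0;
}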
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 57b7f591eee8..27245e86df25 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -238,11 +238,11 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
* struct page, the special metadata (e.g. page->flags or page->mapping)
* cannot copy to the tail struct page structs. The invalid value will be
* checked in the free_tail_page_prepare(). In order to avoid the message
- * of "corrupted mapping in tail page". We need to reset at least 3 (one
- * head struct page struct and two tail struct page structs) struct page
+ * of "corrupted mapping in tail page". We need to reset at least 4 (one
+ * head struct page struct and three tail struct page structs) struct page
* structs.
*/
-#define NR_RESET_STRUCT_PAGE 3
+#define NR_RESET_STRUCT_PAGE 4
static inline void reset_struct_pages(struct page *start)
{
@@ -444,7 +444,11 @@ DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
-core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
+static int __init hugetlb_vmemmap_optimize_param(char *buf)
+{
+ return kstrtobool(buf, &vmemmap_optimize_enabled);
+}
+early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param);
static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
struct folio *folio, unsigned long flags)
@@ -645,14 +649,39 @@ static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *fol
return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}
-void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
+static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
+ struct list_head *folio_list,
+ bool boot)
{
struct folio *folio;
+ int nr_to_optimize;
LIST_HEAD(vmemmap_pages);
unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;
+ nr_to_optimize = 0;
list_for_each_entry(folio, folio_list, lru) {
- int ret = hugetlb_vmemmap_split_folio(h, folio);
+ int ret;
+ unsigned long spfn, epfn;
+
+ if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) {
+ /*
+ * Already optimized by pre-HVO, just map the
+ * mirrored tail page structs RO.
+ */
+ spfn = (unsigned long)&folio->page;
+ epfn = spfn + pages_per_huge_page(h);
+ vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio),
+ HUGETLB_VMEMMAP_RESERVE_SIZE);
+ register_page_bootmem_memmap(pfn_to_section_nr(spfn),
+ &folio->page,
+ HUGETLB_VMEMMAP_RESERVE_SIZE);
+ static_branch_inc(&hugetlb_optimize_vmemmap_key);
+ continue;
+ }
+
+ nr_to_optimize++;
+
+ ret = hugetlb_vmemmap_split_folio(h, folio);
/*
* Splitting the PMD requires allocating a page, thus let's fail
@@ -664,6 +693,16 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
break;
}
+ if (!nr_to_optimize)
+ /*
+ * All pre-HVO folios, nothing left to do. It's ok if
+ * there is a mix of pre-HVO and not yet HVO-ed folios
+ * here, as __hugetlb_vmemmap_optimize_folio() will
+ * skip any folios that already have the optimized flag
+ * set, see vmemmap_should_optimize_folio().
+ */
+ goto out;
+
flush_tlb_all();
list_for_each_entry(folio, folio_list, lru) {
@@ -689,11 +728,165 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
}
}
+out:
flush_tlb_all();
free_vmemmap_page_list(&vmemmap_pages);
}
-static struct ctl_table hugetlb_vmemmap_sysctls[] = {
+void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
+{
+ __hugetlb_vmemmap_optimize_folios(h, folio_list, false);
+}
+
+void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list)
+{
+ __hugetlb_vmemmap_optimize_folios(h, folio_list, true);
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
+
+/* Return true if a bootmem-allocated HugeTLB page should be pre-HVO-ed */
+static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m)
+{
+ unsigned long section_size, psize, pmd_vmemmap_size;
+ phys_addr_t paddr;
+
+ if (!READ_ONCE(vmemmap_optimize_enabled))
+ return false;
+
+ if (!hugetlb_vmemmap_optimizable(m->hstate))
+ return false;
+
+ psize = huge_page_size(m->hstate);
+ paddr = virt_to_phys(m);
+
+ /*
+ * Pre-HVO only works if the bootmem huge page
+ * is aligned to the section size.
+ */
+ section_size = (1UL << PA_SECTION_SHIFT);
+ if (!IS_ALIGNED(paddr, section_size) ||
+ !IS_ALIGNED(psize, section_size))
+ return false;
+
+ /*
+ * The pre-HVO code does not deal with splitting PMDs,
+ * so the bootmem page must be aligned to the number
+ * of base pages that can be mapped with one vmemmap PMD.
+ */
+ pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT;
+ if (!IS_ALIGNED(paddr, pmd_vmemmap_size) ||
+ !IS_ALIGNED(psize, pmd_vmemmap_size))
+ return false;
+
+ return true;
+}
+
+/*
+ * Initialize memmap section for a gigantic page, HVO-style.
+ */
+void __init hugetlb_vmemmap_init_early(int nid)
+{
+ unsigned long psize, paddr, section_size;
+ unsigned long ns, i, pnum, pfn, nr_pages;
+ unsigned long start, end;
+ struct huge_bootmem_page *m = NULL;
+ void *map;
+
+ /*
+ * Nothing to do if bootmem pages were not allocated
+ * early in boot, or if HVO wasn't enabled in the
+ * first place.
+ */
+ if (!hugetlb_bootmem_allocated())
+ return;
+
+ if (!READ_ONCE(vmemmap_optimize_enabled))
+ return;
+
+ section_size = (1UL << PA_SECTION_SHIFT);
+
+ list_for_each_entry(m, &huge_boot_pages[nid], list) {
+ if (!vmemmap_should_optimize_bootmem_page(m))
+ continue;
+
+ nr_pages = pages_per_huge_page(m->hstate);
+ psize = nr_pages << PAGE_SHIFT;
+ paddr = virt_to_phys(m);
+ pfn = PHYS_PFN(paddr);
+ map = pfn_to_page(pfn);
+ start = (unsigned long)map;
+ end = start + nr_pages * sizeof(struct page);
+
+ if (vmemmap_populate_hvo(start, end, nid,
+ HUGETLB_VMEMMAP_RESERVE_SIZE) < 0)
+ continue;
+
+ memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE);
+
+ pnum = pfn_to_section_nr(pfn);
+ ns = psize / section_size;
+
+ for (i = 0; i < ns; i++) {
+ sparse_init_early_section(nid, map, pnum,
+ SECTION_IS_VMEMMAP_PREINIT);
+ map += section_map_size();
+ pnum++;
+ }
+
+ m->flags |= HUGE_BOOTMEM_HVO;
+ }
+}
+
+void __init hugetlb_vmemmap_init_late(int nid)
+{
+ struct huge_bootmem_page *m, *tm;
+ unsigned long phys, nr_pages, start, end;
+ unsigned long pfn, nr_mmap;
+ struct hstate *h;
+ void *map;
+
+ if (!hugetlb_bootmem_allocated())
+ return;
+
+ if (!READ_ONCE(vmemmap_optimize_enabled))
+ return;
+
+ list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
+ if (!(m->flags & HUGE_BOOTMEM_HVO))
+ continue;
+
+ phys = virt_to_phys(m);
+ h = m->hstate;
+ pfn = PHYS_PFN(phys);
+ nr_pages = pages_per_huge_page(h);
+
+ if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
+ /*
+ * Oops, the hugetlb page spans multiple zones.
+ * Remove it from the list, and undo HVO.
+ */
+ list_del(&m->list);
+
+ map = pfn_to_page(pfn);
+
+ start = (unsigned long)map;
+ end = start + nr_pages * sizeof(struct page);
+
+ vmemmap_undo_hvo(start, end, nid,
+ HUGETLB_VMEMMAP_RESERVE_SIZE);
+ nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE;
+ memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE));
+
+ memblock_phys_free(phys, huge_page_size(h));
+ continue;
+ } else
+ m->flags |= HUGE_BOOTMEM_ZONES_VALID;
+ }
+}
+#endif
+
+static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
{
.procname = "hugetlb_optimize_vmemmap",
.data = &vmemmap_optimize_enabled,
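As a rough illustration of the alignment rules enforced by vmemmap_should_optimize_bootmem_page() above, the sketch below uses assumed x86-64-style constants (4 KiB pages, 2 MiB PMDs, a 64-byte struct page, 128 MiB sparsemem sections); can_pre_hvo() is a hypothetical userspace helper, not kernel code.

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SHIFT		12
#define PMD_SIZE		(2UL << 20)
#define STRUCT_PAGE_SIZE	64UL
#define SECTION_SIZE		(128UL << 20)

#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)

static bool can_pre_hvo(unsigned long paddr, unsigned long psize)
{
	/* The huge page must cover whole sparsemem sections... */
	if (!IS_ALIGNED(paddr, SECTION_SIZE) || !IS_ALIGNED(psize, SECTION_SIZE))
		return false;

	/* ...and whole vmemmap PMDs (the early path never splits a PMD). */
	unsigned long pmd_vmemmap_size = (PMD_SIZE / STRUCT_PAGE_SIZE) << PAGE_SHIFT;

	return IS_ALIGNED(paddr, pmd_vmemmap_size) &&
	       IS_ALIGNED(psize, pmd_vmemmap_size);
}

int main(void)
{
	/* A 1 GiB gigantic page at a 1 GiB-aligned physical address qualifies. */
	printf("1G page @ 1G: %s\n",
	       can_pre_hvo(1UL << 30, 1UL << 30) ? "pre-HVO ok" : "skip");
	/* A 2 MiB page does not span a full section, so it is skipped. */
	printf("2M page @ 2M: %s\n",
	       can_pre_hvo(2UL << 20, 2UL << 20) ? "pre-HVO ok" : "skip");
	return 0;
}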
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 2fcae92d3359..18b490825215 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -9,6 +9,8 @@
#ifndef _LINUX_HUGETLB_VMEMMAP_H
#define _LINUX_HUGETLB_VMEMMAP_H
#include <linux/hugetlb.h>
+#include <linux/io.h>
+#include <linux/memblock.h>
/*
* Reserve one vmemmap page, all vmemmap addresses are mapped to it. See
@@ -24,6 +26,12 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h,
struct list_head *non_hvo_folios);
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio);
void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);
+void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list);
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
+void hugetlb_vmemmap_init_early(int nid);
+void hugetlb_vmemmap_init_late(int nid);
+#endif
+
static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
{
@@ -48,7 +56,7 @@ static inline int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct f
return 0;
}
-static long hugetlb_vmemmap_restore_folios(const struct hstate *h,
+static inline long hugetlb_vmemmap_restore_folios(const struct hstate *h,
struct list_head *folio_list,
struct list_head *non_hvo_folios)
{
@@ -64,6 +72,19 @@ static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list
{
}
+static inline void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h,
+ struct list_head *folio_list)
+{
+}
+
+static inline void hugetlb_vmemmap_init_early(int nid)
+{
+}
+
+static inline void hugetlb_vmemmap_init_late(int nid)
+{
+}
+
static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
{
return 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 24c809379274..4600e7605cab 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -40,7 +40,8 @@ struct mm_struct init_mm = {
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
#ifdef CONFIG_PER_VMA_LOCK
- .mm_lock_seq = 0,
+ .vma_writer_wait = __RCUWAIT_INITIALIZER(init_mm.vma_writer_wait),
+ .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq),
#endif
.user_ns = &init_user_ns,
.cpu_bitmap = CPU_BITS_NONE,
diff --git a/mm/internal.h b/mm/internal.h
index cb8d8e8e3ffa..6b8ed2017743 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -25,6 +25,47 @@
struct folio_batch;
/*
+ * Maintains state across a page table move. The operation assumes both source
+ * and destination VMAs already exist and are specified by the user.
+ *
+ * Partial moves are permitted, but the old and new ranges must both reside
+ * within a VMA.
+ *
+ * mmap lock must be held in write and VMA write locks must be held on any VMA
+ * that is visible.
+ *
+ * Use the PAGETABLE_MOVE() macro to initialise this struct.
+ *
+ * The old_addr and new_addr fields are updated as the page table move is
+ * executed.
+ *
+ * NOTE: The page table move is affected by reading from [old_addr, old_end),
+ * and old_addr may be updated for better page table alignment, so len_in
+ * represents the length of the range being copied as specified by the user.
+ */
+struct pagetable_move_control {
+ struct vm_area_struct *old; /* Source VMA. */
+ struct vm_area_struct *new; /* Destination VMA. */
+ unsigned long old_addr; /* Address from which the move begins. */
+ unsigned long old_end; /* Exclusive address at which old range ends. */
+ unsigned long new_addr; /* Address to move page tables to. */
+ unsigned long len_in; /* Bytes to remap specified by user. */
+
+ bool need_rmap_locks; /* Do rmap locks need to be taken? */
+ bool for_stack; /* Is this an early temp stack being moved? */
+};
+
+#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \
+ struct pagetable_move_control name = { \
+ .old = old_, \
+ .new = new_, \
+ .old_addr = old_addr_, \
+ .old_end = (old_addr_) + (len_), \
+ .new_addr = new_addr_, \
+ .len_in = len_, \
+ }
+
+/*
* The set of flags that only affect watermark checking and reclaim
* behaviour. This is used by the MM to obey the caller constraints
* about IO, FS and watermark checking while ignoring placement
@@ -84,6 +125,8 @@ void page_writeback_init(void);
*/
static inline int folio_nr_pages_mapped(const struct folio *folio)
{
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
+ return -1;
return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED;
}
@@ -205,11 +248,9 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
bool *any_writable, bool *any_young, bool *any_dirty)
{
- unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
- const pte_t *end_ptep = start_ptep + max_nr;
pte_t expected_pte, *ptep;
bool writable, young, dirty;
- int nr;
+ int nr, cur_nr;
if (any_writable)
*any_writable = false;
@@ -222,11 +263,15 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
+ /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
+ max_nr = min_t(unsigned long, max_nr,
+ folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));
+
nr = pte_batch_hint(start_ptep, pte);
expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
ptep = start_ptep + nr;
- while (ptep < end_ptep) {
+ while (nr < max_nr) {
pte = ptep_get(ptep);
if (any_writable)
writable = !!pte_write(pte);
@@ -239,14 +284,6 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
if (!pte_same(pte, expected_pte))
break;
- /*
- * Stop immediately once we reached the end of the folio. In
- * corner cases the next PFN might fall into a different
- * folio.
- */
- if (pte_pfn(pte) >= folio_end_pfn)
- break;
-
if (any_writable)
*any_writable |= writable;
if (any_young)
@@ -254,12 +291,13 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
if (any_dirty)
*any_dirty |= dirty;
- nr = pte_batch_hint(ptep, pte);
- expected_pte = pte_advance_pfn(expected_pte, nr);
- ptep += nr;
+ cur_nr = pte_batch_hint(ptep, pte);
+ expected_pte = pte_advance_pfn(expected_pte, cur_nr);
+ ptep += cur_nr;
+ nr += cur_nr;
}
- return min(ptep - start_ptep, max_nr);
+ return min(nr, max_nr);
}
/**
@@ -392,6 +430,11 @@ void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details);
+void zap_page_range_single_batched(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr,
+ unsigned long size, struct zap_details *details);
+int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
+ gfp_t gfp);
void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
unsigned int order);
@@ -491,6 +534,7 @@ extern char * const zone_names[MAX_NR_ZONES];
DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
extern int min_free_kbytes;
+extern int defrag_mode;
void setup_per_zone_wmarks(void);
void calculate_min_free_kbytes(void);
@@ -656,6 +700,8 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
}
void set_zone_contiguous(struct zone *zone);
+bool pfn_range_intersects_zones(int nid, unsigned long start_pfn,
+ unsigned long nr_pages);
static inline void clear_zone_contiguous(struct zone *zone)
{
@@ -680,8 +726,8 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
return;
folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order;
-#ifdef CONFIG_64BIT
- folio->_folio_nr_pages = 1U << order;
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+ folio->_nr_pages = 1U << order;
#endif
}
@@ -717,9 +763,17 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
folio_set_order(folio, order);
atomic_set(&folio->_large_mapcount, -1);
- atomic_set(&folio->_entire_mapcount, -1);
- atomic_set(&folio->_nr_pages_mapped, 0);
- atomic_set(&folio->_pincount, 0);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ atomic_set(&folio->_nr_pages_mapped, 0);
+ if (IS_ENABLED(CONFIG_MM_ID)) {
+ folio->_mm_ids = 0;
+ folio->_mm_id_mapcount[0] = -1;
+ folio->_mm_id_mapcount[1] = -1;
+ }
+ if (IS_ENABLED(CONFIG_64BIT) || order > 1) {
+ atomic_set(&folio->_pincount, 0);
+ atomic_set(&folio->_entire_mapcount, -1);
+ }
if (order > 1)
INIT_LIST_HEAD(&folio->_deferred_list);
}
@@ -733,17 +787,30 @@ static inline void prep_compound_tail(struct page *head, int tail_idx)
set_page_private(p, 0);
}
-extern void prep_compound_page(struct page *page, unsigned int order);
-
-extern void post_alloc_hook(struct page *page, unsigned int order,
- gfp_t gfp_flags);
+void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags);
extern bool free_pages_prepare(struct page *page, unsigned int order);
extern int user_min_free_kbytes;
-void free_unref_page(struct page *page, unsigned int order);
+struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid,
+ nodemask_t *);
+#define __alloc_frozen_pages(...) \
+ alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__))
+void free_frozen_pages(struct page *page, unsigned int order);
void free_unref_folios(struct folio_batch *fbatch);
+#ifdef CONFIG_NUMA
+struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order);
+#else
+static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order)
+{
+ return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL);
+}
+#endif
+
+#define alloc_frozen_pages(...) \
+ alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__))
+
extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
extern void zone_pcp_enable(struct zone *zone);
@@ -824,17 +891,29 @@ int
isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
-int __alloc_contig_migrate_range(struct compact_control *cc,
- unsigned long start, unsigned long end,
- int migratetype);
-
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void init_cma_reserved_pageblock(struct page *page);
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
+struct cma;
+
+#ifdef CONFIG_CMA
+void *cma_reserve_early(struct cma *cma, unsigned long size);
+void init_cma_pageblock(struct page *page);
+#else
+static inline void *cma_reserve_early(struct cma *cma, unsigned long size)
+{
+ return NULL;
+}
+static inline void init_cma_pageblock(struct page *page)
+{
+}
+#endif
+
+
int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool only_stealable, bool *can_steal);
+ int migratetype, bool claimable);
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
@@ -1040,6 +1119,8 @@ DECLARE_STATIC_KEY_TRUE(deferred_pages);
bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+void init_deferred_page(unsigned long pfn, int nid);
+
enum mminit_level {
MMINIT_WARNING,
MMINIT_VERIFY,
@@ -1084,9 +1165,13 @@ static inline void mminit_verify_zonelist(void)
#define NODE_RECLAIM_SUCCESS 1
#ifdef CONFIG_NUMA
+extern int node_reclaim_mode;
+
extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
extern int find_next_best_node(int node, nodemask_t *used_node_mask);
#else
+#define node_reclaim_mode 0
+
static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
unsigned int order)
{
@@ -1098,11 +1183,17 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
}
#endif
+static inline bool node_reclaim_enabled(void)
+{
+ /* Is any node_reclaim_mode bit set? */
+ return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
+}
+
/*
* mm/memory-failure.c
*/
#ifdef CONFIG_MEMORY_FAILURE
-void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu);
+int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill);
void shake_folio(struct folio *folio);
extern int hwpoison_filter(struct page *p);
@@ -1125,8 +1216,9 @@ unsigned long page_mapped_in_vma(const struct page *page,
struct vm_area_struct *vma);
#else
-static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
+ return -EBUSY;
}
#endif
@@ -1174,6 +1266,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_NOFRAGMENT 0x0
#endif
#define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */
+#define ALLOC_TRYLOCK 0x400 /* Only use spin_trylock in allocation path */
#define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
/* Flags that allow allocations below the min watermark. */
@@ -1285,12 +1378,6 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr,
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, bool write);
-static inline bool alloc_zeroed(void)
-{
- return static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
- &init_on_alloc);
-}
-
/*
* Parses a string with mem suffixes into its order. Useful to parse kernel
* parameters.
@@ -1400,7 +1487,8 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
}
extern bool mirrored_kernelcore;
-extern bool memblock_has_mirror(void);
+bool memblock_has_mirror(void);
+void memblock_free_all(void);
static __always_inline void vma_set_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
@@ -1441,27 +1529,12 @@ static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte
void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid);
+void __meminit __init_page_from_nid(unsigned long pfn, int nid);
/* shrinker related functions */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
int priority);
-#ifdef CONFIG_64BIT
-static inline int can_do_mseal(unsigned long flags)
-{
- if (flags)
- return -EINVAL;
-
- return 0;
-}
-
-#else
-static inline int can_do_mseal(unsigned long flags)
-{
- return -EPERM;
-}
-#endif
-
#ifdef CONFIG_SHRINKER_DEBUG
static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
struct shrinker *shrinker, const char *fmt, va_list ap)
@@ -1510,12 +1583,15 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
/* Only track the nodes of mappings with shadow entries */
void workingset_update_node(struct xa_node *node);
extern struct list_lru shadow_nodes;
+#define mapping_set_update(xas, mapping) do { \
+ if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \
+ xas_set_update(xas, workingset_update_node); \
+ xas_set_lru(xas, &shadow_nodes); \
+ } \
+} while (0)
/* mremap.c */
-unsigned long move_page_tables(struct vm_area_struct *vma,
- unsigned long old_addr, struct vm_area_struct *new_vma,
- unsigned long new_addr, unsigned long len,
- bool need_rmap_locks, bool for_stack);
+unsigned long move_page_tables(struct pagetable_move_control *pmc);
#ifdef CONFIG_UNACCEPTED_MEMORY
void accept_page(struct page *page);
@@ -1530,4 +1606,25 @@ int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private);
+/* pt_reclaim.c */
+bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval);
+void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb,
+ pmd_t pmdval);
+void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+ struct mmu_gather *tlb);
+
+#ifdef CONFIG_PT_RECLAIM
+bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
+ struct zap_details *details);
+#else
+static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
+ struct zap_details *details)
+{
+ return false;
+}
+#endif /* CONFIG_PT_RECLAIM */
+
+void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
+int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);
+
#endif /* __MM_INTERNAL_H */
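The folio_pte_batch() rework in the internal.h hunks above drops the per-entry folio-end check in favour of clamping max_nr up front and accumulating nr from batch hints. A minimal userspace sketch of that control flow, with a plain PFN array standing in for PTEs and a fixed batch hint of 1 (pte_batch() is a hypothetical helper, not the kernel function):

#include <stdio.h>

static int pte_batch(const unsigned long *pfn, int max_nr,
		     unsigned long folio_start_pfn, unsigned long folio_nr_pages)
{
	unsigned long expected = pfn[0];
	unsigned long remaining = folio_start_pfn + folio_nr_pages - pfn[0];
	int nr = 1;	/* pte_batch_hint() == 1 without contiguous-PTE support */

	/* Clamp up front to the PFNs left in the folio, as the patch does. */
	if ((unsigned long)max_nr > remaining)
		max_nr = remaining;

	while (nr < max_nr) {
		if (pfn[nr] != ++expected)
			break;
		nr++;
	}
	return nr;
}

int main(void)
{
	/* The folio covers PFNs 100..103; the 5th entry belongs elsewhere. */
	unsigned long pfn[] = { 100, 101, 102, 103, 104 };

	/* Prints "batched 4 entries": the clamp stops at the folio boundary. */
	printf("batched %d entries\n", pte_batch(pfn, 5, 100, 4));
	return 0;
}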
diff --git a/mm/io-mapping.c b/mm/io-mapping.c
index 01b362799930..d3586e95c12c 100644
--- a/mm/io-mapping.c
+++ b/mm/io-mapping.c
@@ -21,9 +21,10 @@ int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags))
return -EINVAL;
- /* We rely on prevalidation of the io-mapping to skip track_pfn(). */
- return remap_pfn_range_notrack(vma, addr, pfn, size,
- __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
- (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)));
+ pgprot_t remap_prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
+ (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK));
+
+ /* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */
+ return remap_pfn_range_notrack(vma, addr, pfn, size, remap_prot);
}
EXPORT_SYMBOL_GPL(io_mapping_map_user);
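The io_mapping_map_user() change above only makes the protection merge explicit: caching bits come from the io_mapping, everything else from the VMA's page protection. A tiny userspace sketch with made-up bit values (CACHE_MASK stands in for _PAGE_CACHE_MASK; the numbers are illustrative, not real pgprot bits):

#include <stdio.h>

#define CACHE_MASK	0x0000000fUL	/* hypothetical _PAGE_CACHE_MASK */

int main(void)
{
	unsigned long iomap_prot = 0x00000002UL;	/* e.g. write-combining */
	unsigned long vma_prot   = 0x80000063UL;	/* present/rw/... plus cache bits */

	/* Cache attributes from the io_mapping, the rest from the VMA. */
	unsigned long remap_prot = (iomap_prot & CACHE_MASK) |
				   (vma_prot & ~CACHE_MASK);

	printf("remap prot = 0x%lx\n", remap_prot);	/* 0x80000062 */
	return 0;
}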
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 3e049dfb28bd..c36dd9f62fd5 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -50,9 +50,9 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size,
#ifndef ioremap_prot
void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
- unsigned long prot)
+ pgprot_t prot)
{
- return generic_ioremap_prot(phys_addr, size, __pgprot(prot));
+ return generic_ioremap_prot(phys_addr, size, prot);
}
EXPORT_SYMBOL(ioremap_prot);
#endif
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 1a958e7c8a46..dd93ae8a6beb 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -35,7 +35,7 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) $(call cc-disable-warning, vla)
+CFLAGS_KASAN_TEST := $(CFLAGS_KASAN)
ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
# If compiler instruments memintrinsics by prefixing them with __asan/__hwasan,
# we need to treat them normally (as builtins), otherwise the compiler won't
@@ -44,6 +44,7 @@ ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
CFLAGS_KASAN_TEST += -fno-builtin
endif
+CFLAGS_REMOVE_kasan_test_c.o += $(call cc-option, -Wvla-larger-than=1)
CFLAGS_kasan_test_c.o := $(CFLAGS_KASAN_TEST)
RUSTFLAGS_kasan_test_rust.o := $(RUSTFLAGS_KASAN)
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 8b9e348113b1..d54e89f8c3e7 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -524,7 +524,11 @@ size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object)
sizeof(struct kasan_free_meta) : 0);
}
-static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags)
+/*
+ * This function avoids dynamic memory allocations and thus can be called from
+ * contexts that do not allow allocating memory.
+ */
+void kasan_record_aux_stack(void *addr)
{
struct slab *slab = kasan_addr_to_slab(addr);
struct kmem_cache *cache;
@@ -541,17 +545,7 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags)
return;
alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
- alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags);
-}
-
-void kasan_record_aux_stack(void *addr)
-{
- return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC);
-}
-
-void kasan_record_aux_stack_noalloc(void *addr)
-{
- return __kasan_record_aux_stack(addr, 0);
+ alloc_meta->aux_stack[0] = kasan_save_stack(0, 0);
}
void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index ccd66c7a4081..9a6927394b54 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -16,6 +16,7 @@
#include <linux/mm.h>
#include <linux/static_key.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/types.h>
#include <linux/vmalloc.h>
@@ -263,8 +264,8 @@ void __init kasan_init_hw_tags(void)
pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n",
kasan_mode_info(),
- kasan_vmalloc_enabled() ? "on" : "off",
- kasan_stack_collection_enabled() ? "on" : "off");
+ str_on_off(kasan_vmalloc_enabled()),
+ str_on_off(kasan_stack_collection_enabled()));
}
#ifdef CONFIG_KASAN_VMALLOC
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index b7e4b81421b3..129178be5e64 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -501,18 +501,18 @@ static inline bool kasan_byte_accessible(const void *addr)
/**
* kasan_poison - mark the memory range as inaccessible
- * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
- * @size - range size, must be aligned to KASAN_GRANULE_SIZE
- * @value - value that's written to metadata for the range
- * @init - whether to initialize the memory range (only for hardware tag-based)
+ * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE
+ * @size: range size, must be aligned to KASAN_GRANULE_SIZE
+ * @value: value that's written to metadata for the range
+ * @init: whether to initialize the memory range (only for hardware tag-based)
*/
void kasan_poison(const void *addr, size_t size, u8 value, bool init);
/**
* kasan_unpoison - mark the memory range as accessible
- * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
- * @size - range size, can be unaligned
- * @init - whether to initialize the memory range (only for hardware tag-based)
+ * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE
+ * @size: range size, can be unaligned
+ * @init: whether to initialize the memory range (only for hardware tag-based)
*
* For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before
* marking the range.
@@ -530,8 +530,8 @@ bool kasan_byte_accessible(const void *addr);
/**
* kasan_poison_last_granule - mark the last granule of the memory range as
* inaccessible
- * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
- * @size - range size
+ * @address: range start address, must be aligned to KASAN_GRANULE_SIZE
+ * @size: range size
*
* This function is only available for the generic mode, as it's the only mode
* that has partially poisoned memory granules.
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index e0ec5a6d15be..5f922dd38ffa 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -33,7 +33,7 @@
#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE)
-MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING);
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
static bool multishot;
@@ -47,8 +47,8 @@ static struct {
* Some tests use these global variables to store return values from function
* calls that could otherwise be eliminated by the compiler as dead code.
*/
-void *kasan_ptr_result;
-int kasan_int_result;
+static volatile void *kasan_ptr_result;
+static volatile int kasan_int_result;
/* Probe for console output: obtains test_status lines of interest. */
static void probe_console(void *ignore, const char *buf, size_t len)
@@ -1073,14 +1073,11 @@ static void kmem_cache_rcu_uaf(struct kunit *test)
kmem_cache_destroy(cache);
}
-static void empty_cache_ctor(void *object) { }
-
static void kmem_cache_double_destroy(struct kunit *test)
{
struct kmem_cache *cache;
- /* Provide a constructor to prevent cache merging. */
- cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor);
+ cache = kmem_cache_create("test_cache", 200, 0, SLAB_NO_MERGE, NULL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
kmem_cache_destroy(cache);
KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache));
@@ -1570,6 +1567,7 @@ static void kasan_memcmp(struct kunit *test)
static void kasan_strings(struct kunit *test)
{
char *ptr;
+ char *src;
size_t size = 24;
/*
@@ -1581,6 +1579,25 @@ static void kasan_strings(struct kunit *test)
ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ src = kmalloc(KASAN_GRANULE_SIZE, GFP_KERNEL | __GFP_ZERO);
+ strscpy(src, "f0cacc1a0000000", KASAN_GRANULE_SIZE);
+
+ /*
+ * Make sure that strscpy() does not trigger KASAN if it overreads into
+ * poisoned memory.
+ *
+ * The expected size does not include the terminator '\0'
+ * so it is (KASAN_GRANULE_SIZE - 2) ==
+ * KASAN_GRANULE_SIZE - ("initial removed character" + "\0").
+ */
+ KUNIT_EXPECT_EQ(test, KASAN_GRANULE_SIZE - 2,
+ strscpy(ptr, src + 1, KASAN_GRANULE_SIZE));
+
+ /* strscpy should fail if the first byte is unreadable. */
+ KUNIT_EXPECT_KASAN_FAIL(test, strscpy(ptr, src + KASAN_GRANULE_SIZE,
+ KASAN_GRANULE_SIZE));
+
+ kfree(src);
kfree(ptr);
/*
@@ -2130,4 +2147,5 @@ static struct kunit_suite kasan_kunit_test_suite = {
kunit_test_suite(kasan_kunit_test_suite);
+MODULE_DESCRIPTION("KUnit tests for checking KASAN bug-detection capabilities");
MODULE_LICENSE("GPL");
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 50fb19ad4388..8357e1a33699 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -201,7 +201,7 @@ static inline void fail_non_kasan_kunit_test(void) { }
#endif /* CONFIG_KUNIT */
-static DEFINE_SPINLOCK(report_lock);
+static DEFINE_RAW_SPINLOCK(report_lock);
static void start_report(unsigned long *flags, bool sync)
{
@@ -212,7 +212,7 @@ static void start_report(unsigned long *flags, bool sync)
lockdep_off();
/* Make sure we don't end up in loop. */
report_suppress_start();
- spin_lock_irqsave(&report_lock, *flags);
+ raw_spin_lock_irqsave(&report_lock, *flags);
pr_err("==================================================================\n");
}
@@ -222,7 +222,7 @@ static void end_report(unsigned long *flags, const void *addr, bool is_write)
trace_error_report_end(ERROR_DETECTOR_KASAN,
(unsigned long)addr);
pr_err("==================================================================\n");
- spin_unlock_irqrestore(&report_lock, *flags);
+ raw_spin_unlock_irqrestore(&report_lock, *flags);
if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
check_panic_on_warn("KASAN");
switch (kasan_arg_fault) {
@@ -370,6 +370,36 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
+/*
+ * This function is invoked with report_lock (a raw_spinlock) held. A
+ * PREEMPT_RT kernel cannot call find_vm_area() as it will acquire a sleeping
+ * rt_spinlock.
+ *
+ * For !RT kernel, the PROVE_RAW_LOCK_NESTING config option will print a
+ * lockdep warning for this raw_spinlock -> spinlock dependency. This config
+ * option is enabled by default to ensure better test coverage to expose this
+ * kind of RT kernel problem. This lockdep splat, however, can be suppressed
+ * by using DEFINE_WAIT_OVERRIDE_MAP() if it serves a useful purpose and the
+ * invalid PREEMPT_RT case has been taken care of.
+ */
+static inline struct vm_struct *kasan_find_vm_area(void *addr)
+{
+ static DEFINE_WAIT_OVERRIDE_MAP(vmalloc_map, LD_WAIT_SLEEP);
+ struct vm_struct *va;
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return NULL;
+
+ /*
+ * Suppress lockdep warning and fetch vmalloc area of the
+ * offending address.
+ */
+ lock_map_acquire_try(&vmalloc_map);
+ va = find_vm_area(addr);
+ lock_map_release(&vmalloc_map);
+ return va;
+}
+
static void print_address_description(void *addr, u8 tag,
struct kasan_report_info *info)
{
@@ -399,7 +429,7 @@ static void print_address_description(void *addr, u8 tag,
}
if (is_vmalloc_addr(addr)) {
- struct vm_struct *va = find_vm_area(addr);
+ struct vm_struct *va = kasan_find_vm_area(addr);
if (va) {
pr_err("The buggy address belongs to the virtual mapping at\n"
@@ -409,6 +439,8 @@ static void print_address_description(void *addr, u8 tag,
pr_err("\n");
page = vmalloc_to_page(addr);
+ } else {
+ pr_err("The buggy address %px belongs to a vmalloc virtual mapping\n", addr);
}
}
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 88d1c9dcb507..d2c70cd2afb1 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -292,33 +292,99 @@ void __init __weak kasan_populate_early_vm_area_shadow(void *start,
{
}
+struct vmalloc_populate_data {
+ unsigned long start;
+ struct page **pages;
+};
+
static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
- void *unused)
+ void *_data)
{
- unsigned long page;
+ struct vmalloc_populate_data *data = _data;
+ struct page *page;
pte_t pte;
+ int index;
if (likely(!pte_none(ptep_get(ptep))))
return 0;
- page = __get_free_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
-
- __memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
- pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
+ index = PFN_DOWN(addr - data->start);
+ page = data->pages[index];
+ __memset(page_to_virt(page), KASAN_VMALLOC_INVALID, PAGE_SIZE);
+ pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL);
spin_lock(&init_mm.page_table_lock);
if (likely(pte_none(ptep_get(ptep)))) {
set_pte_at(&init_mm, addr, ptep, pte);
- page = 0;
+ data->pages[index] = NULL;
}
spin_unlock(&init_mm.page_table_lock);
- if (page)
- free_page(page);
+
+ return 0;
+}
+
+static void ___free_pages_bulk(struct page **pages, int nr_pages)
+{
+ int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ __free_pages(pages[i], 0);
+ pages[i] = NULL;
+ }
+ }
+}
+
+static int ___alloc_pages_bulk(struct page **pages, int nr_pages)
+{
+ unsigned long nr_populated, nr_total = nr_pages;
+ struct page **page_array = pages;
+
+ while (nr_pages) {
+ nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages);
+ if (!nr_populated) {
+ ___free_pages_bulk(page_array, nr_total - nr_pages);
+ return -ENOMEM;
+ }
+ pages += nr_populated;
+ nr_pages -= nr_populated;
+ }
+
return 0;
}
+static int __kasan_populate_vmalloc(unsigned long start, unsigned long end)
+{
+ unsigned long nr_pages, nr_total = PFN_UP(end - start);
+ struct vmalloc_populate_data data;
+ int ret = 0;
+
+ data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ if (!data.pages)
+ return -ENOMEM;
+
+ while (nr_total) {
+ nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0]));
+ ret = ___alloc_pages_bulk(data.pages, nr_pages);
+ if (ret)
+ break;
+
+ data.start = start;
+ ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE,
+ kasan_populate_vmalloc_pte, &data);
+ ___free_pages_bulk(data.pages, nr_pages);
+ if (ret)
+ break;
+
+ start += nr_pages * PAGE_SIZE;
+ nr_total -= nr_pages;
+ }
+
+ free_page((unsigned long)data.pages);
+
+ return ret;
+}
+
int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
{
unsigned long shadow_start, shadow_end;
@@ -348,9 +414,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
shadow_start = PAGE_ALIGN_DOWN(shadow_start);
shadow_end = PAGE_ALIGN(shadow_end);
- ret = apply_to_page_range(&init_mm, shadow_start,
- shadow_end - shadow_start,
- kasan_populate_vmalloc_pte, NULL);
+ ret = __kasan_populate_vmalloc(shadow_start, shadow_end);
if (ret)
return ret;
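The new ___alloc_pages_bulk() helper above keeps calling a bulk allocator that may return fewer pages than requested, advances past what it did get, and frees everything on a complete failure. A userspace sketch of that retry/rollback pattern, where fake_bulk_alloc() is a made-up stand-in for alloc_pages_bulk():

#include <stdio.h>
#include <stdlib.h>

static int fake_bulk_alloc(int want, void **out)
{
	int got = want > 3 ? 3 : want;	/* pretend the allocator caps at 3 */

	for (int i = 0; i < got; i++)
		out[i] = malloc(16);
	return got;
}

static void free_bulk(void **pages, int nr)
{
	for (int i = 0; i < nr; i++) {
		free(pages[i]);
		pages[i] = NULL;
	}
}

static int alloc_bulk_all(void **pages, int nr_pages)
{
	void **p = pages;
	int nr_total = nr_pages;

	while (nr_pages) {
		int got = fake_bulk_alloc(nr_pages, p);

		if (!got) {
			/* Total failure: roll back whatever was allocated. */
			free_bulk(pages, nr_total - nr_pages);
			return -1;
		}
		p += got;
		nr_pages -= got;
	}
	return 0;
}

int main(void)
{
	void *pages[10];

	if (!alloc_bulk_all(pages, 10))
		printf("allocated all 10 in capped batches\n");
	free_bulk(pages, 10);
	return 0;
}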
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index 220b5d4c6876..b9382b5b6a37 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -26,6 +26,7 @@
#include <linux/slab.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/types.h>
#include <linux/vmalloc.h>
#include <linux/bug.h>
@@ -45,7 +46,7 @@ void __init kasan_init_sw_tags(void)
kasan_init_tags();
pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n",
- kasan_stack_collection_enabled() ? "on" : "off");
+ str_on_off(kasan_stack_collection_enabled()));
}
/*
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 67fc321db79b..102048821c22 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -21,6 +21,7 @@
#include <linux/log2.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
+#include <linux/nodemask.h>
#include <linux/notifier.h>
#include <linux/panic_notifier.h>
#include <linux/random.h>
@@ -1084,6 +1085,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
* properties (e.g. reside in DMAable memory).
*/
if ((flags & GFP_ZONEMASK) ||
+ ((flags & __GFP_THISNODE) && num_online_nodes() > 1) ||
(s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
return NULL;
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index f65fb182466d..00034e37bc9f 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -20,6 +20,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/tracepoint.h>
#include <trace/events/printk.h>
@@ -88,7 +89,7 @@ struct expect_report {
static const char *get_access_type(const struct expect_report *r)
{
- return r->is_write ? "write" : "read";
+ return str_write_read(r->is_write);
}
/* Check observed report matches information in @r. */
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index 6370c5207d1a..10e6802a2edf 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -16,6 +16,7 @@
#include <linux/sprintf.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/sched/clock.h>
#include <trace/events/error_report.h>
@@ -184,7 +185,7 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show,
static const char *get_access_type(bool is_write)
{
- return is_write ? "write" : "read";
+ return str_write_read(is_write);
}
void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6f8d46d107b4..15203ea7d007 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -19,6 +19,7 @@
#include <linux/rcupdate_wait.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
+#include <linux/dax.h>
#include <linux/ksm.h>
#include <asm/tlb.h>
@@ -547,19 +548,6 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
}
}
-static bool is_refcount_suitable(struct folio *folio)
-{
- int expected_refcount = folio_mapcount(folio);
-
- if (!folio_test_anon(folio) || folio_test_swapcache(folio))
- expected_refcount += folio_nr_pages(folio);
-
- if (folio_test_private(folio))
- expected_refcount++;
-
- return folio_ref_count(folio) == expected_refcount;
-}
-
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte,
@@ -606,7 +594,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
/* See hpage_collapse_scan_pmd(). */
- if (folio_likely_mapped_shared(folio)) {
+ if (folio_maybe_mapped_shared(folio)) {
++shared;
if (cc->is_khugepaged &&
shared > khugepaged_max_ptes_shared) {
@@ -651,7 +639,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* but not from this process. The other process cannot write to
* the page, only trigger CoW.
*/
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
folio_unlock(folio);
result = SCAN_PAGE_COUNT;
goto out;
@@ -695,13 +683,13 @@ next:
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
+ trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
referenced, writable, result);
return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
- trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
+ trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
referenced, writable, result);
return result;
}
@@ -745,7 +733,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
ptep_clear(vma->vm_mm, address, _pte);
folio_remove_rmap_pte(src, src_page, vma);
spin_unlock(ptl);
- free_page_and_swap_cache(src_page);
+ free_folio_and_swap_cache(src);
}
}
@@ -947,17 +935,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
return SCAN_SUCCEED;
}
-static int find_pmd_or_thp_or_none(struct mm_struct *mm,
- unsigned long address,
- pmd_t **pmd)
+static inline int check_pmd_state(pmd_t *pmd)
{
- pmd_t pmde;
+ pmd_t pmde = pmdp_get_lockless(pmd);
- *pmd = mm_find_pmd(mm, address);
- if (!*pmd)
- return SCAN_PMD_NULL;
-
- pmde = pmdp_get_lockless(*pmd);
if (pmd_none(pmde))
return SCAN_PMD_NONE;
if (!pmd_present(pmde))
@@ -971,6 +952,17 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
return SCAN_SUCCEED;
}
+static int find_pmd_or_thp_or_none(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t **pmd)
+{
+ *pmd = mm_find_pmd(mm, address);
+ if (!*pmd)
+ return SCAN_PMD_NULL;
+
+ return check_pmd_state(*pmd);
+}
+
static int check_pmd_still_valid(struct mm_struct *mm,
unsigned long address,
pmd_t *pmd)
@@ -1234,7 +1226,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
__folio_mark_uptodate(folio);
pgtable = pmd_pgtable(_pmd);
- _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+ _pmd = folio_mk_pmd(folio, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
spin_lock(pmd_ptl);
@@ -1354,11 +1346,9 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
/*
* We treat a single page as shared if any part of the THP
- * is shared. "False negatives" from
- * folio_likely_mapped_shared() are not expected to matter
- * much in practice.
+ * is shared.
*/
- if (folio_likely_mapped_shared(folio)) {
+ if (folio_maybe_mapped_shared(folio)) {
++shared;
if (cc->is_khugepaged &&
shared > khugepaged_max_ptes_shared) {
@@ -1399,7 +1389,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* has excessive GUP pins (i.e. 512). Anyway the same check
* will be done again later the risk seems low.
*/
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
@@ -1432,7 +1422,7 @@ out_unmap:
*mmap_locked = false;
}
out:
- trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
+ trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced,
none_or_zero, result, unmapped);
return result;
}
@@ -1461,10 +1451,9 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
}
}
-#ifdef CONFIG_SHMEM
-/* hpage must be locked, and mmap_lock must be held */
+/* folio must be locked, and mmap_lock must be held */
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmdp, struct page *hpage)
+ pmd_t *pmdp, struct folio *folio, struct page *page)
{
struct vm_fault vmf = {
.vma = vma,
@@ -1473,13 +1462,12 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
.pmd = pmdp,
};
- VM_BUG_ON(!PageTransHuge(hpage));
mmap_assert_locked(vma->vm_mm);
- if (do_set_pmd(&vmf, hpage))
+ if (do_set_pmd(&vmf, folio, page))
return SCAN_FAIL;
- get_page(hpage);
+ folio_get(folio);
return SCAN_SUCCEED;
}
@@ -1686,7 +1674,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
maybe_install_pmd:
/* step 5: install pmd entry */
result = install_pmd
- ? set_huge_pmd(vma, haddr, pmd, &folio->page)
+ ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page)
: SCAN_SUCCEED;
goto drop_folio;
abort:
@@ -1720,7 +1708,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
pmd_t *pmd, pgt_pmd;
spinlock_t *pml;
spinlock_t *ptl;
- bool skipped_uffd = false;
+ bool success = false;
/*
* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
@@ -1757,6 +1745,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
mmu_notifier_invalidate_range_start(&range);
pml = pmd_lock(mm, pmd);
+ /*
+ * The lock of new_folio is still held; the page fault path will
+ * block on it, which prevents the pte entries from
+ * being set again. So even though the old empty PTE page may be
+ * concurrently freed and a new PTE page is filled into the pmd
+ * entry, it is still empty and can be removed.
+ *
+ * So here we only need to recheck if the state of pmd entry
+ * still meets our requirements, rather than checking pmd_same()
+ * like elsewhere.
+ */
+ if (check_pmd_state(pmd) != SCAN_SUCCEED)
+ goto drop_pml;
ptl = pte_lockptr(mm, pmd);
if (ptl != pml)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
@@ -1770,20 +1771,20 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* repeating the anon_vma check protects from one category,
* and repeating the userfaultfd_wp() check from another.
*/
- if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
- skipped_uffd = true;
- } else {
+ if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
pmdp_get_lockless_sync();
+ success = true;
}
if (ptl != pml)
spin_unlock(ptl);
+drop_pml:
spin_unlock(pml);
mmu_notifier_invalidate_range_end(&range);
- if (!skipped_uffd) {
+ if (success) {
mm_dec_nr_ptes(mm);
page_table_check_pte_clear_range(mm, addr, pgt_pmd);
pte_free_defer(mm, pmd_pgtable(pgt_pmd));
@@ -1837,6 +1838,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
if (result != SCAN_SUCCEED)
goto out;
+ mapping_set_update(&xas, mapping);
+
__folio_set_locked(new_folio);
if (is_shmem)
__folio_set_swapbacked(new_folio);
@@ -2277,6 +2280,17 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
continue;
}
+ if (!folio_try_get(folio)) {
+ xas_reset(&xas);
+ continue;
+ }
+
+ if (unlikely(folio != xas_reload(&xas))) {
+ folio_put(folio);
+ xas_reset(&xas);
+ continue;
+ }
+
if (folio_order(folio) == HPAGE_PMD_ORDER &&
folio->index == start) {
/* Maybe PMD-mapped */
@@ -2287,23 +2301,27 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
* it's safe to skip LRU and refcount checks before
* returning.
*/
+ folio_put(folio);
break;
}
node = folio_nid(folio);
if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
+ folio_put(folio);
break;
}
cc->node_load[node]++;
if (!folio_test_lru(folio)) {
result = SCAN_PAGE_LRU;
+ folio_put(folio);
break;
}
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
+ folio_put(folio);
break;
}
@@ -2315,6 +2333,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
*/
present += folio_nr_pages(folio);
+ folio_put(folio);
if (need_resched()) {
xas_pause(&xas);
@@ -2336,14 +2355,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
return result;
}
-#else
-static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
- struct file *file, pgoff_t start,
- struct collapse_control *cc)
-{
- BUILD_BUG();
-}
-#endif
static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
struct collapse_control *cc)
@@ -2419,7 +2430,7 @@ skip:
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+ if (!vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
@@ -2765,7 +2776,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
- if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+ if (!vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma, addr);
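In the hpage_collapse_scan_file() changes above, the scan now takes its own speculative reference with folio_try_get(), so the refcount check becomes folio_expected_ref_count(folio) + 1 != folio_ref_count(folio). A small userspace sketch of why that extra +1 is needed; the counts are illustrative and refcount_suitable() is a hypothetical helper:

#include <stdio.h>
#include <stdbool.h>

static bool refcount_suitable(int expected, int actual, bool we_hold_a_ref)
{
	/* Account for the reference the scan itself is holding, if any. */
	return expected + (we_hold_a_ref ? 1 : 0) == actual;
}

int main(void)
{
	int mapcount = 1, pagecache_refs = 1;
	int expected = mapcount + pagecache_refs;	/* held by the rest of the kernel */
	int actual = expected + 1;			/* +1: the scan's folio_try_get() */

	printf("without accounting our ref: %s\n",
	       refcount_suitable(expected, actual, false) ? "collapse" : "skip");
	printf("with accounting our ref:    %s\n",
	       refcount_suitable(expected, actual, true) ? "collapse" : "skip");
	return 0;
}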
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2a945c07ae99..8d588e685311 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -210,13 +210,11 @@ static struct kmem_cache *object_cache;
static struct kmem_cache *scan_area_cache;
/* set if tracing memory operations is enabled */
-static int kmemleak_enabled = 1;
+static int kmemleak_enabled __read_mostly = 1;
/* same as above but only for the kmemleak_free() callback */
-static int kmemleak_free_enabled = 1;
+static int kmemleak_free_enabled __read_mostly = 1;
/* set in the late_initcall if there were no errors */
static int kmemleak_late_initialized;
-/* set if a kmemleak warning was issued */
-static int kmemleak_warning;
/* set if a fatal kmemleak error has occurred */
static int kmemleak_error;
@@ -254,7 +252,6 @@ static void kmemleak_disable(void);
#define kmemleak_warn(x...) do { \
pr_warn(x); \
dump_stack(); \
- kmemleak_warning = 1; \
} while (0)
/*
@@ -325,8 +322,6 @@ static void hex_dump_object(struct seq_file *seq,
* sufficient references to it (count >= min_count)
* - black - ignore, it doesn't contain references (e.g. text section)
* (min_count == -1). No function defined for this color.
- * Newly created objects don't have any color assigned (object->count == -1)
- * before the next memory scan when they become white.
*/
static bool color_white(const struct kmemleak_object *object)
{
@@ -352,6 +347,15 @@ static bool unreferenced_object(struct kmemleak_object *object)
jiffies_last_scan);
}
+static const char *__object_type_str(struct kmemleak_object *object)
+{
+ if (object->flags & OBJECT_PHYS)
+ return " (phys)";
+ if (object->flags & OBJECT_PERCPU)
+ return " (percpu)";
+ return "";
+}
+
/*
* Printing of the unreferenced objects information to the seq file. The
* print_unreferenced function must be called with the object->lock held.
@@ -364,8 +368,9 @@ static void print_unreferenced(struct seq_file *seq,
unsigned int nr_entries;
nr_entries = stack_depot_fetch(object->trace_handle, &entries);
- warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
- object->pointer, object->size);
+ warn_or_seq_printf(seq, "unreferenced object%s 0x%08lx (size %zu):\n",
+ __object_type_str(object),
+ object->pointer, object->size);
warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n",
object->comm, object->pid, object->jiffies);
hex_dump_object(seq, object);
@@ -373,7 +378,7 @@ static void print_unreferenced(struct seq_file *seq,
for (i = 0; i < nr_entries; i++) {
void *ptr = (void *)entries[i];
- warn_or_seq_printf(seq, " [<%pK>] %pS\n", ptr, ptr);
+ warn_or_seq_printf(seq, " %pS\n", ptr);
}
}
@@ -384,10 +389,10 @@ static void print_unreferenced(struct seq_file *seq,
*/
static void dump_object_info(struct kmemleak_object *object)
{
- pr_notice("Object 0x%08lx (size %zu):\n",
- object->pointer, object->size);
+ pr_notice("Object%s 0x%08lx (size %zu):\n",
+ __object_type_str(object), object->pointer, object->size);
pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
- object->comm, object->pid, object->jiffies);
+ object->comm, object->pid, object->jiffies);
pr_notice(" min_count = %d\n", object->min_count);
pr_notice(" count = %d\n", object->count);
pr_notice(" flags = 0x%x\n", object->flags);
@@ -1093,7 +1098,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
pr_debug("%s(0x%px, %zu)\n", __func__, ptr, size);
if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr))
- create_object_percpu((__force unsigned long)ptr, size, 0, gfp);
+ create_object_percpu((__force unsigned long)ptr, size, 1, gfp);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
@@ -1242,6 +1247,20 @@ void __ref kmemleak_transient_leak(const void *ptr)
EXPORT_SYMBOL(kmemleak_transient_leak);
/**
+ * kmemleak_ignore_percpu - similar to kmemleak_ignore but taking a percpu
+ * address argument
+ * @ptr: percpu address of the object
+ */
+void __ref kmemleak_ignore_percpu(const void __percpu *ptr)
+{
+ pr_debug("%s(0x%px)\n", __func__, ptr);
+
+ if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr))
+ make_black_object((unsigned long)ptr, OBJECT_PERCPU);
+}
+EXPORT_SYMBOL_GPL(kmemleak_ignore_percpu);
+
+/**
* kmemleak_ignore - ignore an allocated object
* @ptr: pointer to beginning of the object
*
@@ -1689,7 +1708,7 @@ static void kmemleak_scan(void)
unsigned long phys = object->pointer;
if (PHYS_PFN(phys) < min_low_pfn ||
- PHYS_PFN(phys + object->size) >= max_low_pfn)
+ PHYS_PFN(phys + object->size) > max_low_pfn)
__paint_it(object, KMEMLEAK_BLACK);
}
@@ -1855,7 +1874,7 @@ static int kmemleak_scan_thread(void *arg)
* Wait before the first scan to allow the system to fully initialize.
*/
if (first_run) {
- signed long timeout = msecs_to_jiffies(SECS_FIRST_SCAN * 1000);
+ signed long timeout = secs_to_jiffies(SECS_FIRST_SCAN);
first_run = 0;
while (timeout && !kthread_should_stop())
timeout = schedule_timeout_interruptible(timeout);
@@ -1998,25 +2017,41 @@ static int kmemleak_open(struct inode *inode, struct file *file)
return seq_open(file, &kmemleak_seq_ops);
}
-static int dump_str_object_info(const char *str)
+static bool __dump_str_object_info(unsigned long addr, unsigned int objflags)
{
unsigned long flags;
struct kmemleak_object *object;
+
+ object = __find_and_get_object(addr, 1, objflags);
+ if (!object)
+ return false;
+
+ raw_spin_lock_irqsave(&object->lock, flags);
+ dump_object_info(object);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+
+ put_object(object);
+
+ return true;
+}
+
+static int dump_str_object_info(const char *str)
+{
unsigned long addr;
+ bool found = false;
if (kstrtoul(str, 0, &addr))
return -EINVAL;
- object = find_and_get_object(addr, 0);
- if (!object) {
+
+ found |= __dump_str_object_info(addr, 0);
+ found |= __dump_str_object_info(addr, OBJECT_PHYS);
+ found |= __dump_str_object_info(addr, OBJECT_PERCPU);
+
+ if (!found) {
pr_info("Unknown object at 0x%08lx\n", addr);
return -EINVAL;
}
- raw_spin_lock_irqsave(&object->lock, flags);
- dump_object_info(object);
- raw_spin_unlock_irqrestore(&object->lock, flags);
-
- put_object(object);
return 0;
}
@@ -2241,7 +2276,7 @@ void __init kmemleak_init(void)
return;
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
- jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
+ jiffies_scan_wait = secs_to_jiffies(SECS_SCAN_WAIT);
object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
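/*
 * A minimal usage sketch for the kmemleak_ignore_percpu() interface added
 * above (a hypothetical caller, not part of this patch). A percpu object
 * that is reachable only through an address kmemleak cannot scan is marked
 * black so it never shows up as a false-positive leak.
 */
#include <linux/percpu.h>
#include <linux/kmemleak.h>

struct foo_stats {
	u64 hits;
};

static struct foo_stats __percpu *foo_alloc_stats(void)
{
	struct foo_stats __percpu *stats = alloc_percpu(struct foo_stats);

	if (stats)
		kmemleak_ignore_percpu(stats);	/* tracked elsewhere, never report */
	return stats;
}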
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
index a495debf1436..1ea711786c52 100644
--- a/mm/kmsan/core.c
+++ b/mm/kmsan/core.c
@@ -159,8 +159,8 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id)
* Make sure we have enough spare bits in @id to hold the UAF bit and
* the chain depth.
*/
- BUILD_BUG_ON(
- (1 << STACK_DEPOT_EXTRA_BITS) <= (KMSAN_MAX_ORIGIN_DEPTH << 1));
+ BUILD_BUG_ON((1 << STACK_DEPOT_EXTRA_BITS) <=
+ (KMSAN_MAX_ORIGIN_DEPTH << 1));
extra_bits = stack_depot_get_extra_bits(id);
depth = kmsan_depth_from_eb(extra_bits);
@@ -274,11 +274,9 @@ void kmsan_internal_check_memory(void *addr, size_t size,
* bytes before, report them.
*/
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size,
cur_off_start, pos - 1, user_addr,
reason);
- kmsan_leave_runtime();
}
cur_origin = 0;
cur_off_start = -1;
@@ -292,11 +290,9 @@ void kmsan_internal_check_memory(void *addr, size_t size,
* poisoned bytes before, report them.
*/
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size,
cur_off_start, pos + i - 1,
user_addr, reason);
- kmsan_leave_runtime();
}
cur_origin = 0;
cur_off_start = -1;
@@ -312,11 +308,9 @@ void kmsan_internal_check_memory(void *addr, size_t size,
*/
if (cur_origin != new_origin) {
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size,
cur_off_start, pos + i - 1,
user_addr, reason);
- kmsan_leave_runtime();
}
cur_origin = new_origin;
cur_off_start = pos + i;
@@ -326,10 +320,8 @@ void kmsan_internal_check_memory(void *addr, size_t size,
}
KMSAN_WARN_ON(pos != size);
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1,
user_addr, reason);
- kmsan_leave_runtime();
}
}
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 3ea50f09311f..97de3d6194f0 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -114,9 +114,7 @@ void kmsan_kfree_large(const void *ptr)
kmsan_enter_runtime();
page = virt_to_head_page((void *)ptr);
KMSAN_WARN_ON(ptr != page_address(page));
- kmsan_internal_poison_memory((void *)ptr,
- page_size(page),
- GFP_KERNEL,
+ kmsan_internal_poison_memory((void *)ptr, page_size(page), GFP_KERNEL,
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
}
@@ -277,8 +275,10 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy,
* Don't check anything, just copy the shadow of the copied
* bytes.
*/
+ kmsan_enter_runtime();
kmsan_internal_memmove_metadata((void *)to, (void *)from,
to_copy - left);
+ kmsan_leave_runtime();
}
user_access_restore(ua_flags);
}
@@ -357,6 +357,7 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
size -= to_go;
}
}
+EXPORT_SYMBOL_GPL(kmsan_handle_dma);
void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
enum dma_data_direction dir)
diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
index 10f52c085e6c..b14ce3417e65 100644
--- a/mm/kmsan/init.c
+++ b/mm/kmsan/init.c
@@ -35,8 +35,7 @@ static void __init kmsan_record_future_shadow_range(void *start, void *end)
KMSAN_WARN_ON(future_index == NUM_FUTURE_RANGES);
KMSAN_WARN_ON((nstart >= nend) ||
/* Virtual address 0 is valid on s390. */
- (!IS_ENABLED(CONFIG_S390) && !nstart) ||
- !nend);
+ (!IS_ENABLED(CONFIG_S390) && !nstart) || !nend);
nstart = ALIGN_DOWN(nstart, PAGE_SIZE);
nend = ALIGN(nend, PAGE_SIZE);
diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c
index 02a405e55d6c..69f0a57a401c 100644
--- a/mm/kmsan/instrumentation.c
+++ b/mm/kmsan/instrumentation.c
@@ -312,13 +312,9 @@ EXPORT_SYMBOL(__msan_unpoison_alloca);
void __msan_warning(u32 origin);
void __msan_warning(u32 origin)
{
- if (!kmsan_enabled || kmsan_in_runtime())
- return;
- kmsan_enter_runtime();
kmsan_report(origin, /*address*/ NULL, /*size*/ 0,
/*off_first*/ 0, /*off_last*/ 0, /*user_addr*/ NULL,
REASON_ANY);
- kmsan_leave_runtime();
}
EXPORT_SYMBOL(__msan_warning);
diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h
index 29555a8bc315..bc3d1810f352 100644
--- a/mm/kmsan/kmsan.h
+++ b/mm/kmsan/kmsan.h
@@ -121,7 +121,6 @@ static __always_inline void kmsan_leave_runtime(void)
KMSAN_WARN_ON(--ctx->kmsan_in_runtime);
}
-depot_stack_handle_t kmsan_save_stack(void);
depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
unsigned int extra_bits);
diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
index 9733a22c46c1..c6c5b2bbede0 100644
--- a/mm/kmsan/kmsan_test.c
+++ b/mm/kmsan/kmsan_test.c
@@ -732,3 +732,4 @@ kunit_test_suites(&kmsan_test_suite);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Alexander Potapenko <glider@google.com>");
+MODULE_DESCRIPTION("Test cases for KMSAN");
diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c
index 94a3303fb65e..d6853ce08954 100644
--- a/mm/kmsan/report.c
+++ b/mm/kmsan/report.c
@@ -157,14 +157,14 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size,
unsigned long ua_flags;
bool is_uaf;
- if (!kmsan_enabled)
+ if (!kmsan_enabled || kmsan_in_runtime())
return;
if (current->kmsan_ctx.depth)
return;
if (!origin)
return;
- kmsan_disable_current();
+ kmsan_enter_runtime();
ua_flags = user_access_save();
raw_spin_lock(&kmsan_report_lock);
pr_err("=====================================================\n");
@@ -217,5 +217,5 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size,
if (panic_on_kmsan)
panic("kmsan.panic set ...\n");
user_access_restore(ua_flags);
- kmsan_enable_current();
+ kmsan_leave_runtime();
}
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index 9c58f081d84f..54f3c3c962f0 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -207,8 +207,7 @@ void kmsan_free_page(struct page *page, unsigned int order)
if (!kmsan_enabled || kmsan_in_runtime())
return;
kmsan_enter_runtime();
- kmsan_internal_poison_memory(page_address(page),
- page_size(page),
+ kmsan_internal_poison_memory(page_address(page), page_size(page),
GFP_KERNEL,
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
@@ -248,17 +247,19 @@ int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
kmsan_enter_runtime();
mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot,
s_pages, page_shift);
+ kmsan_leave_runtime();
if (mapped) {
err = mapped;
goto ret;
}
+ kmsan_enter_runtime();
mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot,
o_pages, page_shift);
+ kmsan_leave_runtime();
if (mapped) {
err = mapped;
goto ret;
}
- kmsan_leave_runtime();
flush_tlb_kernel_range(shadow_start, shadow_end);
flush_tlb_kernel_range(origin_start, origin_end);
flush_cache_vmap(shadow_start, shadow_end);
@@ -280,12 +281,8 @@ void __init kmsan_init_alloc_meta_for_range(void *start, void *end)
start = (void *)PAGE_ALIGN_DOWN((u64)start);
size = PAGE_ALIGN((u64)end - (u64)start);
- shadow = memblock_alloc(size, PAGE_SIZE);
- origin = memblock_alloc(size, PAGE_SIZE);
-
- if (!shadow || !origin)
- panic("%s: Failed to allocate metadata memory for early boot range of size %llu",
- __func__, size);
+ shadow = memblock_alloc_or_panic(size, PAGE_SIZE);
+ origin = memblock_alloc_or_panic(size, PAGE_SIZE);
for (u64 addr = 0; addr < size; addr += PAGE_SIZE) {
page = virt_to_page_or_null((char *)start + addr);
diff --git a/mm/ksm.c b/mm/ksm.c
index 31a9bc365437..8583fb91ef13 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1270,8 +1270,15 @@ static int write_protect_page(struct vm_area_struct *vma, struct folio *folio,
if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
goto out_unlock;
- anon_exclusive = PageAnonExclusive(&folio->page);
entry = ptep_get(pvmw.pte);
+ /*
+ * Handle PFN swap PTEs, such as device-exclusive ones, that actually
+ * map pages: give up just like the next folio_walk would.
+ */
+ if (unlikely(!pte_present(entry)))
+ goto out_unlock;
+
+ anon_exclusive = PageAnonExclusive(&folio->page);
if (pte_write(entry) || pte_dirty(entry) ||
anon_exclusive || mm_tlb_flush_pending(mm)) {
swapped = folio_test_swapcache(folio);
@@ -3262,6 +3269,25 @@ static void wait_while_offlining(void)
#endif /* CONFIG_MEMORY_HOTREMOVE */
#ifdef CONFIG_PROC_FS
+/*
+ * The process is mergeable only if any VMA is currently
+ * applicable to KSM.
+ *
+ * The mmap lock must be held in read mode.
+ */
+bool ksm_process_mergeable(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ mmap_assert_locked(mm);
+ VMA_ITERATOR(vmi, mm, 0);
+ for_each_vma(vmi, vma)
+ if (vma->vm_flags & VM_MERGEABLE)
+ return true;
+
+ return false;
+}
+
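/*
 * A minimal sketch of the locking contract stated above (a hypothetical
 * caller, not part of this patch): ksm_process_mergeable() must only be
 * invoked with the target mm's mmap lock held for reading.
 */
static bool task_has_ksm_vmas(struct mm_struct *mm)
{
	bool mergeable;

	mmap_read_lock(mm);
	mergeable = ksm_process_mergeable(mm);
	mmap_read_unlock(mm);

	return mergeable;
}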
long ksm_process_profit(struct mm_struct *mm)
{
return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE -
diff --git a/mm/list_lru.c b/mm/list_lru.c
index f93ada6a207b..490473af3122 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -77,7 +77,6 @@ again:
spin_lock(&l->lock);
nr_items = READ_ONCE(l->nr_items);
if (likely(nr_items != LONG_MIN)) {
- WARN_ON(nr_items < 0);
rcu_read_unlock();
return l;
}
@@ -450,6 +449,7 @@ static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid,
list_splice_init(&src->list, &dst->list);
if (src->nr_items) {
+ WARN_ON(src->nr_items < 0);
dst->nr_items += src->nr_items;
set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
}
@@ -510,7 +510,7 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
gfp_t gfp)
{
unsigned long flags;
- struct list_lru_memcg *mlru;
+ struct list_lru_memcg *mlru = NULL;
struct mem_cgroup *pos, *parent;
XA_STATE(xas, &lru->xa, 0);
@@ -535,9 +535,11 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
parent = parent_mem_cgroup(pos);
}
- mlru = memcg_init_list_lru_one(lru, gfp);
- if (!mlru)
- return -ENOMEM;
+ if (!mlru) {
+ mlru = memcg_init_list_lru_one(lru, gfp);
+ if (!mlru)
+ return -ENOMEM;
+ }
xas_set(&xas, pos->kmemcg_id);
do {
xas_lock_irqsave(&xas, flags);
@@ -548,10 +550,11 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
}
xas_unlock_irqrestore(&xas, flags);
} while (xas_nomem(&xas, gfp));
- if (mlru)
- kfree(mlru);
} while (pos != memcg && !css_is_dying(&pos->css));
+ if (unlikely(mlru))
+ kfree(mlru);
+
return xas_error(&xas);
}
#else
diff --git a/mm/maccess.c b/mm/maccess.c
index 8f0906180a94..831b4dd7296c 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -196,7 +196,7 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
if (ret >= count) {
ret = count;
dst[ret - 1] = '\0';
- } else if (ret > 0) {
+ } else if (ret >= 0) {
ret++;
}
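/*
 * A small sketch of the return convention the one-character fix above
 * restores (hypothetical caller, not part of this patch). On success the
 * function reports the number of bytes written including the trailing NUL,
 * so copying an empty user string now yields 1 instead of 0.
 */
static void example_probe_string(char *buf, long len, const char __user *uaddr)
{
	long ret = strncpy_from_user_nofault(buf, uaddr, len);

	if (ret <= 0) {			/* faulted, or len <= 0 */
		if (len > 0)
			buf[0] = '\0';
		return;
	}
	/* ret bytes landed in buf, NUL included: strlen(buf) == ret - 1 */
}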
diff --git a/mm/madvise.c b/mm/madvise.c
index 0ceae57da7da..1d44a35ae85c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -48,6 +48,11 @@ struct madvise_walk_private {
bool pageout;
};
+struct madvise_behavior {
+ int behavior;
+ struct mmu_gather *tlb;
+};
+
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
* take mmap_lock for writing. Others, which simply traverse vmas, need
@@ -387,7 +392,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
folio = pmd_folio(orig_pmd);
/* Do not interfere with other mappings of this folio */
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
goto huge_unlock;
if (pageout_anon_only_filter && !folio_test_anon(folio))
@@ -486,7 +491,7 @@ restart:
if (nr < folio_nr_pages(folio)) {
int err;
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
continue;
if (pageout_anon_only_filter && !folio_test_anon(folio))
continue;
@@ -503,6 +508,7 @@ restart:
pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
@@ -721,7 +727,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (nr < folio_nr_pages(folio)) {
int err;
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
continue;
if (!folio_trylock(folio))
continue;
@@ -736,6 +742,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
start_pte = pte;
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
@@ -794,12 +801,13 @@ static const struct mm_walk_ops madvise_free_walk_ops = {
.walk_lock = PGWALK_RDLOCK,
};
-static int madvise_free_single_vma(struct vm_area_struct *vma,
+static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
+ struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr)
{
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
- struct mmu_gather tlb;
+ struct mmu_gather *tlb = madv_behavior->tlb;
/* MADV_FREE works for only anon vma at the moment */
if (!vma_is_anonymous(vma))
@@ -815,17 +823,14 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
range.start, range.end);
lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(&range);
- tlb_start_vma(&tlb, vma);
+ tlb_start_vma(tlb, vma);
walk_page_range(vma->vm_mm, range.start, range.end,
- &madvise_free_walk_ops, &tlb);
- tlb_end_vma(&tlb, vma);
+ &madvise_free_walk_ops, tlb);
+ tlb_end_vma(tlb, vma);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb);
-
return 0;
}
@@ -848,10 +853,17 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
* An interface that causes the system to free clean pages and flush
* dirty pages is already available as msync(MS_INVALIDATE).
*/
-static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
+static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior,
+ struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- zap_page_range_single(vma, start, end - start, NULL);
+ struct zap_details details = {
+ .reclaim_pt = true,
+ .even_cows = true,
+ };
+
+ zap_page_range_single_batched(
+ madv_behavior->tlb, vma, start, end - start, &details);
return 0;
}
@@ -888,8 +900,9 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
static long madvise_dontneed_free(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
- int behavior)
+ struct madvise_behavior *madv_behavior)
{
+ int behavior = madv_behavior->behavior;
struct mm_struct *mm = vma->vm_mm;
*prev = vma;
@@ -928,13 +941,23 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
*/
end = vma->vm_end;
}
- VM_WARN_ON(start >= end);
+ /*
+ * If the memory region between start and end was
+ * originally backed by 4kB pages and then remapped to
+ * be backed by hugepages while mmap_lock was dropped,
+ * the adjustment for hugetlb vma above may have rounded
+ * end down to the start address.
+ */
+ if (start == end)
+ return 0;
+ VM_WARN_ON(start > end);
}
if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
- return madvise_dontneed_single_vma(vma, start, end);
+ return madvise_dontneed_single_vma(
+ madv_behavior, vma, start, end);
else if (behavior == MADV_FREE)
- return madvise_free_single_vma(vma, start, end);
+ return madvise_free_single_vma(madv_behavior, vma, start, end);
else
return -EINVAL;
}
@@ -1037,13 +1060,7 @@ static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
if (!allow_locked)
disallowed |= VM_LOCKED;
- if (!vma_is_anonymous(vma))
- return false;
-
- if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE)
- return false;
-
- return true;
+ return !(vma->vm_flags & disallowed);
}
static bool is_guard_pte_marker(pte_t ptent)
@@ -1241,8 +1258,10 @@ static long madvise_guard_remove(struct vm_area_struct *vma,
static int madvise_vma_behavior(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
- unsigned long behavior)
+ void *behavior_arg)
{
+ struct madvise_behavior *arg = behavior_arg;
+ int behavior = arg->behavior;
int error;
struct anon_vma_name *anon_name;
unsigned long new_flags = vma->vm_flags;
@@ -1262,7 +1281,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
case MADV_FREE:
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
- return madvise_dontneed_free(vma, prev, start, end, behavior);
+ return madvise_dontneed_free(vma, prev, start, end, arg);
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
break;
@@ -1384,7 +1403,32 @@ static int madvise_inject_error(int behavior,
return 0;
}
-#endif
+
+static bool is_memory_failure(int behavior)
+{
+ switch (behavior) {
+ case MADV_HWPOISON:
+ case MADV_SOFT_OFFLINE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+#else
+
+static int madvise_inject_error(int behavior,
+ unsigned long start, unsigned long end)
+{
+ return 0;
+}
+
+static bool is_memory_failure(int behavior)
+{
+ return false;
+}
+
+#endif /* CONFIG_MEMORY_FAILURE */
static bool
madvise_behavior_valid(int behavior)
@@ -1454,10 +1498,10 @@ static bool process_madvise_remote_valid(int behavior)
*/
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned long arg,
+ unsigned long end, void *arg,
int (*visit)(struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
- unsigned long end, unsigned long arg))
+ unsigned long end, void *arg))
{
struct vm_area_struct *vma;
struct vm_area_struct *prev;
@@ -1515,7 +1559,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
static int madvise_vma_anon_name(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
- unsigned long anon_name)
+ void *anon_name)
{
int error;
@@ -1524,7 +1568,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
return -EBADF;
error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
- (struct anon_vma_name *)anon_name);
+ anon_name);
/*
* madvise() returns EAGAIN if kernel resources, such as
@@ -1556,10 +1600,142 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
if (end == start)
return 0;
- return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
+ return madvise_walk_vmas(mm, start, end, anon_name,
madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
+
+static int madvise_lock(struct mm_struct *mm, int behavior)
+{
+ if (is_memory_failure(behavior))
+ return 0;
+
+ if (madvise_need_mmap_write(behavior)) {
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ } else {
+ mmap_read_lock(mm);
+ }
+ return 0;
+}
+
+static void madvise_unlock(struct mm_struct *mm, int behavior)
+{
+ if (is_memory_failure(behavior))
+ return;
+
+ if (madvise_need_mmap_write(behavior))
+ mmap_write_unlock(mm);
+ else
+ mmap_read_unlock(mm);
+}
+
+static bool madvise_batch_tlb_flush(int behavior)
+{
+ switch (behavior) {
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ case MADV_FREE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static void madvise_init_tlb(struct madvise_behavior *madv_behavior,
+ struct mm_struct *mm)
+{
+ if (madvise_batch_tlb_flush(madv_behavior->behavior))
+ tlb_gather_mmu(madv_behavior->tlb, mm);
+}
+
+static void madvise_finish_tlb(struct madvise_behavior *madv_behavior)
+{
+ if (madvise_batch_tlb_flush(madv_behavior->behavior))
+ tlb_finish_mmu(madv_behavior->tlb);
+}
+
+static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
+{
+ size_t len;
+
+ if (!madvise_behavior_valid(behavior))
+ return false;
+
+ if (!PAGE_ALIGNED(start))
+ return false;
+ len = PAGE_ALIGN(len_in);
+
+ /* Check to see whether len was rounded up from small -ve to zero */
+ if (len_in && !len)
+ return false;
+
+ if (start + len < start)
+ return false;
+
+ return true;
+}
+
+/*
+ * madvise_should_skip() - Return whether the request is invalid or a no-op.
+ * @start: Start address of madvise-requested address range.
+ * @len_in: Length of madvise-requested address range.
+ * @behavior: Requested madvise behavor.
+ * @err: Pointer to store an error code from the check.
+ *
+ * If the specified behaviour is invalid or nothing would occur, we skip the
+ * operation. This function returns true in those cases, otherwise false. In
+ * the former case we store an error in @err.
+ */
+static bool madvise_should_skip(unsigned long start, size_t len_in,
+ int behavior, int *err)
+{
+ if (!is_valid_madvise(start, len_in, behavior)) {
+ *err = -EINVAL;
+ return true;
+ }
+ if (start + PAGE_ALIGN(len_in) == start) {
+ *err = 0;
+ return true;
+ }
+ return false;
+}
+
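/*
 * A tiny sketch of the fast path madvise_should_skip() creates (a
 * hypothetical caller, not part of this patch). Invalid or zero-length
 * requests are answered before any mmap lock is taken, with *err holding
 * -EINVAL or 0 respectively.
 */
static int example_madvise_nop(unsigned long start)
{
	int err;

	if (madvise_should_skip(start, 0, MADV_DONTNEED, &err))
		return err;	/* zero length: err == 0, no lock taken */

	return -EINVAL;		/* not reached for a zero-length request */
}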
+static bool is_madvise_populate(int behavior)
+{
+ switch (behavior) {
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int madvise_do_behavior(struct mm_struct *mm,
+ unsigned long start, size_t len_in,
+ struct madvise_behavior *madv_behavior)
+{
+ int behavior = madv_behavior->behavior;
+ struct blk_plug plug;
+ unsigned long end;
+ int error;
+
+ if (is_memory_failure(behavior))
+ return madvise_inject_error(behavior, start, start + len_in);
+ start = untagged_addr_remote(mm, start);
+ end = start + PAGE_ALIGN(len_in);
+
+ blk_start_plug(&plug);
+ if (is_madvise_populate(behavior))
+ error = madvise_populate(mm, start, end, behavior);
+ else
+ error = madvise_walk_vmas(mm, start, end, madv_behavior,
+ madvise_vma_behavior);
+ blk_finish_plug(&plug);
+ return error;
+}
+
/*
* The madvise(2) system call.
*
@@ -1634,63 +1810,22 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
*/
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
- unsigned long end;
int error;
- int write;
- size_t len;
- struct blk_plug plug;
-
- if (!madvise_behavior_valid(behavior))
- return -EINVAL;
-
- if (!PAGE_ALIGNED(start))
- return -EINVAL;
- len = PAGE_ALIGN(len_in);
-
- /* Check to see whether len was rounded up from small -ve to zero */
- if (len_in && !len)
- return -EINVAL;
-
- end = start + len;
- if (end < start)
- return -EINVAL;
-
- if (end == start)
- return 0;
-
-#ifdef CONFIG_MEMORY_FAILURE
- if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
- return madvise_inject_error(behavior, start, start + len_in);
-#endif
-
- write = madvise_need_mmap_write(behavior);
- if (write) {
- if (mmap_write_lock_killable(mm))
- return -EINTR;
- } else {
- mmap_read_lock(mm);
- }
-
- start = untagged_addr_remote(mm, start);
- end = start + len;
-
- blk_start_plug(&plug);
- switch (behavior) {
- case MADV_POPULATE_READ:
- case MADV_POPULATE_WRITE:
- error = madvise_populate(mm, start, end, behavior);
- break;
- default:
- error = madvise_walk_vmas(mm, start, end, behavior,
- madvise_vma_behavior);
- break;
- }
- blk_finish_plug(&plug);
+ struct mmu_gather tlb;
+ struct madvise_behavior madv_behavior = {
+ .behavior = behavior,
+ .tlb = &tlb,
+ };
- if (write)
- mmap_write_unlock(mm);
- else
- mmap_read_unlock(mm);
+ if (madvise_should_skip(start, len_in, behavior, &error))
+ return error;
+ error = madvise_lock(mm, behavior);
+ if (error)
+ return error;
+ madvise_init_tlb(&madv_behavior, mm);
+ error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(mm, behavior);
return error;
}
@@ -1706,19 +1841,36 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
{
ssize_t ret = 0;
size_t total_len;
+ struct mmu_gather tlb;
+ struct madvise_behavior madv_behavior = {
+ .behavior = behavior,
+ .tlb = &tlb,
+ };
total_len = iov_iter_count(iter);
+ ret = madvise_lock(mm, behavior);
+ if (ret)
+ return ret;
+ madvise_init_tlb(&madv_behavior, mm);
+
while (iov_iter_count(iter)) {
- ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
- iter_iov_len(iter), behavior);
+ unsigned long start = (unsigned long)iter_iov_addr(iter);
+ size_t len_in = iter_iov_len(iter);
+ int error;
+
+ if (madvise_should_skip(start, len_in, behavior, &error))
+ ret = error;
+ else
+ ret = madvise_do_behavior(mm, start, len_in,
+ &madv_behavior);
/*
* An madvise operation is attempting to restart the syscall,
* but we cannot proceed as it would not be correct to repeat
* the operation in aggregate, and would be surprising to the
* user.
*
- * As we have already dropped locks, it is safe to just loop and
+ * We drop and reacquire locks so it is safe to just loop and
* try again. We check for fatal signals in case we need exit
* early anyway.
*/
@@ -1727,13 +1879,24 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
ret = -EINTR;
break;
}
+
+ /* Drop and reacquire lock to unwind race. */
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(mm, behavior);
+ ret = madvise_lock(mm, behavior);
+ if (ret)
+ goto out;
+ madvise_init_tlb(&madv_behavior, mm);
continue;
}
if (ret < 0)
break;
iov_iter_advance(iter, iter_iov_len(iter));
}
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(mm, behavior);
+out:
ret = (total_len - iov_iter_count(iter)) ? : ret;
return ret;
diff --git a/mm/memblock.c b/mm/memblock.c
index 0389ce5cd281..154f1d73b61f 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -16,6 +16,12 @@
#include <linux/kmemleak.h>
#include <linux/seq_file.h>
#include <linux/memblock.h>
+#include <linux/mutex.h>
+
+#ifdef CONFIG_KEXEC_HANDOVER
+#include <linux/libfdt.h>
+#include <linux/kexec_handover.h>
+#endif /* CONFIG_KEXEC_HANDOVER */
#include <asm/sections.h>
#include <linux/io.h>
@@ -106,6 +112,13 @@ unsigned long min_low_pfn;
unsigned long max_pfn;
unsigned long long max_possible_pfn;
+#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH
+/* When set to true, only allocate from MEMBLOCK_KHO_SCRATCH ranges */
+static bool kho_scratch_only;
+#else
+#define kho_scratch_only false
+#endif
+
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
@@ -165,6 +178,10 @@ bool __init_memblock memblock_has_mirror(void)
static enum memblock_flags __init_memblock choose_memblock_flags(void)
{
+ /* skip non-scratch memory for kho early boot allocations */
+ if (kho_scratch_only)
+ return MEMBLOCK_KHO_SCRATCH;
+
return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
}
@@ -456,7 +473,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
min(new_area_start, memblock.current_limit),
new_alloc_size, PAGE_SIZE);
- new_array = addr ? __va(addr) : NULL;
+ if (addr) {
+ /* The memory may not have been accepted, yet. */
+ accept_memory(addr, new_alloc_size);
+
+ new_array = __va(addr);
+ } else {
+ new_array = NULL;
+ }
}
if (!addr) {
pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
@@ -491,7 +515,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
* needn't do it
*/
if (!use_slab)
- BUG_ON(memblock_reserve(addr, new_alloc_size));
+ BUG_ON(memblock_reserve_kern(addr, new_alloc_size));
/* Update slab flag */
*in_slab = use_slab;
@@ -641,7 +665,7 @@ repeat:
#ifdef CONFIG_NUMA
WARN_ON(nid != memblock_get_region_node(rgn));
#endif
- WARN_ON(flags != rgn->flags);
+ WARN_ON(flags != MEMBLOCK_NONE && flags != rgn->flags);
nr_new++;
if (insert) {
if (start_rgn == -1)
@@ -735,7 +759,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
/**
* memblock_validate_numa_coverage - check if amount of memory with
* no node ID assigned is less than a threshold
- * @threshold_bytes: maximal number of pages that can have unassigned node
+ * @threshold_bytes: maximal memory size that can have unassigned node
* ID (in bytes).
*
* A buggy firmware may report memory that does not belong to any node.
@@ -755,7 +779,7 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt
nr_pages += end_pfn - start_pfn;
}
- if ((nr_pages << PAGE_SHIFT) >= threshold_bytes) {
+ if ((nr_pages << PAGE_SHIFT) > threshold_bytes) {
mem_size_mb = memblock_phys_mem_size() >> 20;
pr_err("NUMA: no nodes coverage for %luMB of %luMB RAM\n",
(nr_pages << PAGE_SHIFT) >> 20, mem_size_mb);
@@ -901,14 +925,15 @@ int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
return memblock_remove_range(&memblock.reserved, base, size);
}
-int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+int __init_memblock __memblock_reserve(phys_addr_t base, phys_addr_t size,
+ int nid, enum memblock_flags flags)
{
phys_addr_t end = base + size - 1;
- memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
- &base, &end, (void *)_RET_IP_);
+ memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__,
+ &base, &end, nid, flags, (void *)_RET_IP_);
- return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
+ return memblock_add_range(&memblock.reserved, base, size, nid, flags);
}
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
@@ -923,6 +948,40 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
}
#endif
+#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH
+__init void memblock_set_kho_scratch_only(void)
+{
+ kho_scratch_only = true;
+}
+
+__init void memblock_clear_kho_scratch_only(void)
+{
+ kho_scratch_only = false;
+}
+
+__init void memmap_init_kho_scratch_pages(void)
+{
+ phys_addr_t start, end;
+ unsigned long pfn;
+ int nid;
+ u64 i;
+
+ if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT))
+ return;
+
+ /*
+ * Initialize struct pages for free scratch memory.
+ * The struct pages for reserved scratch memory will be set up in
+ * reserve_bootmem_region()
+ */
+ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
+ MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) {
+ for (pfn = PFN_UP(start); pfn < PFN_DOWN(end); pfn++)
+ init_deferred_page(pfn, nid);
+ }
+}
+#endif
+
/**
* memblock_setclr_flag - set or clear flag for a memory region
* @type: memblock type to set/clear flag for
@@ -1048,6 +1107,36 @@ int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t
MEMBLOCK_RSRV_NOINIT);
}
+/**
+ * memblock_mark_kho_scratch - Mark a memory region as MEMBLOCK_KHO_SCRATCH.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Only memory regions marked with %MEMBLOCK_KHO_SCRATCH will be considered
+ * for allocations during early boot with kexec handover.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+__init int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(&memblock.memory, base, size, 1,
+ MEMBLOCK_KHO_SCRATCH);
+}
+
+/**
+ * memblock_clear_kho_scratch - Clear MEMBLOCK_KHO_SCRATCH flag for a
+ * specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+__init int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(&memblock.memory, base, size, 0,
+ MEMBLOCK_KHO_SCRATCH);
+}
+
static bool should_skip_region(struct memblock_type *type,
struct memblock_region *m,
int nid, int flags)
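/*
 * A minimal sketch of how the scratch-only mode above is meant to be driven
 * on a kexec-handover boot (hypothetical early-boot code, not part of this
 * patch). Scratch ranges handed over by the previous kernel are marked, and
 * until the preserved memory map has been restored, early memblock
 * allocations are confined to them.
 */
static void __init example_kho_early_init(phys_addr_t scratch_base,
					  phys_addr_t scratch_size)
{
	memblock_add(scratch_base, scratch_size);
	memblock_mark_kho_scratch(scratch_base, scratch_size);

	/* early allocations may now only come from scratch memory */
	memblock_set_kho_scratch_only();

	/* ... reserve the ranges preserved by the previous kernel ... */

	/* the restriction is lifted again in memblock_free_all() */
}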
@@ -1079,6 +1168,13 @@ static bool should_skip_region(struct memblock_type *type,
if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m))
return true;
+ /*
+ * In early alloc during kexec handover, we can only consider
+ * MEMBLOCK_KHO_SCRATCH regions for the allocations
+ */
+ if ((flags & MEMBLOCK_KHO_SCRATCH) && !memblock_is_kho_scratch(m))
+ return true;
+
return false;
}
@@ -1459,14 +1555,14 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
again:
found = memblock_find_in_range_node(size, align, start, end, nid,
flags);
- if (found && !memblock_reserve(found, size))
+ if (found && !__memblock_reserve(found, size, nid, MEMBLOCK_RSRV_KERN))
goto done;
if (numa_valid_node(nid) && !exact_nid) {
found = memblock_find_in_range_node(size, align, start,
end, NUMA_NO_NODE,
flags);
- if (found && !memblock_reserve(found, size))
+ if (found && !memblock_reserve_kern(found, size))
goto done;
}
@@ -1692,6 +1788,26 @@ void * __init memblock_alloc_try_nid(
}
/**
+ * __memblock_alloc_or_panic - Try to allocate memory and panic on failure
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @func: caller func name
+ *
+ * This function attempts to allocate memory using memblock_alloc,
+ * and in case of failure, it calls panic with the formatted message.
+ * This function should not be used directly; use the macro memblock_alloc_or_panic() instead.
+ */
+void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align,
+ const char *func)
+{
+ void *addr = memblock_alloc(size, align);
+
+ if (unlikely(!addr))
+ panic("%s: Failed to allocate %pap bytes\n", func, &size);
+ return addr;
+}
+
+/**
* memblock_free_late - free pages directly to buddy allocator
* @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
@@ -1731,6 +1847,28 @@ phys_addr_t __init_memblock memblock_reserved_size(void)
return memblock.reserved.total_size;
}
+phys_addr_t __init_memblock memblock_reserved_kern_size(phys_addr_t limit, int nid)
+{
+ struct memblock_region *r;
+ phys_addr_t total = 0;
+
+ for_each_reserved_mem_region(r) {
+ phys_addr_t size = r->size;
+
+ if (r->base > limit)
+ break;
+
+ if (r->base + r->size > limit)
+ size = limit - r->base;
+
+ if (nid == memblock_get_region_node(r) || !numa_valid_node(nid))
+ if (r->flags & MEMBLOCK_RSRV_KERN)
+ total += size;
+ }
+
+ return total;
+}
+
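/*
 * A one-call sketch for the new accounting helper above (hypothetical, not
 * part of this patch): report how much memory the kernel itself has reserved
 * below the 4G boundary, across all nodes.
 */
static void __init example_report_kernel_reservations(void)
{
	phys_addr_t kern = memblock_reserved_kern_size(SZ_4G, NUMA_NO_NODE);

	pr_info("kernel memblock reservations below 4G: %pa\n", &kern);
}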
/**
* memblock_estimated_nr_free_pages - return estimated number of free pages
* from memblock point of view
@@ -2144,8 +2282,10 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
phys_addr_t end)
{
unsigned long start_pfn = PFN_UP(start);
- unsigned long end_pfn = min_t(unsigned long,
- PFN_DOWN(end), max_low_pfn);
+ unsigned long end_pfn = PFN_DOWN(end);
+
+ if (!IS_ENABLED(CONFIG_HIGHMEM) && end_pfn > max_low_pfn)
+ end_pfn = max_low_pfn;
if (start_pfn >= end_pfn)
return 0;
@@ -2160,11 +2300,14 @@ static void __init memmap_init_reserved_pages(void)
struct memblock_region *region;
phys_addr_t start, end;
int nid;
+ unsigned long max_reserved;
/*
* set nid on all reserved pages and also treat struct
* pages for the NOMAP regions as PageReserved
*/
+repeat:
+ max_reserved = memblock.reserved.max;
for_each_mem_region(region) {
nid = memblock_get_region_node(region);
start = region->base;
@@ -2173,8 +2316,15 @@ static void __init memmap_init_reserved_pages(void)
if (memblock_is_nomap(region))
reserve_bootmem_region(start, end, nid);
- memblock_set_node(start, end, &memblock.reserved, nid);
+ memblock_set_node(start, region->size, &memblock.reserved, nid);
}
+ /*
+	 * If 'max' has changed, memblock.reserved has doubled its array,
+	 * which may have added a new reserved region before the current
+	 * 'start'. Repeat the procedure to set its node id.
+ */
+ if (max_reserved != memblock.reserved.max)
+ goto repeat;
/*
* initialize struct pages for reserved regions that don't have
@@ -2249,6 +2399,7 @@ void __init memblock_free_all(void)
free_unused_memmap();
reset_all_zones_managed_pages();
+ memblock_clear_kho_scratch_only();
pages = free_low_memory_core_early();
totalram_pages_add(pages);
}
@@ -2263,6 +2414,7 @@ struct reserve_mem_table {
};
static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES];
static int reserved_mem_count;
+static DEFINE_MUTEX(reserve_mem_lock);
/* Add wildcard region with a lookup name */
static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size,
@@ -2276,6 +2428,21 @@ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size,
strscpy(map->name, name);
}
+static struct reserve_mem_table *reserve_mem_find_by_name_nolock(const char *name)
+{
+ struct reserve_mem_table *map;
+ int i;
+
+ for (i = 0; i < reserved_mem_count; i++) {
+ map = &reserved_mem_table[i];
+ if (!map->size)
+ continue;
+ if (strcmp(name, map->name) == 0)
+ return map;
+ }
+ return NULL;
+}
+
/**
* reserve_mem_find_by_name - Find reserved memory region with a given name
* @name: The name that is attached to a reserved memory region
@@ -2289,21 +2456,229 @@ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size,
int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size)
{
struct reserve_mem_table *map;
- int i;
+
+ guard(mutex)(&reserve_mem_lock);
+ map = reserve_mem_find_by_name_nolock(name);
+ if (!map)
+ return 0;
+
+ *start = map->start;
+ *size = map->size;
+ return 1;
+}
+EXPORT_SYMBOL_GPL(reserve_mem_find_by_name);
+
+/**
+ * reserve_mem_release_by_name - Release reserved memory region with a given name
+ * @name: The name that is attached to a reserved memory region
+ *
+ * Forcibly release the pages in the reserved memory region so that the memory
+ * can be used as free memory. After the release, the reserved region size becomes 0.
+ *
+ * Returns: 1 if released or 0 if not found.
+ */
+int reserve_mem_release_by_name(const char *name)
+{
+ char buf[RESERVE_MEM_NAME_SIZE + 12];
+ struct reserve_mem_table *map;
+ void *start, *end;
+
+ guard(mutex)(&reserve_mem_lock);
+ map = reserve_mem_find_by_name_nolock(name);
+ if (!map)
+ return 0;
+
+ start = phys_to_virt(map->start);
+ end = start + map->size - 1;
+ snprintf(buf, sizeof(buf), "reserve_mem:%s", name);
+ free_reserved_area(start, end, 0, buf);
+ map->size = 0;
+
+ return 1;
+}
+
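/*
 * A short sketch pairing the two lookups above (a hypothetical driver, not
 * part of this patch). A region created with reserve_mem=...:myregion on the
 * kernel command line is located by name, consumed, and finally handed back
 * to the page allocator once it is no longer needed.
 */
static int example_use_named_region(void)
{
	phys_addr_t start, size;

	if (!reserve_mem_find_by_name("myregion", &start, &size))
		return -ENODEV;

	/* ... memremap() and use the [start, start + size) range ... */

	return reserve_mem_release_by_name("myregion") ? 0 : -ENODEV;
}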
+#ifdef CONFIG_KEXEC_HANDOVER
+#define MEMBLOCK_KHO_FDT "memblock"
+#define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1"
+#define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1"
+static struct page *kho_fdt;
+
+static int reserve_mem_kho_finalize(struct kho_serialization *ser)
+{
+ int err = 0, i;
for (i = 0; i < reserved_mem_count; i++) {
- map = &reserved_mem_table[i];
- if (!map->size)
- continue;
- if (strcmp(name, map->name) == 0) {
- *start = map->start;
- *size = map->size;
- return 1;
- }
+ struct reserve_mem_table *map = &reserved_mem_table[i];
+
+ err |= kho_preserve_phys(map->start, map->size);
}
- return 0;
+
+ err |= kho_preserve_folio(page_folio(kho_fdt));
+ err |= kho_add_subtree(ser, MEMBLOCK_KHO_FDT, page_to_virt(kho_fdt));
+
+ return notifier_from_errno(err);
}
-EXPORT_SYMBOL_GPL(reserve_mem_find_by_name);
+
+static int reserve_mem_kho_notifier(struct notifier_block *self,
+ unsigned long cmd, void *v)
+{
+ switch (cmd) {
+ case KEXEC_KHO_FINALIZE:
+ return reserve_mem_kho_finalize((struct kho_serialization *)v);
+ case KEXEC_KHO_ABORT:
+ return NOTIFY_DONE;
+ default:
+ return NOTIFY_BAD;
+ }
+}
+
+static struct notifier_block reserve_mem_kho_nb = {
+ .notifier_call = reserve_mem_kho_notifier,
+};
+
+static int __init prepare_kho_fdt(void)
+{
+ int err = 0, i;
+ void *fdt;
+
+ kho_fdt = alloc_page(GFP_KERNEL);
+ if (!kho_fdt)
+ return -ENOMEM;
+
+ fdt = page_to_virt(kho_fdt);
+
+ err |= fdt_create(fdt, PAGE_SIZE);
+ err |= fdt_finish_reservemap(fdt);
+
+ err |= fdt_begin_node(fdt, "");
+ err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE);
+ for (i = 0; i < reserved_mem_count; i++) {
+ struct reserve_mem_table *map = &reserved_mem_table[i];
+
+ err |= fdt_begin_node(fdt, map->name);
+ err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE);
+ err |= fdt_property(fdt, "start", &map->start, sizeof(map->start));
+ err |= fdt_property(fdt, "size", &map->size, sizeof(map->size));
+ err |= fdt_end_node(fdt);
+ }
+ err |= fdt_end_node(fdt);
+
+ err |= fdt_finish(fdt);
+
+ if (err) {
+ pr_err("failed to prepare memblock FDT for KHO: %d\n", err);
+ put_page(kho_fdt);
+ kho_fdt = NULL;
+ }
+
+ return err;
+}
+
+static int __init reserve_mem_init(void)
+{
+ int err;
+
+ if (!kho_is_enabled() || !reserved_mem_count)
+ return 0;
+
+ err = prepare_kho_fdt();
+ if (err)
+ return err;
+
+ err = register_kho_notifier(&reserve_mem_kho_nb);
+ if (err) {
+ put_page(kho_fdt);
+ kho_fdt = NULL;
+ }
+
+ return err;
+}
+late_initcall(reserve_mem_init);
+
+static void *__init reserve_mem_kho_retrieve_fdt(void)
+{
+ phys_addr_t fdt_phys;
+ static void *fdt;
+ int err;
+
+ if (fdt)
+ return fdt;
+
+ err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys);
+ if (err) {
+ if (err != -ENOENT)
+ pr_warn("failed to retrieve FDT '%s' from KHO: %d\n",
+ MEMBLOCK_KHO_FDT, err);
+ return NULL;
+ }
+
+ fdt = phys_to_virt(fdt_phys);
+
+ err = fdt_node_check_compatible(fdt, 0, MEMBLOCK_KHO_NODE_COMPATIBLE);
+ if (err) {
+ pr_warn("FDT '%s' is incompatible with '%s': %d\n",
+ MEMBLOCK_KHO_FDT, MEMBLOCK_KHO_NODE_COMPATIBLE, err);
+ fdt = NULL;
+ }
+
+ return fdt;
+}
+
+static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size,
+ phys_addr_t align)
+{
+ int err, len_start, len_size, offset;
+ const phys_addr_t *p_start, *p_size;
+ const void *fdt;
+
+ fdt = reserve_mem_kho_retrieve_fdt();
+ if (!fdt)
+ return false;
+
+ offset = fdt_subnode_offset(fdt, 0, name);
+ if (offset < 0) {
+ pr_warn("FDT '%s' has no child '%s': %d\n",
+ MEMBLOCK_KHO_FDT, name, offset);
+ return false;
+ }
+ err = fdt_node_check_compatible(fdt, offset, RESERVE_MEM_KHO_NODE_COMPATIBLE);
+ if (err) {
+ pr_warn("Node '%s' is incompatible with '%s': %d\n",
+ name, RESERVE_MEM_KHO_NODE_COMPATIBLE, err);
+ return false;
+ }
+
+ p_start = fdt_getprop(fdt, offset, "start", &len_start);
+ p_size = fdt_getprop(fdt, offset, "size", &len_size);
+ if (!p_start || len_start != sizeof(*p_start) || !p_size ||
+ len_size != sizeof(*p_size)) {
+ return false;
+ }
+
+ if (*p_start & (align - 1)) {
+ pr_warn("KHO reserve-mem '%s' has wrong alignment (0x%lx, 0x%lx)\n",
+ name, (long)align, (long)*p_start);
+ return false;
+ }
+
+ if (*p_size != size) {
+ pr_warn("KHO reserve-mem '%s' has wrong size (0x%lx != 0x%lx)\n",
+ name, (long)*p_size, (long)size);
+ return false;
+ }
+
+ reserved_mem_add(*p_start, size, name);
+ pr_info("Revived memory reservation '%s' from KHO\n", name);
+
+ return true;
+}
+#else
+static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size,
+ phys_addr_t align)
+{
+ return false;
+}
+#endif /* CONFIG_KEXEC_HANDOVER */
/*
* Parse reserve_mem=nn:align:name
@@ -2360,6 +2735,11 @@ static int __init reserve_mem(char *p)
if (reserve_mem_find_by_name(name, &start, &tmp))
return -EBUSY;
+ /* Pick previous allocations up from KHO if available */
+ if (reserve_mem_kho_revive(name, size, align))
+ return 1;
+
+ /* TODO: Allocation must be outside of scratch region */
start = memblock_phys_alloc(size, align);
if (!start)
return -ENOMEM;
@@ -2377,6 +2757,8 @@ static const char * const flagname[] = {
[ilog2(MEMBLOCK_NOMAP)] = "NOMAP",
[ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG",
[ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT",
+ [ilog2(MEMBLOCK_RSRV_KERN)] = "RSV_KERN",
+ [ilog2(MEMBLOCK_KHO_SCRATCH)] = "KHO_SCRATCH",
};
static int memblock_debug_show(struct seq_file *m, void *private)
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index a071fa43d479..4b94731305b9 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -490,6 +490,19 @@ static void mem_cgroup_threshold(struct mem_cgroup *memcg)
}
/* Cgroup1: threshold notifications & softlimit tree updates */
+
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremented by the number of pages. This counter is used
+ * to trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ */
+enum mem_cgroup_events_target {
+ MEM_CGROUP_TARGET_THRESH,
+ MEM_CGROUP_TARGET_SOFTLIMIT,
+ MEM_CGROUP_NTARGETS,
+};
+
struct memcg1_events_percpu {
unsigned long nr_page_events;
unsigned long targets[MEM_CGROUP_NTARGETS];
@@ -499,9 +512,9 @@ static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
{
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
- __count_memcg_events(memcg, PGPGIN, 1);
+ count_memcg_events(memcg, PGPGIN, 1);
else {
- __count_memcg_events(memcg, PGPGOUT, 1);
+ count_memcg_events(memcg, PGPGOUT, 1);
nr_pages = -nr_pages; /* for event */
}
@@ -568,8 +581,59 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
local_irq_restore(flags);
}
-void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg)
+/**
+ * memcg1_swapout - transfer a memsw charge to swap
+ * @folio: folio whose memsw charge to transfer
+ * @entry: swap entry to move the charge to
+ *
+ * Transfer the memsw charge of @folio to @entry.
+ */
+void memcg1_swapout(struct folio *folio, swp_entry_t entry)
{
+ struct mem_cgroup *memcg, *swap_memcg;
+ unsigned int nr_entries;
+
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
+
+ if (mem_cgroup_disabled())
+ return;
+
+ if (!do_memsw_account())
+ return;
+
+ memcg = folio_memcg(folio);
+
+ VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
+ if (!memcg)
+ return;
+
+ /*
+ * In case the memcg owning these pages has been offlined and doesn't
+ * have an ID allocated to it anymore, charge the closest online
+ * ancestor for the swap instead and transfer the memory+swap charge.
+ */
+ swap_memcg = mem_cgroup_id_get_online(memcg);
+ nr_entries = folio_nr_pages(folio);
+ /* Get references for the tail pages, too */
+ if (nr_entries > 1)
+ mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
+ mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
+
+ swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry);
+
+ folio_unqueue_deferred_split(folio);
+ folio->memcg_data = 0;
+
+ if (!mem_cgroup_is_root(memcg))
+ page_counter_uncharge(&memcg->memory, nr_entries);
+
+ if (memcg != swap_memcg) {
+ if (!mem_cgroup_is_root(swap_memcg))
+ page_counter_charge(&swap_memcg->memsw, nr_entries);
+ page_counter_uncharge(&memcg->memsw, nr_entries);
+ }
+
/*
* Interrupts should be disabled here because the caller holds the
* i_pages lock which is taken with interrupts-off. It is
@@ -581,6 +645,42 @@ void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg)
memcg1_charge_statistics(memcg, -folio_nr_pages(folio));
preempt_enable_nested();
memcg1_check_events(memcg, folio_nid(folio));
+
+ css_put(&memcg->css);
+}
+
+/*
+ * memcg1_swapin - uncharge swap slot
+ * @entry: the first swap entry for which the pages are charged
+ * @nr_pages: number of pages which will be uncharged
+ *
+ * Call this function after successfully adding the charged page to swapcache.
+ *
+ * Note: This function assumes that the page whose swap slot is being uncharged
+ * is an order-0 page.
+ */
+void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages)
+{
+ /*
+ * Cgroup1's unified memory+swap counter has been charged with the
+ * new swapcache page, finish the transfer by uncharging the swap
+ * slot. The swap slot would also get uncharged when it dies, but
+ * it can stick around indefinitely and we'd count the page twice
+ * the entire time.
+ *
+ * Cgroup2 has separate resource counters for memory and swap,
+ * so this is a non-issue here. Memory and swap charge lifetimes
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
+ * page to memory here, and uncharge swap when the slot is freed.
+ */
+ if (do_memsw_account()) {
+ /*
+ * The swap entry might not get freed for a long time,
+ * let's not wait for it. The page already received a
+ * memory+swap charge, drop the swap entry duplicate.
+ */
+ mem_cgroup_uncharge_swap(entry, nr_pages);
+ }
}
void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
@@ -589,7 +689,7 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long flags;
local_irq_save(flags);
- __count_memcg_events(memcg, PGPGOUT, pgpgout);
+ count_memcg_events(memcg, PGPGOUT, pgpgout);
__this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
memcg1_check_events(memcg, nid);
local_irq_restore(flags);
@@ -899,7 +999,7 @@ static void memcg_event_remove(struct work_struct *work)
*
* Called with wqh->lock held and interrupts disabled.
*/
-static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
+static int memcg_event_wake(wait_queue_entry_t *wait, unsigned int mode,
int sync, void *key)
{
struct mem_cgroup_event *event =
@@ -1040,13 +1140,13 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
} else if (!strcmp(name, "memory.oom_control")) {
pr_warn_once("oom_control is deprecated and will be removed. "
"Please report your usecase to linux-mm-@kvack.org"
- " if you depend on this functionality. \n");
+ " if you depend on this functionality.\n");
event->register_event = mem_cgroup_oom_register_event;
event->unregister_event = mem_cgroup_oom_unregister_event;
} else if (!strcmp(name, "memory.pressure_level")) {
pr_warn_once("pressure_level is deprecated and will be removed. "
"Please report your usecase to linux-mm-@kvack.org "
- "if you depend on this functionality. \n");
+ "if you depend on this functionality.\n");
event->register_event = vmpressure_register_event;
event->unregister_event = vmpressure_unregister_event;
} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
@@ -1134,8 +1234,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
failed = iter;
mem_cgroup_iter_break(memcg, iter);
break;
- } else
- iter->oom_lock = true;
+ }
+ iter->oom_lock = true;
}
if (failed) {
@@ -1202,7 +1302,7 @@ struct oom_wait_info {
};
static int memcg_oom_wake_function(wait_queue_entry_t *wait,
- unsigned mode, int sync, void *arg)
+ unsigned int mode, int sync, void *arg)
{
struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
struct mem_cgroup *oom_wait_memcg;
@@ -1644,7 +1744,7 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
unsigned long nr = 0;
enum lru_list lru;
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
+ VM_BUG_ON((unsigned int)nid >= nr_node_ids);
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
@@ -1855,9 +1955,11 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
if (val > MAX_SWAPPINESS)
return -EINVAL;
- if (!mem_cgroup_is_root(memcg))
+ if (!mem_cgroup_is_root(memcg)) {
+ pr_info_once("Per memcg swappiness does not exist in cgroup v2. "
+			"See memory.reclaim or memory.swap.max there\n");
WRITE_ONCE(memcg->swappiness, val);
- else
+ } else
WRITE_ONCE(vm_swappiness, val);
return 0;
@@ -1881,7 +1983,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
pr_warn_once("oom_control is deprecated and will be removed. "
"Please report your usecase to linux-mm-@kvack.org if you "
- "depend on this functionality. \n");
+ "depend on this functionality.\n");
/* cannot set to root cgroup and only 0 and 1 are allowed */
if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
@@ -2096,8 +2198,7 @@ bool memcg1_alloc_events(struct mem_cgroup *memcg)
void memcg1_free_events(struct mem_cgroup *memcg)
{
- if (memcg->events_percpu)
- free_percpu(memcg->events_percpu);
+ free_percpu(memcg->events_percpu);
}
static int __init memcg1_init(void)
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index 0e3b82951d91..6358464bb416 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -7,21 +7,6 @@
/* Cgroup v1 and v2 common declarations */
-int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages);
-
-static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages)
-{
- if (mem_cgroup_is_root(memcg))
- return 0;
-
- return try_charge_memcg(memcg, gfp_mask, nr_pages);
-}
-
-void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n);
-void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n);
-
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
@@ -37,38 +22,29 @@ void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n);
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-/* Whether legacy memory+swap accounting is active */
-static bool do_memsw_account(void)
-{
- return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
-}
-
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremented by the number of pages. This counter is used
- * to trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
- MEM_CGROUP_TARGET_THRESH,
- MEM_CGROUP_TARGET_SOFTLIMIT,
- MEM_CGROUP_NTARGETS,
-};
-
unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
void drain_all_stock(struct mem_cgroup *root_memcg);
unsigned long memcg_events(struct mem_cgroup *memcg, int event);
-unsigned long memcg_events_local(struct mem_cgroup *memcg, int event);
-unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx);
unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item);
-unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item);
int memory_stat_show(struct seq_file *m, void *v);
+void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n);
+struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg);
+
/* Cgroup v1-specific declarations */
#ifdef CONFIG_MEMCG_V1
+/* Whether legacy memory+swap accounting is active */
+static inline bool do_memsw_account(void)
+{
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
+}
+
+unsigned long memcg_events_local(struct mem_cgroup *memcg, int event);
+unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx);
+unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item);
bool memcg1_alloc_events(struct mem_cgroup *memcg);
void memcg1_free_events(struct mem_cgroup *memcg);
@@ -96,7 +72,6 @@ void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked);
void memcg1_oom_recover(struct mem_cgroup *memcg);
void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg);
-void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg);
void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long nr_memory, int nid);
@@ -119,6 +94,7 @@ extern struct cftype mem_cgroup_legacy_files[];
#else /* CONFIG_MEMCG_V1 */
+static inline bool do_memsw_account(void) { return false; }
static inline bool memcg1_alloc_events(struct mem_cgroup *memcg) { return true; }
static inline void memcg1_free_events(struct mem_cgroup *memcg) {}
@@ -134,8 +110,6 @@ static inline void memcg1_oom_recover(struct mem_cgroup *memcg) {}
static inline void memcg1_commit_charge(struct folio *folio,
struct mem_cgroup *memcg) {}
-static inline void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg) {}
-
static inline void memcg1_uncharge_batch(struct mem_cgroup *memcg,
unsigned long pgpgout,
unsigned long nr_memory, int nid) {}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7b3503d12aaf..902da8a9c643 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
+#include <linux/cpuset.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
@@ -95,6 +96,9 @@ static bool cgroup_memory_nokmem __ro_after_init;
/* BPF memory accounting disabled? */
static bool cgroup_memory_nobpf __ro_after_init;
+static struct kmem_cache *memcg_cachep;
+static struct kmem_cache *memcg_pn_cachep;
+
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
@@ -129,8 +133,7 @@ bool mem_cgroup_kmem_disabled(void)
return cgroup_memory_nokmem;
}
-static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
- unsigned int nr_pages);
+static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
static void obj_cgroup_release(struct percpu_ref *ref)
{
@@ -163,8 +166,16 @@ static void obj_cgroup_release(struct percpu_ref *ref)
WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
nr_pages = nr_bytes >> PAGE_SHIFT;
- if (nr_pages)
- obj_cgroup_uncharge_pages(objcg, nr_pages);
+ if (nr_pages) {
+ struct mem_cgroup *memcg;
+
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
+ memcg1_account_kmem(memcg, -nr_pages);
+ if (!mem_cgroup_is_root(memcg))
+ memcg_uncharge(memcg, nr_pages);
+ mem_cgroup_put(memcg);
+ }
spin_lock_irqsave(&objcg_lock, flags);
list_del(&objcg->list);
@@ -315,6 +326,7 @@ static const unsigned int memcg_node_stat_items[] = {
PGDEMOTE_KSWAPD,
PGDEMOTE_DIRECT,
PGDEMOTE_KHUGEPAGED,
+ PGDEMOTE_PROACTIVE,
#ifdef CONFIG_HUGETLB_PAGE
NR_HUGETLB,
#endif
@@ -431,9 +443,11 @@ static const unsigned int memcg_vm_event_stat[] = {
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_KHUGEPAGED,
+ PGSCAN_PROACTIVE,
PGSTEAL_KSWAPD,
PGSTEAL_DIRECT,
PGSTEAL_KHUGEPAGED,
+ PGSTEAL_PROACTIVE,
PGFAULT,
PGMAJFAULT,
PGREFILL,
@@ -460,6 +474,8 @@ static const unsigned int memcg_vm_event_stat[] = {
NUMA_PAGE_MIGRATE,
NUMA_PTE_UPDATES,
NUMA_HINT_FAULTS,
+ NUMA_TASK_MIGRATE,
+ NUMA_TASK_SWAP,
#endif
};
@@ -489,8 +505,8 @@ struct memcg_vmstats_percpu {
unsigned int stats_updates;
/* Cached pointers for fast iteration in memcg_rstat_updated() */
- struct memcg_vmstats_percpu *parent;
- struct memcg_vmstats *vmstats;
+ struct memcg_vmstats_percpu __percpu *parent_pcpu;
+ struct memcg_vmstats *vmstats;
/* The above should fit a single cacheline for memcg_rstat_updated() */
@@ -517,7 +533,7 @@ struct memcg_vmstats {
unsigned long events_pending[NR_MEMCG_EVENTS];
/* Stats updates since the last flush */
- atomic64_t stats_updates;
+ atomic_t stats_updates;
};
/*
@@ -541,60 +557,43 @@ static u64 flush_last_time;
#define FLUSH_TIME (2UL*HZ)
-/*
- * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
- * not rely on this as part of an acquired spinlock_t lock. These functions are
- * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion
- * is sufficient.
- */
-static void memcg_stats_lock(void)
-{
- preempt_disable_nested();
- VM_WARN_ON_IRQS_ENABLED();
-}
-
-static void __memcg_stats_lock(void)
-{
- preempt_disable_nested();
-}
-
-static void memcg_stats_unlock(void)
-{
- preempt_enable_nested();
-}
-
-
static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
{
- return atomic64_read(&vmstats->stats_updates) >
+ return atomic_read(&vmstats->stats_updates) >
MEMCG_CHARGE_BATCH * num_online_cpus();
}
-static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
+static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
+ int cpu)
{
+ struct memcg_vmstats_percpu __percpu *statc_pcpu;
struct memcg_vmstats_percpu *statc;
- int cpu = smp_processor_id();
unsigned int stats_updates;
if (!val)
return;
- cgroup_rstat_updated(memcg->css.cgroup, cpu);
- statc = this_cpu_ptr(memcg->vmstats_percpu);
- for (; statc; statc = statc->parent) {
- stats_updates = READ_ONCE(statc->stats_updates) + abs(val);
- WRITE_ONCE(statc->stats_updates, stats_updates);
+ /* TODO: add to cgroup update tree once it is nmi-safe. */
+ if (!in_nmi())
+ css_rstat_updated(&memcg->css, cpu);
+ statc_pcpu = memcg->vmstats_percpu;
+ for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) {
+ statc = this_cpu_ptr(statc_pcpu);
+ /*
+ * If @memcg is already flushable then all its ancestors are
+ * flushable as well, and there is no need to increase
+ * stats_updates.
+ */
+ if (memcg_vmstats_needs_flush(statc->vmstats))
+ break;
+
+ stats_updates = this_cpu_add_return(statc_pcpu->stats_updates,
+ abs(val));
if (stats_updates < MEMCG_CHARGE_BATCH)
continue;
- /*
- * If @memcg is already flush-able, increasing stats_updates is
- * redundant. Avoid the overhead of the atomic update.
- */
- if (!memcg_vmstats_needs_flush(statc->vmstats))
- atomic64_add(stats_updates,
- &statc->vmstats->stats_updates);
- WRITE_ONCE(statc->stats_updates, 0);
+ stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0);
+ atomic_add(stats_updates, &statc->vmstats->stats_updates);
}
}
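The rewritten memcg_rstat_updated() accumulates per-CPU deltas and only folds them into the shared atomic once a batch threshold is crossed, bailing out early when the stats are already flush-worthy. A minimal userspace sketch of that batching idea, assuming a single modeled CPU, no cgroup hierarchy, and an illustrative batch size (not kernel code):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define BATCH 64                        /* stand-in for MEMCG_CHARGE_BATCH */

struct stats {
        atomic_int pending;             /* shared stats_updates counter */
        int local;                      /* per-CPU accumulator (one CPU modeled) */
};

static int needs_flush(struct stats *s)
{
        return atomic_load(&s->pending) > BATCH;
}

/* Record a delta; publish to the shared counter only once a batch fills up. */
static void record(struct stats *s, int val)
{
        if (needs_flush(s))
                return;                 /* already flush-worthy, skip the work */

        s->local += abs(val);
        if (s->local < BATCH)
                return;

        atomic_fetch_add(&s->pending, s->local);
        s->local = 0;
}

int main(void)
{
        struct stats s;

        s.local = 0;
        atomic_init(&s.pending, 0);
        for (int i = 0; i < 1000; i++)
                record(&s, 1);
        printf("pending=%d local=%d needs_flush=%d\n",
               atomic_load(&s.pending), s.local, needs_flush(&s));
        return 0;
}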
@@ -602,7 +601,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
{
bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
- trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates),
+ trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates),
force, needs_flush);
if (!force && !needs_flush)
@@ -611,7 +610,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
if (mem_cgroup_is_root(memcg))
WRITE_ONCE(flush_last_time, jiffies_64);
- cgroup_rstat_flush(memcg->css.cgroup);
+ css_rstat_flush(&memcg->css);
}
/*
@@ -684,15 +683,16 @@ static int memcg_state_val_in_pages(int idx, int val)
}
/**
- * __mod_memcg_state - update cgroup memory statistics
+ * mod_memcg_state - update cgroup memory statistics
* @memcg: the memory cgroup
* @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
* @val: delta to add to the counter, can be negative
*/
-void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
+void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
int val)
{
int i = memcg_stats_index(idx);
+ int cpu;
if (mem_cgroup_disabled())
return;
@@ -700,12 +700,17 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
- __this_cpu_add(memcg->vmstats_percpu->state[i], val);
+ cpu = get_cpu();
+
+ this_cpu_add(memcg->vmstats_percpu->state[i], val);
val = memcg_state_val_in_pages(idx, val);
- memcg_rstat_updated(memcg, val);
+ memcg_rstat_updated(memcg, val, cpu);
trace_mod_memcg_state(memcg, idx, val);
+
+ put_cpu();
}
+#ifdef CONFIG_MEMCG_V1
/* idx can be of type enum memcg_stat_item or node_stat_item. */
unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
@@ -722,14 +727,16 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
#endif
return x;
}
+#endif
-static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
+static void mod_memcg_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx,
int val)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
int i = memcg_stats_index(idx);
+ int cpu;
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
@@ -737,35 +744,19 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
- /*
- * The caller from rmap relies on disabled preemption because they never
- * update their counter from in-interrupt context. For these two
- * counters we check that the update is never performed from an
- * interrupt context while other caller need to have disabled interrupt.
- */
- __memcg_stats_lock();
- if (IS_ENABLED(CONFIG_DEBUG_VM)) {
- switch (idx) {
- case NR_ANON_MAPPED:
- case NR_FILE_MAPPED:
- case NR_ANON_THPS:
- WARN_ON_ONCE(!in_task());
- break;
- default:
- VM_WARN_ON_IRQS_ENABLED();
- }
- }
+ cpu = get_cpu();
/* Update memcg */
- __this_cpu_add(memcg->vmstats_percpu->state[i], val);
+ this_cpu_add(memcg->vmstats_percpu->state[i], val);
/* Update lruvec */
- __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
+ this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
val = memcg_state_val_in_pages(idx, val);
- memcg_rstat_updated(memcg, val);
+ memcg_rstat_updated(memcg, val, cpu);
trace_mod_memcg_lruvec_state(memcg, idx, val);
- memcg_stats_unlock();
+
+ put_cpu();
}
/**
@@ -786,7 +777,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update memcg and lruvec */
if (!mem_cgroup_disabled())
- __mod_memcg_lruvec_state(lruvec, idx, val);
+ mod_memcg_lruvec_state(lruvec, idx, val);
}
void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
@@ -836,15 +827,16 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
}
/**
- * __count_memcg_events - account VM events in a cgroup
+ * count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
* @idx: the event item
* @count: the number of events that occurred
*/
-void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
unsigned long count)
{
int i = memcg_events_index(idx);
+ int cpu;
if (mem_cgroup_disabled())
return;
@@ -852,11 +844,13 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
- memcg_stats_lock();
- __this_cpu_add(memcg->vmstats_percpu->events[i], count);
- memcg_rstat_updated(memcg, count);
+ cpu = get_cpu();
+
+ this_cpu_add(memcg->vmstats_percpu->events[i], count);
+ memcg_rstat_updated(memcg, count, cpu);
trace_count_memcg_events(memcg, idx, count);
- memcg_stats_unlock();
+
+ put_cpu();
}
unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -869,6 +863,7 @@ unsigned long memcg_events(struct mem_cgroup *memcg, int event)
return READ_ONCE(memcg->vmstats->events[i]);
}
+#ifdef CONFIG_MEMCG_V1
unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
int i = memcg_events_index(event);
@@ -878,6 +873,7 @@ unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
return READ_ONCE(memcg->vmstats->events_local[i]);
}
+#endif
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
@@ -1169,8 +1165,11 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
struct task_struct *task;
css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
- while (!ret && (task = css_task_iter_next(&it)))
+ while (!ret && (task = css_task_iter_next(&it))) {
ret = fn(task, arg);
+ /* Avoid potential softlockup warning */
+ cond_resched();
+ }
css_task_iter_end(&it);
if (ret) {
mem_cgroup_iter_break(memcg, iter);
@@ -1385,6 +1384,7 @@ static const struct memory_stat memory_stats[] = {
{ "pgdemote_kswapd", PGDEMOTE_KSWAPD },
{ "pgdemote_direct", PGDEMOTE_DIRECT },
{ "pgdemote_khugepaged", PGDEMOTE_KHUGEPAGED },
+ { "pgdemote_proactive", PGDEMOTE_PROACTIVE },
#ifdef CONFIG_NUMA_BALANCING
{ "pgpromote_success", PGPROMOTE_SUCCESS },
#endif
@@ -1427,6 +1427,7 @@ static int memcg_page_state_output_unit(int item)
case PGDEMOTE_KSWAPD:
case PGDEMOTE_DIRECT:
case PGDEMOTE_KHUGEPAGED:
+ case PGDEMOTE_PROACTIVE:
#ifdef CONFIG_NUMA_BALANCING
case PGPROMOTE_SUCCESS:
#endif
@@ -1442,11 +1443,25 @@ unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item)
memcg_page_state_output_unit(item);
}
+#ifdef CONFIG_MEMCG_V1
unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item)
{
return memcg_page_state_local(memcg, item) *
memcg_page_state_output_unit(item);
}
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+static bool memcg_accounts_hugetlb(void)
+{
+ return cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+}
+#else /* CONFIG_HUGETLB_PAGE */
+static bool memcg_accounts_hugetlb(void)
+{
+ return false;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
@@ -1469,7 +1484,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
#ifdef CONFIG_HUGETLB_PAGE
if (unlikely(memory_stats[i].idx == NR_HUGETLB) &&
- !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
+ !memcg_accounts_hugetlb())
continue;
#endif
size = memcg_page_state_output(memcg, memory_stats[i].idx);
@@ -1486,10 +1501,12 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
seq_buf_printf(s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
memcg_events(memcg, PGSCAN_DIRECT) +
+ memcg_events(memcg, PGSCAN_PROACTIVE) +
memcg_events(memcg, PGSCAN_KHUGEPAGED));
seq_buf_printf(s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
memcg_events(memcg, PGSTEAL_DIRECT) +
+ memcg_events(memcg, PGSTEAL_PROACTIVE) +
memcg_events(memcg, PGSTEAL_KHUGEPAGED));
for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
@@ -1549,16 +1566,23 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
/* Use static buffer, for the caller is holding oom_lock. */
static char buf[SEQ_BUF_SIZE];
struct seq_buf s;
+ unsigned long memory_failcnt;
lockdep_assert_held(&oom_lock);
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ memory_failcnt = atomic_long_read(&memcg->memory_events[MEMCG_MAX]);
+ else
+ memory_failcnt = memcg->memory.failcnt;
+
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
- K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
+ K((u64)READ_ONCE(memcg->memory.max)), memory_failcnt);
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->swap)),
- K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
+ K((u64)READ_ONCE(memcg->swap.max)),
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
#ifdef CONFIG_MEMCG_V1
else {
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
@@ -1627,7 +1651,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
- ret = task_is_dying() || out_of_memory(&oc);
+ ret = out_of_memory(&oc);
unlock:
mutex_unlock(&oom_lock);
@@ -1721,28 +1745,45 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
pr_cont(" are going to be killed due to memory.oom.group set\n");
}
+/*
+ * The value of NR_MEMCG_STOCK is selected to keep the cached memcgs and their
+ * nr_pages in a single cacheline. This may change in the future.
+ */
+#define NR_MEMCG_STOCK 7
+#define FLUSHING_CACHED_CHARGE 0
struct memcg_stock_pcp {
- local_lock_t stock_lock;
- struct mem_cgroup *cached; /* this never be root cgroup */
- unsigned int nr_pages;
+ local_trylock_t lock;
+ uint8_t nr_pages[NR_MEMCG_STOCK];
+ struct mem_cgroup *cached[NR_MEMCG_STOCK];
+ struct work_struct work;
+ unsigned long flags;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
+ .lock = INIT_LOCAL_TRYLOCK(lock),
+};
+
+struct obj_stock_pcp {
+ local_trylock_t lock;
+ unsigned int nr_bytes;
struct obj_cgroup *cached_objcg;
struct pglist_data *cached_pgdat;
- unsigned int nr_bytes;
int nr_slab_reclaimable_b;
int nr_slab_unreclaimable_b;
struct work_struct work;
unsigned long flags;
-#define FLUSHING_CACHED_CHARGE 0
};
-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
- .stock_lock = INIT_LOCAL_LOCK(stock_lock),
+
+static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = {
+ .lock = INIT_LOCAL_TRYLOCK(lock),
};
+
static DEFINE_MUTEX(percpu_charge_mutex);
-static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+static void drain_obj_stock(struct obj_stock_pcp *stock);
+static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
struct mem_cgroup *root_memcg);
/**
@@ -1750,110 +1791,188 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
* @memcg: memcg to consume from.
* @nr_pages: how many pages to charge.
*
- * The charges will only happen if @memcg matches the current cpu's memcg
- * stock, and at least @nr_pages are available in that stock. Failure to
- * service an allocation will refill the stock.
+ * Consume the cached charge if enough nr_pages are present; otherwise return
+ * failure. Also fail for charge requests larger than MEMCG_CHARGE_BATCH or
+ * if the local lock is already taken.
*
* returns true if successful, false otherwise.
*/
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
- unsigned int stock_pages;
- unsigned long flags;
+ uint8_t stock_pages;
bool ret = false;
+ int i;
- if (nr_pages > MEMCG_CHARGE_BATCH)
+ if (nr_pages > MEMCG_CHARGE_BATCH ||
+ !local_trylock(&memcg_stock.lock))
return ret;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
-
stock = this_cpu_ptr(&memcg_stock);
- stock_pages = READ_ONCE(stock->nr_pages);
- if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {
- WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages);
- ret = true;
+
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ if (memcg != READ_ONCE(stock->cached[i]))
+ continue;
+
+ stock_pages = READ_ONCE(stock->nr_pages[i]);
+ if (stock_pages >= nr_pages) {
+ WRITE_ONCE(stock->nr_pages[i], stock_pages - nr_pages);
+ ret = true;
+ }
+ break;
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock(&memcg_stock.lock);
return ret;
}
+static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+}
+
/*
* Returns stocks cached in percpu and reset cached information.
*/
-static void drain_stock(struct memcg_stock_pcp *stock)
+static void drain_stock(struct memcg_stock_pcp *stock, int i)
{
- unsigned int stock_pages = READ_ONCE(stock->nr_pages);
- struct mem_cgroup *old = READ_ONCE(stock->cached);
+ struct mem_cgroup *old = READ_ONCE(stock->cached[i]);
+ uint8_t stock_pages;
if (!old)
return;
+ stock_pages = READ_ONCE(stock->nr_pages[i]);
if (stock_pages) {
- page_counter_uncharge(&old->memory, stock_pages);
- if (do_memsw_account())
- page_counter_uncharge(&old->memsw, stock_pages);
-
- WRITE_ONCE(stock->nr_pages, 0);
+ memcg_uncharge(old, stock_pages);
+ WRITE_ONCE(stock->nr_pages[i], 0);
}
css_put(&old->css);
- WRITE_ONCE(stock->cached, NULL);
+ WRITE_ONCE(stock->cached[i], NULL);
+}
+
+static void drain_stock_fully(struct memcg_stock_pcp *stock)
+{
+ int i;
+
+ for (i = 0; i < NR_MEMCG_STOCK; ++i)
+ drain_stock(stock, i);
}
-static void drain_local_stock(struct work_struct *dummy)
+static void drain_local_memcg_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
- /*
- * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
- * drain_stock races is that we always operate on local CPU stock
- * here with IRQ disabled
- */
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (WARN_ONCE(!in_task(), "drain in non-task context"))
+ return;
+
+ local_lock(&memcg_stock.lock);
stock = this_cpu_ptr(&memcg_stock);
- old = drain_obj_stock(stock);
- drain_stock(stock);
+ drain_stock_fully(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
+ local_unlock(&memcg_stock.lock);
}
-/*
- * Cache charges(val) to local per_cpu area.
- * This will be consumed by consume_stock() function, later.
- */
-static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void drain_local_obj_stock(struct work_struct *dummy)
+{
+ struct obj_stock_pcp *stock;
+
+ if (WARN_ONCE(!in_task(), "drain in non-task context"))
+ return;
+
+ local_lock(&obj_stock.lock);
+
+ stock = this_cpu_ptr(&obj_stock);
+ drain_obj_stock(stock);
+ clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
+
+ local_unlock(&obj_stock.lock);
+}
+
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
- unsigned int stock_pages;
+ struct mem_cgroup *cached;
+ uint8_t stock_pages;
+ bool success = false;
+ int empty_slot = -1;
+ int i;
+
+ /*
+ * For now, limit MEMCG_CHARGE_BATCH to 127 or less. If we decide to
+ * increase it beyond 127 in the future, we will need more careful
+ * handling of nr_pages[] in struct memcg_stock_pcp.
+ */
+ BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S8_MAX);
+
+ VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));
+
+ if (nr_pages > MEMCG_CHARGE_BATCH ||
+ !local_trylock(&memcg_stock.lock)) {
+ /*
+ * In case of a larger-than-batch refill or an unlikely failure to
+ * lock the per-CPU memcg_stock.lock, uncharge the memcg directly.
+ */
+ memcg_uncharge(memcg, nr_pages);
+ return;
+ }
stock = this_cpu_ptr(&memcg_stock);
- if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
- drain_stock(stock);
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ cached = READ_ONCE(stock->cached[i]);
+ if (!cached && empty_slot == -1)
+ empty_slot = i;
+ if (memcg == READ_ONCE(stock->cached[i])) {
+ stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages;
+ WRITE_ONCE(stock->nr_pages[i], stock_pages);
+ if (stock_pages > MEMCG_CHARGE_BATCH)
+ drain_stock(stock, i);
+ success = true;
+ break;
+ }
+ }
+
+ if (!success) {
+ i = empty_slot;
+ if (i == -1) {
+ i = get_random_u32_below(NR_MEMCG_STOCK);
+ drain_stock(stock, i);
+ }
css_get(&memcg->css);
- WRITE_ONCE(stock->cached, memcg);
+ WRITE_ONCE(stock->cached[i], memcg);
+ WRITE_ONCE(stock->nr_pages[i], nr_pages);
}
- stock_pages = READ_ONCE(stock->nr_pages) + nr_pages;
- WRITE_ONCE(stock->nr_pages, stock_pages);
- if (stock_pages > MEMCG_CHARGE_BATCH)
- drain_stock(stock);
+ local_unlock(&memcg_stock.lock);
}
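With NR_MEMCG_STOCK slots per CPU, consume_stock() only hits a slot whose cached memcg matches, and refill_stock() reuses a matching or empty slot, evicting a random victim when the array is full. A simplified single-CPU model of that slot handling (names, sizes, and the eviction detail are illustrative; the batch-overflow drain is omitted):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_SLOTS 7                      /* stand-in for NR_MEMCG_STOCK */

struct stock {
        const char *cached[NR_SLOTS];   /* cached memcg, modeled by identity */
        uint8_t nr_pages[NR_SLOTS];
};

/* Consume pages only if this memcg is cached with enough stock. */
static bool consume(struct stock *st, const char *memcg, unsigned int nr)
{
        for (int i = 0; i < NR_SLOTS; i++) {
                if (st->cached[i] != memcg)
                        continue;
                if (st->nr_pages[i] >= nr) {
                        st->nr_pages[i] -= nr;
                        return true;
                }
                return false;
        }
        return false;
}

/* Return leftover charge to a matching or empty slot, evicting if needed. */
static void refill(struct stock *st, const char *memcg, unsigned int nr)
{
        int empty = -1;

        for (int i = 0; i < NR_SLOTS; i++) {
                if (!st->cached[i] && empty == -1)
                        empty = i;
                if (st->cached[i] == memcg) {
                        st->nr_pages[i] += nr;
                        return;
                }
        }
        if (empty == -1) {
                empty = rand() % NR_SLOTS;      /* evict a random victim */
                st->nr_pages[empty] = 0;        /* real code uncharges it first */
        }
        st->cached[empty] = memcg;
        st->nr_pages[empty] = nr;
}

int main(void)
{
        struct stock st = { { NULL } };
        const char *memcg_a = "A", *memcg_b = "B";

        refill(&st, memcg_a, 32);
        printf("consume A(16): %d\n", consume(&st, memcg_a, 16));
        printf("consume B(1):  %d\n", consume(&st, memcg_b, 1));
        return 0;
}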
-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
{
- unsigned long flags;
+ struct mem_cgroup *memcg;
+ bool flush = false;
+ int i;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
- __refill_stock(memcg, nr_pages);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ rcu_read_lock();
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ memcg = READ_ONCE(stock->cached[i]);
+ if (!memcg)
+ continue;
+
+ if (READ_ONCE(stock->nr_pages[i]) &&
+ mem_cgroup_is_descendant(memcg, root_memcg)) {
+ flush = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return flush;
}
/*
@@ -1876,25 +1995,27 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
migrate_disable();
curcpu = smp_processor_id();
for_each_online_cpu(cpu) {
- struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
- struct mem_cgroup *memcg;
- bool flush = false;
+ struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu);
+ struct obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu);
- rcu_read_lock();
- memcg = READ_ONCE(stock->cached);
- if (memcg && READ_ONCE(stock->nr_pages) &&
- mem_cgroup_is_descendant(memcg, root_memcg))
- flush = true;
- else if (obj_stock_flush_required(stock, root_memcg))
- flush = true;
- rcu_read_unlock();
+ if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) &&
+ is_memcg_drain_needed(memcg_st, root_memcg) &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE,
+ &memcg_st->flags)) {
+ if (cpu == curcpu)
+ drain_local_memcg_stock(&memcg_st->work);
+ else if (!cpu_is_isolated(cpu))
+ schedule_work_on(cpu, &memcg_st->work);
+ }
- if (flush &&
- !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) &&
+ obj_stock_flush_required(obj_st, root_memcg) &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE,
+ &obj_st->flags)) {
if (cpu == curcpu)
- drain_local_stock(&stock->work);
+ drain_local_obj_stock(&obj_st->work);
else if (!cpu_is_isolated(cpu))
- schedule_work_on(cpu, &stock->work);
+ schedule_work_on(cpu, &obj_st->work);
}
}
migrate_enable();
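drain_all_stock() now queues a per-CPU drain work item only when FLUSHING_CACHED_CHARGE is not already set, using test-and-set semantics so concurrent callers cannot double-schedule the same work. A rough userspace analogue of that guard (CPU count and names are made up):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4                       /* made-up CPU count for the model */

/* One flag per CPU stock, like FLUSHING_CACHED_CHARGE in stock->flags. */
static atomic_bool draining[NR_CPUS];

/* Queue a drain only if one is not already pending for this CPU. */
static bool queue_drain(int cpu)
{
        /* exchange returns the old value: false means we won the race */
        if (atomic_exchange(&draining[cpu], true))
                return false;
        printf("scheduling drain on cpu %d\n", cpu);
        return true;
}

/* Completion path, like clear_bit(FLUSHING_CACHED_CHARGE, ...). */
static void drain_done(int cpu)
{
        atomic_store(&draining[cpu], false);
}

int main(void)
{
        queue_drain(1);
        queue_drain(1);                 /* skipped: already pending */
        drain_done(1);
        queue_drain(1);                 /* queued again after completion */
        return 0;
}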
@@ -1903,10 +2024,9 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
- struct memcg_stock_pcp *stock;
-
- stock = &per_cpu(memcg_stock, cpu);
- drain_stock(stock);
+ /* no need for the local lock */
+ drain_obj_stock(&per_cpu(obj_stock, cpu));
+ drain_stock_fully(&per_cpu(memcg_stock, cpu));
return 0;
}
@@ -2181,8 +2301,8 @@ out:
css_put(&memcg->css);
}
-int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages)
+static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages)
{
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MAX_RECLAIM_RETRIES;
@@ -2199,6 +2319,10 @@ retry:
if (consume_stock(memcg, nr_pages))
return 0;
+ if (!gfpflags_allow_spinning(gfp_mask))
+ /* Avoid the refill and flush of the older stock */
+ batch = nr_pages;
+
if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
@@ -2371,19 +2495,13 @@ done_restock:
return 0;
}
-/**
- * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call.
- * @memcg: memcg previously charged.
- * @nr_pages: number of pages previously charged.
- */
-void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
+static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages)
{
if (mem_cgroup_is_root(memcg))
- return;
+ return 0;
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
+ return try_charge_memcg(memcg, gfp_mask, nr_pages);
}
static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
@@ -2399,29 +2517,47 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
folio->memcg_data = (unsigned long)memcg;
}
-/**
- * mem_cgroup_commit_charge - commit a previously successful try_charge().
- * @folio: folio to commit the charge to.
- * @memcg: memcg previously charged.
- */
-void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
{
- css_get(&memcg->css);
- commit_charge(folio, memcg);
- memcg1_commit_charge(folio, memcg);
+ struct lruvec *lruvec;
+
+ if (likely(!in_nmi())) {
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+ } else {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id];
+
+ /* TODO: add to cgroup update tree once it is nmi-safe. */
+ if (idx == NR_SLAB_RECLAIMABLE_B)
+ atomic_add(nr, &pn->slab_reclaimable);
+ else
+ atomic_add(nr, &pn->slab_unreclaimable);
+ }
}
+#else
+static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+}
+#endif
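account_slab_nmi_safe() keeps the normal per-CPU update on the common path but, from NMI context, parks the delta in a plain atomic that flush_nmi_stats() later folds back in. A rough defer-and-fold analogue, with in_nmi() replaced by a flag (all names illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct counters {
        long state;                     /* the "real" stat on the normal path */
        atomic_long deferred;           /* side counter for unsafe contexts */
};

/* Update directly when safe; otherwise defer to the atomic side counter. */
static void account(struct counters *c, long delta, bool unsafe_ctx)
{
        if (!unsafe_ctx)
                c->state += delta;
        else
                atomic_fetch_add(&c->deferred, delta);
}

/* Flush path: fold any deferred updates back into the stat. */
static void flush_deferred(struct counters *c)
{
        c->state += atomic_exchange(&c->deferred, 0);
}

int main(void)
{
        struct counters c;

        c.state = 0;
        atomic_init(&c.deferred, 0);
        account(&c, 4, false);
        account(&c, 8, true);           /* pretend this ran in NMI context */
        flush_deferred(&c);
        printf("state=%ld deferred=%ld\n", c.state, atomic_load(&c.deferred));
        return 0;
}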
-static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
+static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
struct mem_cgroup *memcg;
- struct lruvec *lruvec;
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- __mod_memcg_lruvec_state(lruvec, idx, nr);
+ account_slab_nmi_safe(memcg, pgdat, idx, nr);
rcu_read_unlock();
}
@@ -2546,6 +2682,9 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
+ if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi())
+ return NULL;
+
if (in_task()) {
memcg = current->active_memcg;
if (unlikely(memcg))
@@ -2608,6 +2747,23 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
return objcg;
}
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+{
+ if (likely(!in_nmi())) {
+ mod_memcg_state(memcg, MEMCG_KMEM, val);
+ } else {
+ /* TODO: add to cgroup update tree once it is nmi-safe. */
+ atomic_add(val, &memcg->kmem_stat);
+ }
+}
+#else
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+{
+ mod_memcg_state(memcg, MEMCG_KMEM, val);
+}
+#endif
+
/*
* obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
* @objcg: object cgroup to uncharge
@@ -2620,9 +2776,10 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
memcg = get_mem_cgroup_from_objcg(objcg);
- mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
+ account_kmem_nmi_safe(memcg, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
- refill_stock(memcg, nr_pages);
+ if (!mem_cgroup_is_root(memcg))
+ refill_stock(memcg, nr_pages);
css_put(&memcg->css);
}
@@ -2647,7 +2804,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
if (ret)
goto out;
- mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
+ account_kmem_nmi_safe(memcg, nr_pages);
memcg1_account_kmem(memcg, nr_pages);
out:
css_put(&memcg->css);
@@ -2655,6 +2812,23 @@ out:
return ret;
}
+static struct obj_cgroup *page_objcg(const struct page *page)
+{
+ unsigned long memcg_data = page->memcg_data;
+
+ if (mem_cgroup_disabled() || !memcg_data)
+ return NULL;
+
+ VM_BUG_ON_PAGE((memcg_data & OBJEXTS_FLAGS_MASK) != MEMCG_DATA_KMEM,
+ page);
+ return (struct obj_cgroup *)(memcg_data - MEMCG_DATA_KMEM);
+}
+
+static void page_set_objcg(struct page *page, const struct obj_cgroup *objcg)
+{
+ page->memcg_data = (unsigned long)objcg | MEMCG_DATA_KMEM;
+}
+
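page_objcg() and page_set_objcg() encode the obj_cgroup pointer together with MEMCG_DATA_KMEM in page->memcg_data, relying on pointer alignment to keep the low flag bits free. A small userspace sketch of the same pointer-tagging trick (flag value and names are illustrative):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DATA_KMEM ((uintptr_t)1)        /* stand-in for MEMCG_DATA_KMEM */

struct objcg { int id; };

/* Encode the pointer plus a low-bit flag in one word, like page->memcg_data. */
static uintptr_t set_objcg(struct objcg *objcg)
{
        return (uintptr_t)objcg | DATA_KMEM;
}

/* Decode only if the kmem flag is set; return NULL otherwise. */
static struct objcg *get_objcg(uintptr_t memcg_data)
{
        if (!(memcg_data & DATA_KMEM))
                return NULL;
        return (struct objcg *)(memcg_data - DATA_KMEM);
}

int main(void)
{
        static struct objcg cg = { .id = 42 }; /* aligned: low bit is clear */
        uintptr_t data = set_objcg(&cg);

        assert(get_objcg(data) == &cg);
        printf("objcg id: %d\n", get_objcg(data)->id);
        return 0;
}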
/**
* __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
* @page: page to charge
@@ -2673,8 +2847,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
if (!ret) {
obj_cgroup_get(objcg);
- page->memcg_data = (unsigned long)objcg |
- MEMCG_DATA_KMEM;
+ page_set_objcg(page, objcg);
return 0;
}
}
@@ -2688,53 +2861,38 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
*/
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
- struct folio *folio = page_folio(page);
- struct obj_cgroup *objcg;
+ struct obj_cgroup *objcg = page_objcg(page);
unsigned int nr_pages = 1 << order;
- if (!folio_memcg_kmem(folio))
+ if (!objcg)
return;
- objcg = __folio_objcg(folio);
obj_cgroup_uncharge_pages(objcg, nr_pages);
- folio->memcg_data = 0;
+ page->memcg_data = 0;
obj_cgroup_put(objcg);
}
-static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
- enum node_stat_item idx, int nr)
+static void __account_obj_stock(struct obj_cgroup *objcg,
+ struct obj_stock_pcp *stock, int nr,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
int *bytes;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
- stock = this_cpu_ptr(&memcg_stock);
-
/*
* Save vmstat data in stock and skip vmstat array update unless
- * accumulating over a page of vmstat data or when pgdat or idx
- * changes.
+ * accumulating over a page of vmstat data or when pgdat changes.
*/
- if (READ_ONCE(stock->cached_objcg) != objcg) {
- old = drain_obj_stock(stock);
- obj_cgroup_get(objcg);
- stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
- ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
- WRITE_ONCE(stock->cached_objcg, objcg);
- stock->cached_pgdat = pgdat;
- } else if (stock->cached_pgdat != pgdat) {
+ if (stock->cached_pgdat != pgdat) {
/* Flush the existing cached vmstat data */
struct pglist_data *oldpg = stock->cached_pgdat;
if (stock->nr_slab_reclaimable_b) {
- __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
+ mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
+ mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
}
@@ -2760,37 +2918,38 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
}
}
if (nr)
- __mod_objcg_mlstate(objcg, pgdat, idx, nr);
-
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
+ mod_objcg_mlstate(objcg, pgdat, idx, nr);
}
-static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- unsigned long flags;
+ struct obj_stock_pcp *stock;
bool ret = false;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!local_trylock(&obj_stock.lock))
+ return ret;
- stock = this_cpu_ptr(&memcg_stock);
+ stock = this_cpu_ptr(&obj_stock);
if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
+
+ if (pgdat)
+ __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx);
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock(&obj_stock.lock);
return ret;
}
-static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
+static void drain_obj_stock(struct obj_stock_pcp *stock)
{
struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
if (!old)
- return NULL;
+ return;
if (stock->nr_bytes) {
unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
@@ -2803,7 +2962,8 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
- __refill_stock(memcg, nr_pages);
+ if (!mem_cgroup_is_root(memcg))
+ memcg_uncharge(memcg, nr_pages);
css_put(&memcg->css);
}
@@ -2827,13 +2987,13 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
*/
if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
if (stock->nr_slab_reclaimable_b) {
- __mod_objcg_mlstate(old, stock->cached_pgdat,
+ mod_objcg_mlstate(old, stock->cached_pgdat,
NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- __mod_objcg_mlstate(old, stock->cached_pgdat,
+ mod_objcg_mlstate(old, stock->cached_pgdat,
NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
@@ -2842,67 +3002,76 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
}
WRITE_ONCE(stock->cached_objcg, NULL);
- /*
- * The `old' objects needs to be released by the caller via
- * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
- */
- return old;
+ obj_cgroup_put(old);
}
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
struct mem_cgroup *root_memcg)
{
struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
struct mem_cgroup *memcg;
+ bool flush = false;
+ rcu_read_lock();
if (objcg) {
memcg = obj_cgroup_memcg(objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
- return true;
+ flush = true;
}
+ rcu_read_unlock();
- return false;
+ return flush;
}
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
- bool allow_uncharge)
+ bool allow_uncharge, int nr_acct, struct pglist_data *pgdat,
+ enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
+ struct obj_stock_pcp *stock;
unsigned int nr_pages = 0;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!local_trylock(&obj_stock.lock)) {
+ if (pgdat)
+ mod_objcg_mlstate(objcg, pgdat, idx, nr_bytes);
+ nr_pages = nr_bytes >> PAGE_SHIFT;
+ nr_bytes = nr_bytes & (PAGE_SIZE - 1);
+ atomic_add(nr_bytes, &objcg->nr_charged_bytes);
+ goto out;
+ }
- stock = this_cpu_ptr(&memcg_stock);
+ stock = this_cpu_ptr(&obj_stock);
if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
- old = drain_obj_stock(stock);
+ drain_obj_stock(stock);
obj_cgroup_get(objcg);
- WRITE_ONCE(stock->cached_objcg, objcg);
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ WRITE_ONCE(stock->cached_objcg, objcg);
+
allow_uncharge = true; /* Allow uncharge when objcg changes */
}
stock->nr_bytes += nr_bytes;
+ if (pgdat)
+ __account_obj_stock(objcg, stock, nr_acct, pgdat, idx);
+
if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
nr_pages = stock->nr_bytes >> PAGE_SHIFT;
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
-
+ local_unlock(&obj_stock.lock);
+out:
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
}
-int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
unsigned int nr_pages, nr_bytes;
int ret;
- if (consume_obj_stock(objcg, size))
+ if (likely(consume_obj_stock(objcg, size, pgdat, idx)))
return 0;
/*
@@ -2935,15 +3104,21 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
nr_pages += 1;
ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
- if (!ret && nr_bytes)
- refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
+ if (!ret && (nr_bytes || pgdat))
+ refill_obj_stock(objcg, nr_bytes ? PAGE_SIZE - nr_bytes : 0,
+ false, size, pgdat, idx);
return ret;
}
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+{
+ return obj_cgroup_charge_account(objcg, gfp, size, NULL, 0);
+}
+
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
- refill_obj_stock(objcg, size, true);
+ refill_obj_stock(objcg, size, true, 0, NULL, 0);
}
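obj_cgroup_charge_account() rounds a byte-sized request up to whole pages for the page counter and hands the remainder of the last page back to the per-CPU byte stock via refill_obj_stock(). A tiny model of that page/byte split, assuming 4 KiB pages (names illustrative):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)  /* assuming 4 KiB pages */

/*
 * Split a byte-sized charge into whole pages to charge now and the
 * remainder of the last page to leave cached in the per-CPU obj stock.
 */
static void split_charge(unsigned long size, unsigned long *nr_pages,
                         unsigned long *stock_bytes)
{
        unsigned long nr_bytes = size & (PAGE_SIZE - 1);

        *nr_pages = size >> PAGE_SHIFT;
        if (nr_bytes)
                *nr_pages += 1;         /* round up to a full page */
        *stock_bytes = nr_bytes ? PAGE_SIZE - nr_bytes : 0;
}

int main(void)
{
        unsigned long pages, leftover;

        split_charge(5000, &pages, &leftover);
        printf("charge %lu page(s), keep %lu byte(s) in stock\n",
               pages, leftover);
        return 0;
}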
static inline size_t obj_full_size(struct kmem_cache *s)
@@ -2995,23 +3170,32 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
return false;
}
- if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s)))
- return false;
-
for (i = 0; i < size; i++) {
slab = virt_to_slab(p[i]);
if (!slab_obj_exts(slab) &&
alloc_slab_obj_exts(slab, s, flags, false)) {
- obj_cgroup_uncharge(objcg, obj_full_size(s));
continue;
}
+ /*
+ * If we fail and size is 1, memcg_alloc_abort_single() will
+ * just free the object, which is OK since we have not assigned
+ * the objcg to its obj_ext yet.
+ *
+ * For larger sizes, kmem_cache_free_bulk() will uncharge
+ * any objects that were already charged and had obj_ext assigned.
+ *
+ * TODO: we could batch this until slab_pgdat(slab) changes
+ * between iterations, with a more complicated undo.
+ */
+ if (obj_cgroup_charge_account(objcg, flags, obj_full_size(s),
+ slab_pgdat(slab), cache_vmstat_idx(s)))
+ return false;
+
off = obj_to_index(s, slab, p[i]);
obj_cgroup_get(objcg);
slab_obj_exts(slab)[off].objcg = objcg;
- mod_objcg_state(objcg, slab_pgdat(slab),
- cache_vmstat_idx(s), obj_full_size(s));
}
return true;
@@ -3020,6 +3204,8 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects, struct slabobj_ext *obj_exts)
{
+ size_t obj_size = obj_full_size(s);
+
for (int i = 0; i < objects; i++) {
struct obj_cgroup *objcg;
unsigned int off;
@@ -3030,33 +3216,40 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
continue;
obj_exts[off].objcg = NULL;
- obj_cgroup_uncharge(objcg, obj_full_size(s));
- mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
- -obj_full_size(s));
+ refill_obj_stock(objcg, obj_size, true, -obj_size,
+ slab_pgdat(slab), cache_vmstat_idx(s));
obj_cgroup_put(objcg);
}
}
/*
- * Because folio_memcg(head) is not set on tails, set it now.
+ * The objcg is only set on the first page, so transfer it to all the
+ * other pages.
*/
-void split_page_memcg(struct page *head, int old_order, int new_order)
+void split_page_memcg(struct page *page, unsigned order)
{
- struct folio *folio = page_folio(head);
- int i;
- unsigned int old_nr = 1 << old_order;
- unsigned int new_nr = 1 << new_order;
+ struct obj_cgroup *objcg = page_objcg(page);
+ unsigned int i, nr = 1 << order;
- if (mem_cgroup_disabled() || !folio_memcg_charged(folio))
+ if (!objcg)
return;
- for (i = new_nr; i < old_nr; i += new_nr)
- folio_page(folio, i)->memcg_data = folio->memcg_data;
+ for (i = 1; i < nr; i++)
+ page_set_objcg(&page[i], objcg);
- if (folio_memcg_kmem(folio))
- obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1);
- else
- css_get_many(&folio_memcg(folio)->css, old_nr / new_nr - 1);
+ obj_cgroup_get_many(objcg, nr - 1);
+}
+
+void folio_split_memcg_refs(struct folio *folio, unsigned old_order,
+ unsigned new_order)
+{
+ unsigned new_refs;
+
+ if (mem_cgroup_disabled() || !folio_memcg_charged(folio))
+ return;
+
+ new_refs = (1 << (old_order - new_order)) - 1;
+ css_get_many(&__folio_memcg(folio)->css, new_refs);
}
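folio_split_memcg_refs() takes the extra css references needed when an old_order folio is split into new_order pieces: each of the 1 << (old_order - new_order) resulting folios needs its own reference, so one fewer than that is added. A quick check of the arithmetic (orders chosen as examples):

#include <stdio.h>

/* Extra memcg/css refs needed when an old_order folio is split to new_order. */
static unsigned int extra_refs(unsigned int old_order, unsigned int new_order)
{
        return (1u << (old_order - new_order)) - 1;
}

int main(void)
{
        /* Splitting a PMD-sized (order-9) folio into order-0 pages. */
        printf("order 9 -> 0: %u extra refs\n", extra_refs(9, 0));
        /* Splitting the same folio into order-2 folios instead. */
        printf("order 9 -> 2: %u extra refs\n", extra_refs(9, 2));
        return 0;
}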
unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
@@ -3384,7 +3577,7 @@ void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
refcount_add(n, &memcg->id.ref);
}
-void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
+static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
if (refcount_sub_and_test(n, &memcg->id.ref)) {
mem_cgroup_id_remove(memcg);
@@ -3399,6 +3592,24 @@ static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
mem_cgroup_id_put_many(memcg, 1);
}
+struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
+{
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
+ /*
+ * The root cgroup cannot be destroyed, so its refcount must
+ * always be >= 1.
+ */
+ if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
+ VM_BUG_ON(1);
+ break;
+ }
+ memcg = parent_mem_cgroup(memcg);
+ if (!memcg)
+ memcg = root_mem_cgroup;
+ }
+ return memcg;
+}
+
/**
* mem_cgroup_from_id - look up a memcg from a memcg id
* @id: the memcg id to look up
@@ -3434,11 +3645,22 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
}
#endif
+static void free_mem_cgroup_per_node_info(struct mem_cgroup_per_node *pn)
+{
+ if (!pn)
+ return;
+
+ free_percpu(pn->lruvec_stats_percpu);
+ kfree(pn->lruvec_stats);
+ kfree(pn);
+}
+
static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
- pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
+ pn = kmem_cache_alloc_node(memcg_pn_cachep, GFP_KERNEL | __GFP_ZERO,
+ node);
if (!pn)
return false;
@@ -3458,23 +3680,10 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
memcg->nodeinfo[node] = pn;
return true;
fail:
- kfree(pn->lruvec_stats);
- kfree(pn);
+ free_mem_cgroup_per_node_info(pn);
return false;
}
-static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
-{
- struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
-
- if (!pn)
- return;
-
- free_percpu(pn->lruvec_stats_percpu);
- kfree(pn->lruvec_stats);
- kfree(pn);
-}
-
static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
@@ -3482,7 +3691,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
obj_cgroup_put(memcg->orig_objcg);
for_each_node(node)
- free_mem_cgroup_per_node_info(memcg, node);
+ free_mem_cgroup_per_node_info(memcg->nodeinfo[node]);
memcg1_free_events(memcg);
kfree(memcg->vmstats);
free_percpu(memcg->vmstats_percpu);
@@ -3498,13 +3707,14 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
{
- struct memcg_vmstats_percpu *statc, *pstatc;
+ struct memcg_vmstats_percpu *statc;
+ struct memcg_vmstats_percpu __percpu *pstatc_pcpu;
struct mem_cgroup *memcg;
int node, cpu;
int __maybe_unused i;
long error;
- memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
+ memcg = kmem_cache_zalloc(memcg_cachep, GFP_KERNEL);
if (!memcg)
return ERR_PTR(-ENOMEM);
@@ -3529,9 +3739,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
for_each_possible_cpu(cpu) {
if (parent)
- pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu);
+ pstatc_pcpu = parent->vmstats_percpu;
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
- statc->parent = parent ? pstatc : NULL;
+ statc->parent_pcpu = parent ? pstatc_pcpu : NULL;
statc->vmstats = memcg->vmstats;
}
@@ -3575,6 +3785,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
struct mem_cgroup *memcg, *old_memcg;
+ bool memcg_on_dfl = cgroup_subsys_on_dfl(memory_cgrp_subsys);
old_memcg = set_active_memcg(parent);
memcg = mem_cgroup_alloc(parent);
@@ -3592,9 +3803,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent) {
WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
- page_counter_init(&memcg->memory, &parent->memory, true);
+ page_counter_init(&memcg->memory, &parent->memory, memcg_on_dfl);
page_counter_init(&memcg->swap, &parent->swap, false);
#ifdef CONFIG_MEMCG_V1
+ memcg->memory.track_failcnt = !memcg_on_dfl;
WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
page_counter_init(&memcg->kmem, &parent->kmem, false);
page_counter_init(&memcg->tcpmem, &parent->tcpmem, false);
@@ -3612,7 +3824,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &memcg->css;
}
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
+ if (memcg_on_dfl && !cgroup_memory_nosocket)
static_branch_inc(&memcg_sockets_enabled_key);
if (!cgroup_memory_nobpf)
@@ -3803,6 +4015,53 @@ static void mem_cgroup_stat_aggregate(struct aggregate_control *ac)
}
}
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
+ int cpu)
+{
+ int nid;
+
+ if (atomic_read(&memcg->kmem_stat)) {
+ int kmem = atomic_xchg(&memcg->kmem_stat, 0);
+ int index = memcg_stats_index(MEMCG_KMEM);
+
+ memcg->vmstats->state[index] += kmem;
+ if (parent)
+ parent->vmstats->state_pending[index] += kmem;
+ }
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ struct lruvec_stats *lstats = pn->lruvec_stats;
+ struct lruvec_stats *plstats = NULL;
+
+ if (parent)
+ plstats = parent->nodeinfo[nid]->lruvec_stats;
+
+ if (atomic_read(&pn->slab_reclaimable)) {
+ int slab = atomic_xchg(&pn->slab_reclaimable, 0);
+ int index = memcg_stats_index(NR_SLAB_RECLAIMABLE_B);
+
+ lstats->state[index] += slab;
+ if (plstats)
+ plstats->state_pending[index] += slab;
+ }
+ if (atomic_read(&pn->slab_unreclaimable)) {
+ int slab = atomic_xchg(&pn->slab_unreclaimable, 0);
+ int index = memcg_stats_index(NR_SLAB_UNRECLAIMABLE_B);
+
+ lstats->state[index] += slab;
+ if (plstats)
+ plstats->state_pending[index] += slab;
+ }
+ }
+}
+#else
+static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
+ int cpu)
+{}
+#endif
+
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -3811,6 +4070,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
struct aggregate_control ac;
int nid;
+ flush_nmi_stats(memcg, parent, cpu);
+
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
ac = (struct aggregate_control) {
@@ -3860,8 +4121,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
}
WRITE_ONCE(statc->stats_updates, 0);
/* We are in a per-cpu loop here, only do the atomic write once */
- if (atomic64_read(&memcg->vmstats->stats_updates))
- atomic64_set(&memcg->vmstats->stats_updates, 0);
+ if (atomic_read(&memcg->vmstats->stats_updates))
+ atomic_set(&memcg->vmstats->stats_updates, 0);
}
static void mem_cgroup_fork(struct task_struct *task)
@@ -4014,7 +4275,7 @@ static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
WRITE_ONCE(peer_ctx->value, usage);
/* initial write, register watcher */
- if (ofp->value == -1)
+ if (ofp->value == OFP_PEAK_UNSET)
list_add(&ofp->list, watchers);
WRITE_ONCE(ofp->value, usage);
@@ -4102,6 +4363,9 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
page_counter_set_high(&memcg->memory, high);
+ if (of->file->f_flags & O_NONBLOCK)
+ goto out;
+
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
unsigned long reclaimed;
@@ -4124,7 +4388,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (!reclaimed && !nr_retries--)
break;
}
-
+out:
memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -4151,6 +4415,9 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
xchg(&memcg->memory.max, max);
+ if (of->file->f_flags & O_NONBLOCK)
+ goto out;
+
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
@@ -4176,8 +4443,9 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
memcg_memory_event(memcg, MEMCG_OOM);
if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
break;
+ cond_resched();
}
-
+out:
memcg_wb_domain_size_changed(memcg);
return nbytes;
}
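With the O_NONBLOCK handling above, a writer can update memory.high or memory.max and return immediately instead of waiting for the synchronous reclaim/OOM loop. A hedged userspace example (the cgroup path and limit are placeholders):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* Placeholder path: point this at the target cgroup. */
        const char *path = "/sys/fs/cgroup/mygroup/memory.max";
        const char *limit = "1073741824\n";             /* 1 GiB */
        int fd = open(path, O_WRONLY | O_NONBLOCK);     /* skip sync reclaim */

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, limit, strlen(limit)) < 0)
                perror("write");
        close(fd);
        return 0;
}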
@@ -4300,11 +4568,13 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
enum {
MEMORY_RECLAIM_SWAPPINESS = 0,
+ MEMORY_RECLAIM_SWAPPINESS_MAX,
MEMORY_RECLAIM_NULL,
};
static const match_table_t tokens = {
{ MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
+ { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"},
{ MEMORY_RECLAIM_NULL, NULL },
};
@@ -4338,6 +4608,9 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS)
return -EINVAL;
break;
+ case MEMORY_RECLAIM_SWAPPINESS_MAX:
+ swappiness = SWAPPINESS_ANON_ONLY;
+ break;
default:
return -EINVAL;
}
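The new swappiness=max token maps to SWAPPINESS_ANON_ONLY, so a proactive reclaim request can target anonymous memory only. A hedged example of writing such a request to memory.reclaim (path and amount are placeholders):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* Placeholder path and amount: adjust for the target cgroup. */
        const char *path = "/sys/fs/cgroup/mygroup/memory.reclaim";
        const char *req = "512M swappiness=max\n";      /* anon-only reclaim */
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, req, strlen(req)) < 0)
                perror("write");
        close(fd);
        return 0;
}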
@@ -4498,7 +4771,9 @@ static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
if (ret)
goto out;
- mem_cgroup_commit_charge(folio, memcg);
+ css_get(&memcg->css);
+ commit_charge(folio, memcg);
+ memcg1_commit_charge(folio, memcg);
out:
return ret;
}
@@ -4516,38 +4791,37 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
}
/**
- * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio
- * @memcg: memcg to charge.
- * @gfp: reclaim mode.
- * @nr_pages: number of pages to charge.
- *
- * This function is called when allocating a huge page folio to determine if
- * the memcg has the capacity for it. It does not commit the charge yet,
- * as the hugetlb folio itself has not been obtained from the hugetlb pool.
+ * mem_cgroup_charge_hugetlb - charge the memcg for a hugetlb folio
+ * @folio: folio being charged
+ * @gfp: reclaim mode
*
- * Once we have obtained the hugetlb folio, we can call
- * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the
- * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect
- * of try_charge().
+ * This function is called when allocating a huge page folio, after the page has
+ * already been obtained and charged to the appropriate hugetlb cgroup
+ * controller (if it is enabled).
*
- * Returns 0 on success. Otherwise, an error code is returned.
+ * Returns -ENOMEM if the memcg is already full.
+ * Returns 0 if the charge was successful or if charging was skipped.
*/
-int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
- long nr_pages)
+int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp)
{
+ struct mem_cgroup *memcg = get_mem_cgroup_from_current();
+ int ret = 0;
+
/*
- * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation,
- * but do not attempt to commit charge later (or cancel on error) either.
+ * Even if memcg does not account for hugetlb, we still want to update
+ * system-level stats via lruvec_stat_mod_folio. Return 0 and skip
+ * charging the memcg.
*/
- if (mem_cgroup_disabled() || !memcg ||
- !cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
- !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
- return -EOPNOTSUPP;
+ if (mem_cgroup_disabled() || !memcg_accounts_hugetlb() ||
+ !memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ goto out;
- if (try_charge(memcg, gfp, nr_pages))
- return -ENOMEM;
+ if (charge_memcg(folio, memcg, gfp))
+ ret = -ENOMEM;
- return 0;
+out:
+ mem_cgroup_put(memcg);
+ return ret;
}
/**
@@ -4585,40 +4859,6 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
return ret;
}
-/*
- * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
- * @entry: the first swap entry for which the pages are charged
- * @nr_pages: number of pages which will be uncharged
- *
- * Call this function after successfully adding the charged page to swapcache.
- *
- * Note: This function assumes the page for which swap slot is being uncharged
- * is order 0 page.
- */
-void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
-{
- /*
- * Cgroup1's unified memory+swap counter has been charged with the
- * new swapcache page, finish the transfer by uncharging the swap
- * slot. The swap slot would also get uncharged when it dies, but
- * it can stick around indefinitely and we'd count the page twice
- * the entire time.
- *
- * Cgroup2 has separate resource counters for memory and swap,
- * so this is a non-issue here. Memory and swap charge lifetimes
- * correspond 1:1 to page and swap slot lifetimes: we charge the
- * page to memory here, and uncharge swap when the slot is freed.
- */
- if (!mem_cgroup_disabled() && do_memsw_account()) {
- /*
- * The swap entry might not get freed for a long time,
- * let's not wait for it. The page already received a
- * memory+swap charge, drop the swap entry duplicate.
- */
- mem_cgroup_uncharge_swap(entry, nr_pages);
- }
-}
-
struct uncharge_gather {
struct mem_cgroup *memcg;
unsigned long nr_memory;
@@ -4635,9 +4875,7 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
static void uncharge_batch(const struct uncharge_gather *ug)
{
if (ug->nr_memory) {
- page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
- if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
+ memcg_uncharge(ug->memcg, ug->nr_memory);
if (ug->nr_kmem) {
mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem);
memcg1_account_kmem(ug->memcg, -ug->nr_kmem);
@@ -4869,7 +5107,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return memcg1_charge_skmem(memcg, nr_pages, gfp_mask);
- if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
+ if (try_charge_memcg(memcg, gfp_mask, nr_pages) == 0) {
mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
return true;
}
@@ -4913,15 +5151,16 @@ static int __init cgroup_memory(char *s)
__setup("cgroup.memory=", cgroup_memory);
/*
- * subsys_initcall() for memory controller.
+ * Memory controller init, called before cgroup_init() initializes root_mem_cgroup.
*
* Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
* context because of lock dependencies (cgroup_lock -> cpu hotplug) but
* basically everything that doesn't depend on a specific mem_cgroup structure
* should be initialized from here.
*/
-static int __init mem_cgroup_init(void)
+int __init mem_cgroup_init(void)
{
+ unsigned int memcg_size;
int cpu;
/*
@@ -4935,92 +5174,24 @@ static int __init mem_cgroup_init(void)
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
- drain_local_stock);
-
- return 0;
-}
-subsys_initcall(mem_cgroup_init);
-
-#ifdef CONFIG_SWAP
-static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
-{
- while (!refcount_inc_not_zero(&memcg->id.ref)) {
- /*
- * The root cgroup cannot be destroyed, so it's refcount must
- * always be >= 1.
- */
- if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
- VM_BUG_ON(1);
- break;
- }
- memcg = parent_mem_cgroup(memcg);
- if (!memcg)
- memcg = root_mem_cgroup;
+ drain_local_memcg_stock);
+ INIT_WORK(&per_cpu_ptr(&obj_stock, cpu)->work,
+ drain_local_obj_stock);
}
- return memcg;
-}
-
-/**
- * mem_cgroup_swapout - transfer a memsw charge to swap
- * @folio: folio whose memsw charge to transfer
- * @entry: swap entry to move the charge to
- *
- * Transfer the memsw charge of @folio to @entry.
- */
-void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
-{
- struct mem_cgroup *memcg, *swap_memcg;
- unsigned int nr_entries;
- unsigned short oldid;
-
- VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
-
- if (mem_cgroup_disabled())
- return;
-
- if (!do_memsw_account())
- return;
-
- memcg = folio_memcg(folio);
-
- VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
- if (!memcg)
- return;
-
- /*
- * In case the memcg owning these pages has been offlined and doesn't
- * have an ID allocated to it anymore, charge the closest online
- * ancestor for the swap instead and transfer the memory+swap charge.
- */
- swap_memcg = mem_cgroup_id_get_online(memcg);
- nr_entries = folio_nr_pages(folio);
- /* Get references for the tail pages, too */
- if (nr_entries > 1)
- mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
- oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
- nr_entries);
- VM_BUG_ON_FOLIO(oldid, folio);
- mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- folio_unqueue_deferred_split(folio);
- folio->memcg_data = 0;
+ memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids);
+ memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0,
+ SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
- if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memory, nr_entries);
+ memcg_pn_cachep = KMEM_CACHE(mem_cgroup_per_node,
+ SLAB_PANIC | SLAB_HWCACHE_ALIGN);
- if (memcg != swap_memcg) {
- if (!mem_cgroup_is_root(swap_memcg))
- page_counter_charge(&swap_memcg->memsw, nr_entries);
- page_counter_uncharge(&memcg->memsw, nr_entries);
- }
-
- memcg1_swapout(folio, memcg);
- css_put(&memcg->css);
+ return 0;
}
+#ifdef CONFIG_SWAP
/**
* __mem_cgroup_try_charge_swap - try charging swap space for a folio
* @folio: folio being added to swap
@@ -5035,7 +5206,6 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
unsigned int nr_pages = folio_nr_pages(folio);
struct page_counter *counter;
struct mem_cgroup *memcg;
- unsigned short oldid;
if (do_memsw_account())
return 0;
@@ -5064,10 +5234,10 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
/* Get references for the tail pages, too */
if (nr_pages > 1)
mem_cgroup_id_get_many(memcg, nr_pages - 1);
- oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
- VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
+ swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
+
return 0;
}
@@ -5081,7 +5251,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
struct mem_cgroup *memcg;
unsigned short id;
- id = swap_cgroup_record(entry, 0, nr_pages);
+ id = swap_cgroup_clear(entry, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
@@ -5473,3 +5643,8 @@ static int __init mem_cgroup_swap_init(void)
subsys_initcall(mem_cgroup_swap_init);
#endif /* CONFIG_SWAP */
+
+bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+{
+ return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
+}
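
For the swap-accounting side, the hunks above replace the old swap_cgroup_record(entry, id, nr) / record-to-zero pattern with swap_cgroup_record(folio, id, entry) at charge time and swap_cgroup_clear(entry, nr) at uncharge time. A toy standalone model of that record/clear split, assuming nothing beyond what the diff shows (the flat array is a stand-in for the real per-swap-device map):

#include <stdio.h>

#define NR_SLOTS 16

static unsigned short owner[NR_SLOTS];      /* 0 means "no owner" */

/* Stamp the owning memcg ID on a run of swap slots at charge time. */
static void record(unsigned int first, unsigned int nr, unsigned short id)
{
        for (unsigned int i = 0; i < nr; i++)
                owner[first + i] = id;
}

/* Wipe the slots at uncharge time and hand back the old ID for fixups. */
static unsigned short clear(unsigned int first, unsigned int nr)
{
        unsigned short id = owner[first];

        for (unsigned int i = 0; i < nr; i++)
                owner[first + i] = 0;
        return id;
}

int main(void)
{
        record(4, 4, 7);                     /* charge a 4-slot (large folio) entry */
        printf("uncharge got id %u\n", clear(4, 4));
        return 0;
}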
diff --git a/mm/memfd.c b/mm/memfd.c
index c17c3ea701a1..ab367e61553d 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -20,6 +20,7 @@
#include <linux/memfd.h>
#include <linux/pid_namespace.h>
#include <uapi/linux/memfd.h>
+#include "swap.h"
/*
* We need a tag: a new tag would expand every xa_node by 8 bytes,
@@ -259,7 +260,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals)
}
/*
- * SEAL_EXEC implys SEAL_WRITE, making W^X from the start.
+ * SEAL_EXEC implies SEAL_WRITE, making W^X from the start.
*/
if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;
@@ -327,15 +328,51 @@ static int check_sysctl_memfd_noexec(unsigned int *flags)
return 0;
}
-SYSCALL_DEFINE2(memfd_create,
- const char __user *, uname,
- unsigned int, flags)
+static inline bool is_write_sealed(unsigned int seals)
{
- unsigned int *file_seals;
- struct file *file;
- int fd, error;
- char *name;
- long len;
+ return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
+}
+
+static int check_write_seal(unsigned long *vm_flags_ptr)
+{
+ unsigned long vm_flags = *vm_flags_ptr;
+ unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE);
+
+ /* If a private mapping then writability is irrelevant. */
+ if (!(mask & VM_SHARED))
+ return 0;
+
+ /*
+ * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+ * write seals are active.
+ */
+ if (mask & VM_WRITE)
+ return -EPERM;
+
+ /*
+ * This is a read-only mapping, disallow mprotect() from making a
+ * write-sealed mapping writable in future.
+ */
+ *vm_flags_ptr &= ~VM_MAYWRITE;
+
+ return 0;
+}
+
+int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr)
+{
+ int err = 0;
+ unsigned int *seals_ptr = memfd_file_seals_ptr(file);
+ unsigned int seals = seals_ptr ? *seals_ptr : 0;
+
+ if (is_write_sealed(seals))
+ err = check_write_seal(vm_flags_ptr);
+
+ return err;
+}
+
+static int sanitize_flags(unsigned int *flags_ptr)
+{
+ unsigned int flags = *flags_ptr;
if (!(flags & MFD_HUGETLB)) {
if (flags & ~(unsigned int)MFD_ALL_FLAGS)
@@ -351,50 +388,52 @@ SYSCALL_DEFINE2(memfd_create,
if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
return -EINVAL;
- error = check_sysctl_memfd_noexec(&flags);
- if (error < 0)
- return error;
+ return check_sysctl_memfd_noexec(flags_ptr);
+}
- /* length includes terminating zero */
- len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
- if (len <= 0)
- return -EFAULT;
- if (len > MFD_NAME_MAX_LEN + 1)
- return -EINVAL;
+static char *alloc_name(const char __user *uname)
+{
+ int error;
+ char *name;
+ long len;
- name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
+ name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
if (!name)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
strcpy(name, MFD_NAME_PREFIX);
- if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
+ /* returned length does not include terminating zero */
+ len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1);
+ if (len < 0) {
error = -EFAULT;
goto err_name;
- }
-
- /* terminating-zero may have changed after strnlen_user() returned */
- if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
- error = -EFAULT;
+ } else if (len > MFD_NAME_MAX_LEN) {
+ error = -EINVAL;
goto err_name;
}
- fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
- if (fd < 0) {
- error = fd;
- goto err_name;
- }
+ return name;
+
+err_name:
+ kfree(name);
+ return ERR_PTR(error);
+}
+
+static struct file *alloc_file(const char *name, unsigned int flags)
+{
+ unsigned int *file_seals;
+ struct file *file;
if (flags & MFD_HUGETLB) {
file = hugetlb_file_setup(name, 0, VM_NORESERVE,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
- } else
+ } else {
file = shmem_file_setup(name, 0, VM_NORESERVE);
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto err_fd;
}
+ if (IS_ERR(file))
+ return file;
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_LARGEFILE;
@@ -414,6 +453,37 @@ SYSCALL_DEFINE2(memfd_create,
*file_seals &= ~F_SEAL_SEAL;
}
+ return file;
+}
+
+SYSCALL_DEFINE2(memfd_create,
+ const char __user *, uname,
+ unsigned int, flags)
+{
+ struct file *file;
+ int fd, error;
+ char *name;
+
+ error = sanitize_flags(&flags);
+ if (error < 0)
+ return error;
+
+ name = alloc_name(uname);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
+ if (fd < 0) {
+ error = fd;
+ goto err_name;
+ }
+
+ file = alloc_file(name, flags);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto err_fd;
+ }
+
fd_install(fd, file);
kfree(name);
return fd;
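
The memfd write-seal handling factored out above (is_write_sealed()/check_write_seal()) is observable from userspace. A hedged test sketch, assuming glibc >= 2.27 for memfd_create() and the F_SEAL_* definitions; the errno values noted in the comments (EPERM for the shared writable mmap, a failure such as EACCES for the later mprotect() upgrade because VM_MAYWRITE was cleared) are the expected outcomes, not guaranteed on every kernel:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = memfd_create("seal-demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);
        void *ro;

        if (fd < 0 || ftruncate(fd, 4096) || fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE)) {
                perror("setup");
                return 1;
        }

        /* A shared writable mapping of a write-sealed memfd should be refused. */
        if (mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0) == MAP_FAILED)
                printf("shared PROT_WRITE mmap: %s (expected EPERM)\n", strerror(errno));

        /* A read-only shared mapping is fine, but cannot be upgraded later. */
        ro = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
        if (ro != MAP_FAILED && mprotect(ro, 4096, PROT_READ | PROT_WRITE))
                printf("mprotect upgrade: %s (expected to fail, VM_MAYWRITE cleared)\n",
                       strerror(errno));
        return 0;
}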
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a7b8ccd29b6f..b91a33fb6c69 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -124,7 +124,7 @@ const struct attribute_group memory_failure_attr_group = {
.attrs = memory_failure_attr,
};
-static struct ctl_table memory_failure_table[] = {
+static const struct ctl_table memory_failure_table[] = {
{
.procname = "memory_failure_early_kill",
.data = &sysctl_memory_failure_early_kill,
@@ -419,18 +419,18 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
return 0;
- if (pud_devmap(*pud))
+ if (pud_trans_huge(*pud))
return PUD_SHIFT;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return 0;
- if (pmd_devmap(*pmd))
+ if (pmd_trans_huge(*pmd))
return PMD_SHIFT;
pte = pte_offset_map(pmd, address);
if (!pte)
return 0;
ptent = ptep_get(pte);
- if (pte_present(ptent) && pte_devmap(ptent))
+ if (pte_present(ptent))
ret = PAGE_SHIFT;
pte_unmap(pte);
return ret;
@@ -881,12 +881,17 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
mmap_read_lock(p->mm);
ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops,
(void *)&priv);
+ /*
+ * ret = 1 when the CMCI wins, regardless of whether try_to_unmap()
+ * succeeds or fails; the process is killed with SIGBUS.
+ * ret = 0 when the poisoned page is clean and has been dropped; no
+ * SIGBUS is needed.
+ */
if (ret == 1 && priv.tk.addr)
kill_proc(&priv.tk, pfn, flags);
- else
- ret = 0;
mmap_read_unlock(p->mm);
- return ret > 0 ? -EHWPOISON : -EFAULT;
+
+ return ret > 0 ? -EHWPOISON : 0;
}
/*
@@ -1556,11 +1561,35 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
return ret;
}
-void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
- if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
- struct address_space *mapping;
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
+ struct address_space *mapping;
+
+ if (folio_test_swapcache(folio)) {
+ pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
+ ttu &= ~TTU_HWPOISON;
+ }
+
+ /*
+ * Propagate the dirty bit from PTEs to struct page first, because we
+ * need this to decide if we should kill or just drop the page.
+ * XXX: the dirty test could be racy: set_page_dirty() may not always
+ * be called inside page lock (it's recommended but not enforced).
+ */
+ mapping = folio_mapping(folio);
+ if (!must_kill && !folio_test_dirty(folio) && mapping &&
+ mapping_can_writeback(mapping)) {
+ if (folio_mkclean(folio)) {
+ folio_set_dirty(folio);
+ } else {
+ ttu &= ~TTU_HWPOISON;
+ pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
+ pfn);
+ }
+ }
+ if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
/*
* For hugetlb folios in shared mappings, try_to_unmap
* could potentially call huge_pmd_unshare. Because of
@@ -1572,7 +1601,7 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
if (!mapping) {
pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n",
folio_pfn(folio));
- return;
+ return -EBUSY;
}
try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
@@ -1580,6 +1609,8 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
} else {
try_to_unmap(folio, ttu);
}
+
+ return folio_mapped(folio) ? -EBUSY : 0;
}
/*
@@ -1589,8 +1620,6 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
unsigned long pfn, int flags)
{
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
- struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success;
int forcekill;
@@ -1613,29 +1642,6 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
if (!folio_mapped(folio))
return true;
- if (folio_test_swapcache(folio)) {
- pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
- ttu &= ~TTU_HWPOISON;
- }
-
- /*
- * Propagate the dirty bit from PTEs to struct page first, because we
- * need this to decide if we should kill or just drop the page.
- * XXX: the dirty test could be racy: set_page_dirty() may not always
- * be called inside page lock (it's recommended but not enforced).
- */
- mapping = folio_mapping(folio);
- if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping &&
- mapping_can_writeback(mapping)) {
- if (folio_mkclean(folio)) {
- folio_set_dirty(folio);
- } else {
- ttu &= ~TTU_HWPOISON;
- pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
- pfn);
- }
- }
-
/*
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
@@ -1643,9 +1649,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
*/
collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
- unmap_poisoned_folio(folio, ttu);
-
- unmap_success = !folio_mapped(folio);
+ unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
if (!unmap_success)
pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
pfn, folio_mapcount(folio));
@@ -2211,9 +2215,13 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
* Must run in process context (e.g. a work queue) with interrupts
* enabled and no spinlocks held.
*
- * Return: 0 for successfully handled the memory error,
- * -EOPNOTSUPP for hwpoison_filter() filtered the error event,
- * < 0(except -EOPNOTSUPP) on failure.
+ * Return:
+ * 0 - success,
+ * -ENXIO - memory not managed by the kernel,
+ * -EOPNOTSUPP - hwpoison_filter() filtered the error event,
+ * -EHWPOISON - the page was already poisoned; the process may be
+ * killed,
+ * other negative values - failure.
*/
int memory_failure(unsigned long pfn, int flags)
{
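
The reworked return codes above are what a caller such as madvise(MADV_HWPOISON) ends up reporting as errno. A heavily hedged test sketch (assumptions: CAP_SYS_ADMIN, CONFIG_MEMORY_FAILURE enabled, no hwpoison filtering configured; note that this really poisons one page of the calling process, so run it only in a throwaway environment). The EHWPOISON outcome on the second injection is the expected mapping, not a guarantee:

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        p[0] = 1;                              /* make sure the page is populated */

        /* First injection: memory_failure() returning 0 shows up as success. */
        if (madvise(p, psz, MADV_HWPOISON))
                printf("first injection: %s\n", strerror(errno));
        else
                puts("first injection: page poisoned");

        /* Second injection on the same page: expected errno is EHWPOISON. */
        if (madvise(p, psz, MADV_HWPOISON))
                printf("second injection: %s\n", strerror(errno));
        return 0;
}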
diff --git a/mm/memory.c b/mm/memory.c
index 75c2dfd04f72..b0cda5aab398 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -94,14 +94,6 @@
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif
-#ifndef CONFIG_NUMA
-unsigned long max_mapnr;
-EXPORT_SYMBOL(max_mapnr);
-
-struct page *mem_map;
-EXPORT_SYMBOL(mem_map);
-#endif
-
static vm_fault_t do_fault(struct vm_fault *vmf);
static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
static bool vmf_pte_changed(struct vm_fault *vmf);
@@ -121,14 +113,6 @@ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
}
/*
- * A number of key systems in x86 including ioremap() rely on the assumption
- * that high_memory defines the upper bound on direct map memory, then end
- * of ZONE_NORMAL.
- */
-void *high_memory;
-EXPORT_SYMBOL(high_memory);
-
-/*
* Randomize the address space (stacks, mmaps, brk, etc.).
*
* ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
@@ -294,8 +278,17 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
p4d_free_tlb(tlb, p4d, start);
}
-/*
- * This function frees user-level page tables of a process.
+/**
+ * free_pgd_range - Unmap and free page tables in the range
+ * @tlb: the mmu_gather containing pending TLB flush info
+ * @addr: virtual address start
+ * @end: virtual address end
+ * @floor: lowest address boundary
+ * @ceiling: highest address boundary
+ *
+ * This function tears down all user-level page tables in the
+ * specified virtual address range [@addr..@end). It is part of
+ * the memory unmap flow.
*/
void free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
@@ -365,6 +358,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
{
struct unlink_vma_file_batch vb;
+ tlb_free_vmas(tlb);
+
do {
unsigned long addr = vma->vm_start;
struct vm_area_struct *next;
@@ -534,10 +529,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
dump_page(page, "bad pte");
pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
- pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
+ pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n",
vma->vm_file,
vma->vm_ops ? vma->vm_ops->fault : NULL,
vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
+ vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL,
mapping ? mapping->a_ops->read_folio : NULL);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
@@ -715,42 +711,53 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
}
#endif
+/**
+ * restore_exclusive_pte - Restore a device-exclusive entry
+ * @vma: VMA covering @address
+ * @folio: the mapped folio
+ * @page: the mapped folio page
+ * @address: the virtual address
+ * @ptep: pte pointer into the locked page table mapping the folio page
+ * @orig_pte: pte value at @ptep
+ *
+ * Restore a device-exclusive non-swap entry to an ordinary present pte.
+ *
+ * The folio and the page table must be locked, and MMU notifiers must have
+ * been called to invalidate any (exclusive) device mappings.
+ *
+ * Locking the folio makes sure that anybody who just converted the pte to
+ * a device-exclusive entry can map it into the device to make forward
+ * progress without others converting it back until the folio is unlocked.
+ *
+ * If the folio lock ever becomes an issue, we can stop relying on the folio
+ * lock; it might make some scenarios with heavy thrashing less likely to
+ * make forward progress, but these scenarios might not be valid use cases.
+ *
+ * Note that the folio lock does not protect against all cases of concurrent
+ * page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers
+ * must use MMU notifiers to sync against any concurrent changes.
+ */
static void restore_exclusive_pte(struct vm_area_struct *vma,
- struct page *page, unsigned long address,
- pte_t *ptep)
+ struct folio *folio, struct page *page, unsigned long address,
+ pte_t *ptep, pte_t orig_pte)
{
- struct folio *folio = page_folio(page);
- pte_t orig_pte;
pte_t pte;
- swp_entry_t entry;
- orig_pte = ptep_get(ptep);
+ VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+
pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
if (pte_swp_soft_dirty(orig_pte))
pte = pte_mksoft_dirty(pte);
- entry = pte_to_swp_entry(orig_pte);
if (pte_swp_uffd_wp(orig_pte))
pte = pte_mkuffd_wp(pte);
- else if (is_writable_device_exclusive_entry(entry))
- pte = maybe_mkwrite(pte_mkdirty(pte), vma);
-
- VM_BUG_ON_FOLIO(pte_write(pte) && (!folio_test_anon(folio) &&
- PageAnonExclusive(page)), folio);
-
- /*
- * No need to take a page reference as one was already
- * created when the swap entry was made.
- */
- if (folio_test_anon(folio))
- folio_add_anon_rmap_pte(folio, page, vma, address, RMAP_NONE);
- else
- /*
- * Currently device exclusive access only supports anonymous
- * memory so the entry shouldn't point to a filebacked page.
- */
- WARN_ON_ONCE(1);
+ if ((vma->vm_flags & VM_WRITE) &&
+ can_change_pte_writable(vma, address, pte)) {
+ if (folio_test_dirty(folio))
+ pte = pte_mkdirty(pte);
+ pte = pte_mkwrite(pte, vma);
+ }
set_pte_at(vma->vm_mm, address, ptep, pte);
/*
@@ -764,16 +771,15 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
* Tries to restore an exclusive pte if the page lock can be acquired without
* sleeping.
*/
-static int
-try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
- unsigned long addr)
+static int try_restore_exclusive_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, pte_t orig_pte)
{
- swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte));
- struct page *page = pfn_swap_entry_to_page(entry);
+ struct page *page = pfn_swap_entry_to_page(pte_to_swp_entry(orig_pte));
+ struct folio *folio = page_folio(page);
- if (trylock_page(page)) {
- restore_exclusive_pte(vma, page, addr, src_pte);
- unlock_page(page);
+ if (folio_trylock(folio)) {
+ restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte);
+ folio_unlock(folio);
return 0;
}
@@ -853,7 +859,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
folio_get(folio);
rss[mm_counter(folio)]++;
/* Cannot fail as these pages cannot get pinned. */
- folio_try_dup_anon_rmap_pte(folio, page, src_vma);
+ folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma);
/*
* We do not preserve soft-dirty information, because so
@@ -879,7 +885,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* (ie. COW) mappings.
*/
VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
- if (try_restore_exclusive_pte(src_pte, src_vma, addr))
+ if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte))
return -EBUSY;
return -ENOENT;
} else if (is_pte_marker_entry(entry)) {
@@ -935,7 +941,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
rss[MM_ANONPAGES]++;
/* All done, just insert the new page copy in the child */
- pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot);
+ pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot);
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
/* Uffd-wp needs to be delivered to dest pte as well */
@@ -1007,14 +1013,14 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
folio_ref_add(folio, nr);
if (folio_test_anon(folio)) {
if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
- nr, src_vma))) {
+ nr, dst_vma, src_vma))) {
folio_ref_sub(folio, nr);
return -EAGAIN;
}
rss[MM_ANONPAGES] += nr;
VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
} else {
- folio_dup_file_rmap_ptes(folio, page, nr);
+ folio_dup_file_rmap_ptes(folio, page, nr, dst_vma);
rss[mm_counter_file(folio)] += nr;
}
if (any_writable)
@@ -1032,7 +1038,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* guarantee the pinned page won't be randomly replaced in the
* future.
*/
- if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
+ if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))) {
/* Page may be pinned, we have to copy. */
folio_put(folio);
err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
@@ -1042,7 +1048,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
rss[MM_ANONPAGES]++;
VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
} else {
- folio_dup_file_rmap_pte(folio, page);
+ folio_dup_file_rmap_pte(folio, page, dst_vma);
rss[mm_counter_file(folio)]++;
}
@@ -1362,12 +1368,12 @@ int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
pgd_t *src_pgd, *dst_pgd;
- unsigned long next;
unsigned long addr = src_vma->vm_start;
unsigned long end = src_vma->vm_end;
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
+ unsigned long next;
bool is_cow;
int ret;
@@ -1377,16 +1383,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
if (is_vm_hugetlb_page(src_vma))
return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
- if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
- /*
- * We do not free on error cases below as remove_vma
- * gets called on error from higher level routine
- */
- ret = track_pfn_copy(src_vma);
- if (ret)
- return ret;
- }
-
/*
* We need to invalidate the secondary MMU mappings only when
* there could be a permission downgrade on the ptes of the
@@ -1419,7 +1415,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
continue;
if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
addr, next))) {
- untrack_pfn_clear(dst_vma);
ret = -ENOMEM;
break;
}
@@ -1436,7 +1431,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
static inline bool should_zap_cows(struct zap_details *details)
{
/* By default, zap all pages */
- if (!details)
+ if (!details || details->reclaim_pt)
return true;
/* Or, we zap COWed pages only if the caller wants to */
@@ -1466,34 +1461,42 @@ static inline bool zap_drop_markers(struct zap_details *details)
/*
* This function makes sure that we'll replace the none pte with an uffd-wp
* swap special pte marker when necessary. Must be with the pgtable lock held.
+ *
+ * Returns true if uffd-wp PTEs were installed, false otherwise.
*/
-static inline void
+static inline bool
zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte, int nr,
struct zap_details *details, pte_t pteval)
{
+ bool was_installed = false;
+
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
/* Zap on anonymous always means dropping everything */
if (vma_is_anonymous(vma))
- return;
+ return false;
if (zap_drop_markers(details))
- return;
+ return false;
for (;;) {
/* the PFN in the PTE is irrelevant. */
- pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+ if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval))
+ was_installed = true;
if (--nr == 0)
break;
pte++;
addr += PAGE_SIZE;
}
+#endif
+ return was_installed;
}
static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
struct vm_area_struct *vma, struct folio *folio,
struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
unsigned long addr, struct zap_details *details, int *rss,
- bool *force_flush, bool *force_break)
+ bool *force_flush, bool *force_break, bool *any_skipped)
{
struct mm_struct *mm = tlb->mm;
bool delay_rmap = false;
@@ -1519,8 +1522,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
arch_check_zapped_pte(vma, ptent);
tlb_remove_tlb_entries(tlb, pte, nr, addr);
if (unlikely(userfaultfd_pte_wp(vma, ptent)))
- zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details,
- ptent);
+ *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte,
+ nr, details, ptent);
if (!delay_rmap) {
folio_remove_rmap_ptes(folio, page, nr, vma);
@@ -1544,7 +1547,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
unsigned int max_nr, unsigned long addr,
struct zap_details *details, int *rss, bool *force_flush,
- bool *force_break)
+ bool *force_break, bool *any_skipped)
{
const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
struct mm_struct *mm = tlb->mm;
@@ -1559,15 +1562,17 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
arch_check_zapped_pte(vma, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
if (userfaultfd_pte_wp(vma, ptent))
- zap_install_uffd_wp_if_needed(vma, addr, pte, 1,
- details, ptent);
+ *any_skipped = zap_install_uffd_wp_if_needed(vma, addr,
+ pte, 1, details, ptent);
ksm_might_unmap_zero_page(mm, ptent);
return 1;
}
folio = page_folio(page);
- if (unlikely(!should_zap_folio(details, folio)))
+ if (unlikely(!should_zap_folio(details, folio))) {
+ *any_skipped = true;
return 1;
+ }
/*
* Make sure that the common "small folio" case is as fast as possible
@@ -1579,14 +1584,120 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
addr, details, rss, force_flush,
- force_break);
+ force_break, any_skipped);
return nr;
}
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr,
- details, rss, force_flush, force_break);
+ details, rss, force_flush, force_break, any_skipped);
return 1;
}
+static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
+ unsigned int max_nr, unsigned long addr,
+ struct zap_details *details, int *rss, bool *any_skipped)
+{
+ swp_entry_t entry;
+ int nr = 1;
+
+ *any_skipped = true;
+ entry = pte_to_swp_entry(ptent);
+ if (is_device_private_entry(entry) ||
+ is_device_exclusive_entry(entry)) {
+ struct page *page = pfn_swap_entry_to_page(entry);
+ struct folio *folio = page_folio(page);
+
+ if (unlikely(!should_zap_folio(details, folio)))
+ return 1;
+ /*
+ * Both device private/exclusive mappings should only
+ * work with anonymous page so far, so we don't need to
+ * consider uffd-wp bit when zap. For more information,
+ * see zap_install_uffd_wp_if_needed().
+ */
+ WARN_ON_ONCE(!vma_is_anonymous(vma));
+ rss[mm_counter(folio)]--;
+ folio_remove_rmap_pte(folio, page, vma);
+ folio_put(folio);
+ } else if (!non_swap_entry(entry)) {
+ /* Genuine swap entries, hence private anon pages */
+ if (!should_zap_cows(details))
+ return 1;
+
+ nr = swap_pte_batch(pte, max_nr, ptent);
+ rss[MM_SWAPENTS] -= nr;
+ free_swap_and_cache_nr(entry, nr);
+ } else if (is_migration_entry(entry)) {
+ struct folio *folio = pfn_swap_entry_folio(entry);
+
+ if (!should_zap_folio(details, folio))
+ return 1;
+ rss[mm_counter(folio)]--;
+ } else if (pte_marker_entry_uffd_wp(entry)) {
+ /*
+ * For anon: always drop the marker; for file: only
+ * drop the marker if explicitly requested.
+ */
+ if (!vma_is_anonymous(vma) && !zap_drop_markers(details))
+ return 1;
+ } else if (is_guard_swp_entry(entry)) {
+ /*
+ * Ordinary zapping should not remove guard PTE
+ * markers. Only do so if we should remove PTE markers
+ * in general.
+ */
+ if (!zap_drop_markers(details))
+ return 1;
+ } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) {
+ if (!should_zap_cows(details))
+ return 1;
+ } else {
+ /* We should have covered all the swap entry types */
+ pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
+ WARN_ON_ONCE(1);
+ }
+ clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm);
+ *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
+
+ return nr;
+}
+
+static inline int do_zap_pte_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pte_t *pte,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details, int *rss,
+ bool *force_flush, bool *force_break,
+ bool *any_skipped)
+{
+ pte_t ptent = ptep_get(pte);
+ int max_nr = (end - addr) / PAGE_SIZE;
+ int nr = 0;
+
+ /* Skip all consecutive none ptes */
+ if (pte_none(ptent)) {
+ for (nr = 1; nr < max_nr; nr++) {
+ ptent = ptep_get(pte + nr);
+ if (!pte_none(ptent))
+ break;
+ }
+ max_nr -= nr;
+ if (!max_nr)
+ return nr;
+ pte += nr;
+ addr += nr * PAGE_SIZE;
+ }
+
+ if (pte_present(ptent))
+ nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr,
+ details, rss, force_flush, force_break,
+ any_skipped);
+ else
+ nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr,
+ details, rss, any_skipped);
+
+ return nr;
+}
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
@@ -1598,9 +1709,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
- swp_entry_t entry;
+ pmd_t pmdval;
+ unsigned long start = addr;
+ bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
+ bool direct_reclaim = true;
int nr;
+retry:
tlb_change_page_size(tlb, PAGE_SIZE);
init_rss_vec(rss);
start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
@@ -1610,90 +1725,35 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
- pte_t ptent = ptep_get(pte);
- struct folio *folio;
- struct page *page;
- int max_nr;
+ bool any_skipped = false;
- nr = 1;
- if (pte_none(ptent))
- continue;
-
- if (need_resched())
+ if (need_resched()) {
+ direct_reclaim = false;
break;
-
- if (pte_present(ptent)) {
- max_nr = (end - addr) / PAGE_SIZE;
- nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
- addr, details, rss, &force_flush,
- &force_break);
- if (unlikely(force_break)) {
- addr += nr * PAGE_SIZE;
- break;
- }
- continue;
}
- entry = pte_to_swp_entry(ptent);
- if (is_device_private_entry(entry) ||
- is_device_exclusive_entry(entry)) {
- page = pfn_swap_entry_to_page(entry);
- folio = page_folio(page);
- if (unlikely(!should_zap_folio(details, folio)))
- continue;
- /*
- * Both device private/exclusive mappings should only
- * work with anonymous page so far, so we don't need to
- * consider uffd-wp bit when zap. For more information,
- * see zap_install_uffd_wp_if_needed().
- */
- WARN_ON_ONCE(!vma_is_anonymous(vma));
- rss[mm_counter(folio)]--;
- if (is_device_private_entry(entry))
- folio_remove_rmap_pte(folio, page, vma);
- folio_put(folio);
- } else if (!non_swap_entry(entry)) {
- max_nr = (end - addr) / PAGE_SIZE;
- nr = swap_pte_batch(pte, max_nr, ptent);
- /* Genuine swap entries, hence a private anon pages */
- if (!should_zap_cows(details))
- continue;
- rss[MM_SWAPENTS] -= nr;
- free_swap_and_cache_nr(entry, nr);
- } else if (is_migration_entry(entry)) {
- folio = pfn_swap_entry_folio(entry);
- if (!should_zap_folio(details, folio))
- continue;
- rss[mm_counter(folio)]--;
- } else if (pte_marker_entry_uffd_wp(entry)) {
- /*
- * For anon: always drop the marker; for file: only
- * drop the marker if explicitly requested.
- */
- if (!vma_is_anonymous(vma) &&
- !zap_drop_markers(details))
- continue;
- } else if (is_guard_swp_entry(entry)) {
- /*
- * Ordinary zapping should not remove guard PTE
- * markers. Only do so if we should remove PTE markers
- * in general.
- */
- if (!zap_drop_markers(details))
- continue;
- } else if (is_hwpoison_entry(entry) ||
- is_poisoned_swp_entry(entry)) {
- if (!should_zap_cows(details))
- continue;
- } else {
- /* We should have covered all the swap entry types */
- pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
- WARN_ON_ONCE(1);
+ nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss,
+ &force_flush, &force_break, &any_skipped);
+ if (any_skipped)
+ can_reclaim_pt = false;
+ if (unlikely(force_break)) {
+ addr += nr * PAGE_SIZE;
+ direct_reclaim = false;
+ break;
}
- clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
- zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
+ /*
+ * Fast path: try to hold the pmd lock and unmap the PTE page.
+ *
+ * If the pte lock was released midway (retry case), or if the attempt
+ * to hold the pmd lock failed, then we need to recheck all pte entries
+ * to ensure they are still none, thereby preventing the pte entries
+ * from being repopulated by another thread.
+ */
+ if (can_reclaim_pt && direct_reclaim && addr == end)
+ direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);
+
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
@@ -1713,6 +1773,20 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
if (force_flush)
tlb_flush_mmu(tlb);
+ if (addr != end) {
+ cond_resched();
+ force_flush = false;
+ force_break = false;
+ goto retry;
+ }
+
+ if (can_reclaim_pt) {
+ if (direct_reclaim)
+ free_pte(mm, start, tlb, pmdval);
+ else
+ try_to_free_pte(mm, pmd, start, tlb);
+ }
+
return addr;
}
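
The loop above now delegates to do_zap_pte_range(), which first skips runs of none PTEs in one step and then batches the next run of present or non-present entries. A toy standalone model of just that batching structure (no locking, TLB, or rmap handling is modelled; the int array is a stand-in for a page table, 0 meaning pte_none):

#include <stdio.h>

#define NPTES 16

/* Batch identical neighbours, like zap_present_ptes()/swap_pte_batch(). */
static int handle_run(const int *pte, int max_nr)
{
        int nr = 1;

        while (nr < max_nr && pte[nr] == pte[0])
                nr++;
        printf("%s run of %d\n", pte[0] > 0 ? "present" : "non-present", nr);
        return nr;
}

int main(void)
{
        int ptes[NPTES] = { 0, 0, 1, 1, 1, 0, -1, -1, 0, 0, 0, 1, 0, 0, 2, 2 };
        int i = 0;

        while (i < NPTES) {
                int nr = 0;

                while (i + nr < NPTES && ptes[i + nr] == 0)     /* skip none ptes */
                        nr++;
                if (i + nr < NPTES)
                        nr += handle_run(&ptes[i + nr], NPTES - i - nr);
                i += nr;
        }
        return 0;
}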
@@ -1729,7 +1803,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
next = pmd_addr_end(addr, end);
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
- __split_huge_pmd(vma, pmd, addr, false, NULL);
+ __split_huge_pmd(vma, pmd, addr, false);
else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
addr = next;
continue;
@@ -1844,9 +1918,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
if (vma->vm_file)
uprobe_munmap(vma, start, end);
- if (unlikely(vma->vm_flags & VM_PFNMAP))
- untrack_pfn(vma, 0, 0, mm_wr_locked);
-
if (start != end) {
if (unlikely(is_vm_hugetlb_page(vma))) {
/*
@@ -1920,36 +1991,64 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
}
/**
- * zap_page_range_single - remove user pages in a given range
+ * zap_page_range_single_batched - remove user pages in a given range
+ * @tlb: pointer to the caller's struct mmu_gather
* @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
- * @size: number of bytes to zap
+ * @address: starting address of pages to remove
+ * @size: number of bytes to remove
* @details: details of shared cache invalidation
*
- * The range must fit into one VMA.
+ * @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for
+ * hugetlb, @tlb is flushed and re-initialized by this function.
*/
-void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range_single_batched(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
const unsigned long end = address + size;
struct mmu_notifier_range range;
- struct mmu_gather tlb;
- lru_add_drain();
+ VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm);
+
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
hugetlb_zap_begin(vma, &range.start, &range.end);
- tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
/*
* unmap 'address-end' not 'range.start-range.end' as range
* could have been expanded for hugetlb pmd sharing.
*/
- unmap_single_vma(&tlb, vma, address, end, details, false);
+ unmap_single_vma(tlb, vma, address, end, details, false);
mmu_notifier_invalidate_range_end(&range);
+ if (is_vm_hugetlb_page(vma)) {
+ /*
+ * Flush the TLB and free resources before hugetlb_zap_end(), to
+ * avoid allocation failures in concurrent page faults.
+ */
+ tlb_finish_mmu(tlb);
+ hugetlb_zap_end(vma, details);
+ tlb_gather_mmu(tlb, vma->vm_mm);
+ }
+}
+
+/**
+ * zap_page_range_single - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ * @details: details of shared cache invalidation
+ *
+ * The range must fit into one VMA.
+ */
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size, struct zap_details *details)
+{
+ struct mmu_gather tlb;
+
+ tlb_gather_mmu(&tlb, vma->vm_mm);
+ zap_page_range_single_batched(&tlb, vma, address, size, details);
tlb_finish_mmu(&tlb);
- hugetlb_zap_end(vma, details);
}
/**
@@ -2056,19 +2155,39 @@ static int validate_page_before_insert(struct vm_area_struct *vma,
}
static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
- unsigned long addr, struct page *page, pgprot_t prot)
+ unsigned long addr, struct page *page,
+ pgprot_t prot, bool mkwrite)
{
struct folio *folio = page_folio(page);
- pte_t pteval;
+ pte_t pteval = ptep_get(pte);
+
+ if (!pte_none(pteval)) {
+ if (!mkwrite)
+ return -EBUSY;
+
+ /* see insert_pfn(). */
+ if (pte_pfn(pteval) != page_to_pfn(page)) {
+ WARN_ON_ONCE(!is_zero_pfn(pte_pfn(pteval)));
+ return -EFAULT;
+ }
+ pteval = maybe_mkwrite(pteval, vma);
+ pteval = pte_mkyoung(pteval);
+ if (ptep_set_access_flags(vma, addr, pte, pteval, 1))
+ update_mmu_cache(vma, addr, pte);
+ return 0;
+ }
- if (!pte_none(ptep_get(pte)))
- return -EBUSY;
/* Ok, finally just insert the thing.. */
pteval = mk_pte(page, prot);
if (unlikely(is_zero_folio(folio))) {
pteval = pte_mkspecial(pteval);
} else {
folio_get(folio);
+ pteval = mk_pte(page, prot);
+ if (mkwrite) {
+ pteval = pte_mkyoung(pteval);
+ pteval = maybe_mkwrite(pte_mkdirty(pteval), vma);
+ }
inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
folio_add_file_rmap_pte(folio, page, vma);
}
@@ -2077,7 +2196,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
}
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
- struct page *page, pgprot_t prot)
+ struct page *page, pgprot_t prot, bool mkwrite)
{
int retval;
pte_t *pte;
@@ -2090,7 +2209,8 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
pte = get_locked_pte(vma->vm_mm, addr, &ptl);
if (!pte)
goto out;
- retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
+ retval = insert_page_into_pte_locked(vma, pte, addr, page, prot,
+ mkwrite);
pte_unmap_unlock(pte, ptl);
out:
return retval;
@@ -2104,7 +2224,7 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
err = validate_page_before_insert(vma, page);
if (err)
return err;
- return insert_page_into_pte_locked(vma, pte, addr, page, prot);
+ return insert_page_into_pte_locked(vma, pte, addr, page, prot, false);
}
/* insert_pages() amortizes the cost of spinlock operations
@@ -2240,7 +2360,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
BUG_ON(vma->vm_flags & VM_PFNMAP);
vm_flags_set(vma, VM_MIXEDMAP);
}
- return insert_page(vma, addr, page, vma->vm_page_prot);
+ return insert_page(vma, addr, page, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vm_insert_page);
@@ -2435,7 +2555,7 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
if (!pfn_modify_allowed(pfn, pgprot))
return VM_FAULT_SIGBUS;
- track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
+ pfnmap_setup_cachemode_pfn(pfn, &pgprot);
return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
false);
@@ -2498,7 +2618,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
- track_pfn_insert(vma, &pgprot, pfn);
+ pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot);
if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
return VM_FAULT_SIGBUS;
@@ -2520,7 +2640,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* result in pfn_t_has_page() == false.
*/
page = pfn_to_page(pfn_t_to_pfn(pfn));
- err = insert_page(vma, addr, page, pgprot);
+ err = insert_page(vma, addr, page, pgprot, mkwrite);
} else {
return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
}
@@ -2533,6 +2653,26 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
return VM_FAULT_NOPAGE;
}
+vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
+ bool write)
+{
+ pgprot_t pgprot = vmf->vma->vm_page_prot;
+ unsigned long addr = vmf->address;
+ int err;
+
+ if (addr < vmf->vma->vm_start || addr >= vmf->vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ err = insert_page(vmf->vma, addr, page, pgprot, write);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (err < 0 && err != -EBUSY)
+ return VM_FAULT_SIGBUS;
+
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite);
+
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn)
{
@@ -2723,6 +2863,36 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
return error;
}
+#ifdef __HAVE_PFNMAP_TRACKING
+static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn,
+ unsigned long size, pgprot_t *prot)
+{
+ struct pfnmap_track_ctx *ctx;
+
+ if (pfnmap_track(pfn, size, prot))
+ return ERR_PTR(-EINVAL);
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (unlikely(!ctx)) {
+ pfnmap_untrack(pfn, size);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ctx->pfn = pfn;
+ ctx->size = size;
+ kref_init(&ctx->kref);
+ return ctx;
+}
+
+void pfnmap_track_ctx_release(struct kref *ref)
+{
+ struct pfnmap_track_ctx *ctx = container_of(ref, struct pfnmap_track_ctx, kref);
+
+ pfnmap_untrack(ctx->pfn, ctx->size);
+ kfree(ctx);
+}
+#endif /* __HAVE_PFNMAP_TRACKING */
+
/**
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
@@ -2735,20 +2905,51 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
*
* Return: %0 on success, negative error code otherwise.
*/
+#ifdef __HAVE_PFNMAP_TRACKING
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
+ struct pfnmap_track_ctx *ctx = NULL;
int err;
- err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
- if (err)
+ size = PAGE_ALIGN(size);
+
+ /*
+ * If we cover the full VMA, we'll perform actual tracking, and
+ * remember to untrack when the last reference to our tracking
+ * context from a VMA goes away. We'll keep tracking the whole pfn
+ * range even during VMA splits and partial unmapping.
+ *
+ * If we only cover parts of the VMA, we'll only set up the cachemode
+ * in the pgprot for the pfn range.
+ */
+ if (addr == vma->vm_start && addr + size == vma->vm_end) {
+ if (vma->pfnmap_track_ctx)
+ return -EINVAL;
+ ctx = pfnmap_track_ctx_alloc(pfn, size, &prot);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ } else if (pfnmap_setup_cachemode(pfn, size, &prot)) {
return -EINVAL;
+ }
err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
- if (err)
- untrack_pfn(vma, pfn, PAGE_ALIGN(size), true);
+ if (ctx) {
+ if (err)
+ kref_put(&ctx->kref, pfnmap_track_ctx_release);
+ else
+ vma->pfnmap_track_ctx = ctx;
+ }
return err;
}
+
+#else
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ return remap_pfn_range_notrack(vma, addr, pfn, size, prot);
+}
+#endif
EXPORT_SYMBOL(remap_pfn_range);
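
The tracking context introduced above lives as long as any VMA still references it; only the final kref_put() triggers pfnmap_untrack(). A userspace model of that lifetime, using C11 atomics as a stand-in for kref (illustrative only, not the kernel implementation):

#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

struct track_ctx {
        unsigned long pfn, size;
        atomic_int refs;
};

static struct track_ctx *ctx_alloc(unsigned long pfn, unsigned long size)
{
        struct track_ctx *ctx = malloc(sizeof(*ctx));

        if (!ctx)
                return NULL;
        ctx->pfn = pfn;
        ctx->size = size;
        atomic_init(&ctx->refs, 1);             /* like kref_init() */
        printf("track   pfn=%#lx size=%#lx\n", pfn, size);
        return ctx;
}

static void ctx_get(struct track_ctx *ctx)      /* e.g. on VMA split */
{
        atomic_fetch_add(&ctx->refs, 1);
}

static void ctx_put(struct track_ctx *ctx)      /* e.g. on VMA teardown */
{
        if (atomic_fetch_sub(&ctx->refs, 1) == 1) {
                printf("untrack pfn=%#lx size=%#lx\n", ctx->pfn, ctx->size);
                free(ctx);
        }
}

int main(void)
{
        struct track_ctx *ctx = ctx_alloc(0x1000, 0x10000);

        ctx_get(ctx);           /* the mapping is split into two VMAs */
        ctx_put(ctx);           /* first half unmapped: still tracked */
        ctx_put(ctx);           /* second half unmapped: now untracked */
        return 0;
}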
/**
@@ -2828,11 +3029,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
if (fn) {
do {
if (create || !pte_none(ptep_get(pte))) {
- err = fn(pte++, addr, data);
+ err = fn(pte, addr, data);
if (err)
break;
}
- } while (addr += PAGE_SIZE, addr != end);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
}
*mask |= PGTBL_PTE_MODIFIED;
@@ -2971,8 +3172,10 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
next = pgd_addr_end(addr, end);
if (pgd_none(*pgd) && !create)
continue;
- if (WARN_ON_ONCE(pgd_leaf(*pgd)))
- return -EINVAL;
+ if (WARN_ON_ONCE(pgd_leaf(*pgd))) {
+ err = -EINVAL;
+ break;
+ }
if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
if (!create)
continue;
@@ -3013,7 +3216,6 @@ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
{
return __apply_to_page_range(mm, addr, size, fn, data, false);
}
-EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
/*
* handle_pte_fault chooses page fault handler according to an entry which was
@@ -3412,7 +3614,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
- entry = mk_pte(&new_folio->page, vma->vm_page_prot);
+ entry = folio_mk_pte(new_folio, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (unlikely(unshare)) {
if (pte_soft_dirty(vmf->orig_pte))
@@ -3596,19 +3798,86 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
return ret;
}
-static bool wp_can_reuse_anon_folio(struct folio *folio,
- struct vm_area_struct *vma)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
+ struct vm_area_struct *vma)
{
+ bool exclusive = false;
+
+ /* Let's just free up a large folio if only a single page is mapped. */
+ if (folio_large_mapcount(folio) <= 1)
+ return false;
+
/*
- * We could currently only reuse a subpage of a large folio if no
- * other subpages of the large folios are still mapped. However,
- * let's just consistently not reuse subpages even if we could
- * reuse in that scenario, and give back a large folio a bit
- * sooner.
+ * The assumption for anonymous folios is that each page can only get
+ * mapped once into each MM. The only exceptions are KSM folios, which
+ * are always small.
+ *
+ * Each taken mapcount must be paired with exactly one taken reference,
+ * whereby the refcount must be incremented before the mapcount when
+ * mapping a page, and the refcount must be decremented after the
+ * mapcount when unmapping a page.
+ *
+ * If all folio references are from mappings, and all mappings are in
+ * the page tables of this MM, then this folio is exclusive to this MM.
*/
- if (folio_test_large(folio))
+ if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
+ return false;
+
+ VM_WARN_ON_ONCE(folio_test_ksm(folio));
+
+ if (unlikely(folio_test_swapcache(folio))) {
+ /*
+ * Note: freeing up the swapcache will fail if some PTEs are
+ * still swap entries.
+ */
+ if (!folio_trylock(folio))
+ return false;
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ }
+
+ if (folio_large_mapcount(folio) != folio_ref_count(folio))
return false;
+ /* Stabilize the mapcount vs. refcount and recheck. */
+ folio_lock_large_mapcount(folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio);
+
+ if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
+ goto unlock;
+ if (folio_large_mapcount(folio) != folio_ref_count(folio))
+ goto unlock;
+
+ VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio);
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id &&
+ folio_mm_id(folio, 1) != vma->vm_mm->mm_id);
+
+ /*
+ * Do we need the folio lock? Likely not. If there had been
+ * references from page migration/swapout, we would have detected
+ * an additional folio reference and never ended up here.
+ */
+ exclusive = true;
+unlock:
+ folio_unlock_large_mapcount(folio);
+ return exclusive;
+}
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
+ struct vm_area_struct *vma)
+{
+ BUILD_BUG();
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static bool wp_can_reuse_anon_folio(struct folio *folio,
+ struct vm_area_struct *vma)
+{
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && folio_test_large(folio))
+ return __wp_can_reuse_large_anon_folio(folio, vma);
+
/*
* We have to verify under folio lock: these early checks are
* just an optimization to avoid locking the folio and freeing
@@ -3717,13 +3986,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
- * VM_PFNMAP VMA.
+ * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called.
*
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable and/or call ops->pfn_mkwrite.
*/
- if (!vmf->page)
+ if (!vmf->page || is_fsdax_page(vmf->page)) {
+ vmf->page = NULL;
return wp_pfn_shared(vmf);
+ }
return wp_page_shared(vmf, folio);
}
@@ -3913,7 +4184,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
folio_put(folio);
return ret;
}
- mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_CLEAR, 0,
vma->vm_mm, vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
mmu_notifier_invalidate_range_start(&range);
@@ -3921,7 +4192,8 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
- restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
+ restore_exclusive_pte(vma, folio, vmf->page, vmf->address,
+ vmf->pte, vmf->orig_pte);
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -4043,26 +4315,6 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
- struct swap_info_struct *si = swp_swap_info(entry);
- pgoff_t offset = swp_offset(entry);
- int i;
-
- /*
- * While allocating a large folio and doing swap_read_folio, which is
- * the case the being faulted pte doesn't have swapcache. We need to
- * ensure all PTEs have no cache as well, otherwise, we might go to
- * swap devices while the content is in swapcache.
- */
- for (i = 0; i < max_nr; i++) {
- if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
- return i;
- }
-
- return i;
-}
-
/*
* Check if the PTEs within a range are contiguous swap entries
* and have consistent swapcache, zeromap.
@@ -4189,8 +4441,10 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
gfp, entry))
return folio;
+ count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
folio_put(folio);
}
+ count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
order = next_order(&orders, order);
}
@@ -4267,10 +4521,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Get a page reference while we know the page can't be
* freed.
*/
- get_page(vmf->page);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
- put_page(vmf->page);
+ if (trylock_page(vmf->page)) {
+ struct dev_pagemap *pgmap;
+
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ pgmap = page_pgmap(vmf->page);
+ ret = pgmap->ops->migrate_to_ram(vmf);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ } else {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ }
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else if (is_pte_marker_entry(entry)) {
@@ -4324,7 +4586,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
}
need_clear_cache = true;
- mem_cgroup_swapin_uncharge_swap(entry, nr_pages);
+ memcg1_swapin(entry, nr_pages);
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
@@ -4388,8 +4650,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/*
* KSM sometimes has to copy on read faults, for example, if
- * page->index of !PageKSM() pages would be nonlinear inside the
- * anon VMA -- PageKSM() is lost on actual swapout.
+ * folio->index of non-ksm folios would be nonlinear inside the
+ * anon VMA -- the ksm flag is lost on actual swapout.
*/
folio = ksm_might_need_to_copy(folio, vma, vmf->address);
if (unlikely(!folio)) {
@@ -4733,12 +4995,12 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
folio_throttle_swaprate(folio, gfp);
/*
* When a folio is not zeroed during allocation
- * (__GFP_ZERO not used), folio_zero_user() is used
- * to make sure that the page corresponding to the
- * faulting address will be hot in the cache after
- * zeroing.
+ * (__GFP_ZERO not used), or when user folios require special
+ * handling, folio_zero_user() is used to make sure
+ * that the page corresponding to the faulting address
+ * will be hot in the cache after zeroing.
*/
- if (!alloc_zeroed())
+ if (user_alloc_needs_zeroing())
folio_zero_user(folio, vmf->address);
return folio;
}
@@ -4822,7 +5084,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
*/
__folio_mark_uptodate(folio);
- entry = mk_pte(&folio->page, vma->vm_page_prot);
+ entry = folio_mk_pte(folio, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry), vma);
@@ -4947,9 +5209,8 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
vmf->prealloc_pte = NULL;
}
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page)
{
- struct folio *folio = page_folio(page);
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
@@ -4997,7 +5258,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
flush_icache_pages(vma, page, HPAGE_PMD_NR);
- entry = mk_huge_pmd(page, vma->vm_page_prot);
+ entry = folio_mk_pmd(folio, vma->vm_page_prot);
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -5022,7 +5283,7 @@ out:
return ret;
}
#else
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page)
{
return VM_FAULT_FALLBACK;
}
@@ -5054,6 +5315,8 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ else if (pte_write(entry) && folio_test_dirty(folio))
+ entry = pte_mkdirty(entry);
if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
entry = pte_mkuffd_wp(entry);
/* copy-on-write page */
@@ -5102,7 +5365,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
!(vma->vm_flags & VM_SHARED);
int type, nr_pages;
- unsigned long addr = vmf->address;
+ unsigned long addr;
+ bool needs_fallback = false;
+
+fallback:
+ addr = vmf->address;
/* Did we COW the page? */
if (is_cow)
@@ -5110,6 +5377,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
else
page = vmf->page;
+ folio = page_folio(page);
/*
* check even for read faults because we might have lost our CoWed
* page
@@ -5121,8 +5389,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
}
if (pmd_none(*vmf->pmd)) {
- if (PageTransCompound(page)) {
- ret = do_set_pmd(vmf, page);
+ if (folio_test_pmd_mappable(folio)) {
+ ret = do_set_pmd(vmf, folio, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
}
@@ -5133,7 +5401,6 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
return VM_FAULT_OOM;
}
- folio = page_folio(page);
nr_pages = folio_nr_pages(folio);
/*
@@ -5141,7 +5408,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
* approach also applies to non-anonymous-shmem faults to avoid
* inflating the RSS of the process.
*/
- if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
+ if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
+ unlikely(needs_fallback)) {
nr_pages = 1;
} else if (nr_pages > 1) {
pgoff_t idx = folio_page_idx(folio, page);
@@ -5177,9 +5445,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE;
goto unlock;
} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
- update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
- ret = VM_FAULT_NOPAGE;
- goto unlock;
+ needs_fallback = true;
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto fallback;
}
folio_ref_add(folio, nr_pages - 1);
@@ -5488,7 +5756,7 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
* Flag if the folio is shared between multiple address spaces. This
* is later used when determining whether to group tasks together
*/
- if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
+ if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
*flags |= TNF_SHARED;
/*
* For memory tiering mode, cpupid of slow memory page is used
@@ -5625,7 +5893,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
ignore_writable = true;
/* Migrate to the requested node */
- if (!migrate_misplaced_folio(folio, vma, target_nid)) {
+ if (!migrate_misplaced_folio(folio, target_nid)) {
nid = target_nid;
flags |= TNF_MIGRATED;
task_numa_fault(last_cpupid, nid, nr_pages, flags);
@@ -5696,7 +5964,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
split:
/* COW or write-notify handled on pte level: split pmd. */
- __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false);
return VM_FAULT_FALLBACK;
}
@@ -6069,7 +6337,8 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
}
/*
- * By the time we get here, we already hold the mm semaphore
+ * By the time we get here, we already hold either the VMA lock or the
+ * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which).
*
* The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __folio_lock_or_retry().
@@ -6141,173 +6410,6 @@ out:
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
-#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
-#include <linux/extable.h>
-
-static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
-{
- if (likely(mmap_read_trylock(mm)))
- return true;
-
- if (regs && !user_mode(regs)) {
- unsigned long ip = exception_ip(regs);
- if (!search_exception_tables(ip))
- return false;
- }
-
- return !mmap_read_lock_killable(mm);
-}
-
-static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
-{
- /*
- * We don't have this operation yet.
- *
- * It should be easy enough to do: it's basically a
- * atomic_long_try_cmpxchg_acquire()
- * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
- * it also needs the proper lockdep magic etc.
- */
- return false;
-}
-
-static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
-{
- mmap_read_unlock(mm);
- if (regs && !user_mode(regs)) {
- unsigned long ip = exception_ip(regs);
- if (!search_exception_tables(ip))
- return false;
- }
- return !mmap_write_lock_killable(mm);
-}
-
-/*
- * Helper for page fault handling.
- *
- * This is kind of equivalend to "mmap_read_lock()" followed
- * by "find_extend_vma()", except it's a lot more careful about
- * the locking (and will drop the lock on failure).
- *
- * For example, if we have a kernel bug that causes a page
- * fault, we don't want to just use mmap_read_lock() to get
- * the mm lock, because that would deadlock if the bug were
- * to happen while we're holding the mm lock for writing.
- *
- * So this checks the exception tables on kernel faults in
- * order to only do this all for instructions that are actually
- * expected to fault.
- *
- * We can also actually take the mm lock for writing if we
- * need to extend the vma, which helps the VM layer a lot.
- */
-struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
- unsigned long addr, struct pt_regs *regs)
-{
- struct vm_area_struct *vma;
-
- if (!get_mmap_lock_carefully(mm, regs))
- return NULL;
-
- vma = find_vma(mm, addr);
- if (likely(vma && (vma->vm_start <= addr)))
- return vma;
-
- /*
- * Well, dang. We might still be successful, but only
- * if we can extend a vma to do so.
- */
- if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
- mmap_read_unlock(mm);
- return NULL;
- }
-
- /*
- * We can try to upgrade the mmap lock atomically,
- * in which case we can continue to use the vma
- * we already looked up.
- *
- * Otherwise we'll have to drop the mmap lock and
- * re-take it, and also look up the vma again,
- * re-checking it.
- */
- if (!mmap_upgrade_trylock(mm)) {
- if (!upgrade_mmap_lock_carefully(mm, regs))
- return NULL;
-
- vma = find_vma(mm, addr);
- if (!vma)
- goto fail;
- if (vma->vm_start <= addr)
- goto success;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto fail;
- }
-
- if (expand_stack_locked(vma, addr))
- goto fail;
-
-success:
- mmap_write_downgrade(mm);
- return vma;
-
-fail:
- mmap_write_unlock(mm);
- return NULL;
-}
-#endif
-
-#ifdef CONFIG_PER_VMA_LOCK
-/*
- * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
- * stable and not isolated. If the VMA is not found or is being modified the
- * function returns NULL.
- */
-struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
- unsigned long address)
-{
- MA_STATE(mas, &mm->mm_mt, address, address);
- struct vm_area_struct *vma;
-
- rcu_read_lock();
-retry:
- vma = mas_walk(&mas);
- if (!vma)
- goto inval;
-
- if (!vma_start_read(vma))
- goto inval;
-
- /* Check if the VMA got isolated after we found it */
- if (vma->detached) {
- vma_end_read(vma);
- count_vm_vma_lock_event(VMA_LOCK_MISS);
- /* The area was replaced with another one */
- goto retry;
- }
- /*
- * At this point, we have a stable reference to a VMA: The VMA is
- * locked and we know it hasn't already been isolated.
- * From here on, we can access the VMA without worrying about which
- * fields are accessible for RCU readers.
- */
-
- /* Check since vm_start/vm_end might change before we lock the VMA */
- if (unlikely(address < vma->vm_start || address >= vma->vm_end))
- goto inval_end_read;
-
- rcu_read_unlock();
- return vma;
-
-inval_end_read:
- vma_end_read(vma);
-inval:
- rcu_read_unlock();
- count_vm_vma_lock_event(VMA_LOCK_ABORT);
- return NULL;
-}
-#endif /* CONFIG_PER_VMA_LOCK */
-
#ifndef __PAGETABLE_P4D_FOLDED
/*
* Allocate p4d page table.
@@ -6388,6 +6490,7 @@ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
args->lock = lock;
args->ptep = ptep;
args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT);
+ args->addr_mask = addr_mask;
args->pgprot = pgprot;
args->writable = writable;
args->special = special;
@@ -6547,7 +6650,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write)
{
resource_size_t phys_addr;
- unsigned long prot = 0;
+ pgprot_t prot = __pgprot(0);
void __iomem *maddr;
int offset = offset_in_page(addr);
int ret = -EINVAL;
@@ -6557,7 +6660,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
retry:
if (follow_pfnmap_start(&args))
return -EINVAL;
- prot = pgprot_val(args.pgprot);
+ prot = args.pgprot;
phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT;
writable = args.writable;
follow_pfnmap_end(&args);
@@ -6572,7 +6675,7 @@ retry:
if (follow_pfnmap_start(&args))
goto out_unmap;
- if ((prot != pgprot_val(args.pgprot)) ||
+ if ((pgprot_val(prot) != pgprot_val(args.pgprot)) ||
(phys_addr != (args.pfn << PAGE_SHIFT)) ||
(writable != args.writable)) {
follow_pfnmap_end(&args);
@@ -6714,6 +6817,124 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr,
}
EXPORT_SYMBOL_GPL(access_process_vm);
+#ifdef CONFIG_BPF_SYSCALL
+/*
+ * Copy a string from another process's address space as given in mm.
+ * If there is any error, return -EFAULT.
+ */
+static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ void *old_buf = buf;
+ int err = 0;
+
+ *(char *)buf = '\0';
+
+ if (mmap_read_lock_killable(mm))
+ return -EFAULT;
+
+ addr = untagged_addr_remote(mm, addr);
+
+ /* Avoid triggering the temporary warning in __get_user_pages */
+ if (!vma_lookup(mm, addr)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ while (len) {
+ int bytes, offset, retval;
+ void *maddr;
+ struct page *page;
+ struct vm_area_struct *vma = NULL;
+
+ page = get_user_page_vma_remote(mm, addr, gup_flags, &vma);
+ if (IS_ERR(page)) {
+ /*
+ * Treat as a total failure for now until we decide how
+ * to handle the CONFIG_HAVE_IOREMAP_PROT case and
+ * stack expansion.
+ */
+ *(char *)buf = '\0';
+ err = -EFAULT;
+ goto out;
+ }
+
+ bytes = len;
+ offset = addr & (PAGE_SIZE - 1);
+ if (bytes > PAGE_SIZE - offset)
+ bytes = PAGE_SIZE - offset;
+
+ maddr = kmap_local_page(page);
+ retval = strscpy(buf, maddr + offset, bytes);
+ if (retval >= 0) {
+ /* Found the end of the string */
+ buf += retval;
+ unmap_and_put_page(page, maddr);
+ break;
+ }
+
+ buf += bytes - 1;
+ /*
+ * Because strscpy always NUL terminates, we need to
+ * copy the last byte in the page if we are going to
+ * load more pages.
+ */
+ if (bytes != len) {
+ addr += bytes - 1;
+ copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1);
+ buf += 1;
+ addr += 1;
+ }
+ len -= bytes;
+
+ unmap_and_put_page(page, maddr);
+ }
+
+out:
+ mmap_read_unlock(mm);
+ if (err)
+ return err;
+ return buf - old_buf;
+}
+
+/**
+ * copy_remote_vm_str - copy a string from another process's address space.
+ * @tsk: the task of the target address space
+ * @addr: start address to read from
+ * @buf: destination buffer
+ * @len: number of bytes to copy
+ * @gup_flags: flags modifying lookup behaviour
+ *
+ * The caller must hold a reference on @mm.
+ *
+ * Return: number of bytes copied from @addr (source) to @buf (destination),
+ * not including the trailing NUL. The buffer is always guaranteed to be
+ * NUL-terminated. On any error, return -EFAULT.
+ */
+int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ if (unlikely(len == 0))
+ return 0;
+
+ mm = get_task_mm(tsk);
+ if (!mm) {
+ *(char *)buf = '\0';
+ return -EFAULT;
+ }
+
+ ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags);
+
+ mmput(mm);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(copy_remote_vm_str);
+#endif /* CONFIG_BPF_SYSCALL */
+
/*
* Print the name of a VMA.
*/
@@ -6746,10 +6967,8 @@ void __might_fault(const char *file, int line)
if (pagefault_disabled())
return;
__might_sleep(file, line);
-#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
might_lock_read(&current->mm->mmap_lock);
-#endif
}
EXPORT_SYMBOL(__might_fault);
#endif
@@ -6815,9 +7034,10 @@ static inline int process_huge_page(
return 0;
}
-static void clear_gigantic_page(struct folio *folio, unsigned long addr,
+static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint,
unsigned int nr_pages)
{
+ unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio));
int i;
might_sleep();
@@ -6851,13 +7071,14 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint)
}
static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
- unsigned long addr,
+ unsigned long addr_hint,
struct vm_area_struct *vma,
unsigned int nr_pages)
{
- int i;
+ unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst));
struct page *dst_page;
struct page *src_page;
+ int i;
for (i = 0; i < nr_pages; i++) {
dst_page = folio_page(dst, i);
@@ -6959,7 +7180,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc)
void ptlock_free(struct ptdesc *ptdesc)
{
- kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
+ if (ptdesc->ptl)
+ kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
}
#endif
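
The __copy_remote_vm_str() loop added above sizes each step so that a single kmap_local_page()/strscpy() never crosses a page boundary. A minimal standalone sketch of that sizing arithmetic (userspace C; PAGE_SIZE, the addresses and lengths are made up for illustration, and the last-byte re-copy the kernel loop performs when the string continues into the next page is omitted):

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Visit [addr, addr + len) one page at a time, the same way
 * __copy_remote_vm_str() sizes each strscpy() step. */
static void walk_chunks(unsigned long addr, unsigned long len)
{
	while (len) {
		unsigned long offset = addr & (PAGE_SIZE - 1);
		unsigned long bytes = len;

		if (bytes > PAGE_SIZE - offset)
			bytes = PAGE_SIZE - offset;

		printf("copy %4lu bytes from 0x%lx (page offset %lu)\n",
		       bytes, addr, offset);
		addr += bytes;
		len -= bytes;
	}
}

int main(void)
{
	/* A 6000-byte read starting 100 bytes before a page boundary is
	 * split into chunks of 100, 4096 and 1804 bytes. */
	walk_chunks(3 * PAGE_SIZE - 100, 6000);
	return 0;
}
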
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c43b4e7fb298..b1caedbade5b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -219,11 +219,30 @@ void put_online_mems(void)
bool movable_node_enabled = false;
-#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-int mhp_default_online_type = MMOP_OFFLINE;
-#else
-int mhp_default_online_type = MMOP_ONLINE;
-#endif
+static int mhp_default_online_type = -1;
+int mhp_get_default_online_type(void)
+{
+ if (mhp_default_online_type >= 0)
+ return mhp_default_online_type;
+
+ if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE))
+ mhp_default_online_type = MMOP_OFFLINE;
+ else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO))
+ mhp_default_online_type = MMOP_ONLINE;
+ else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL))
+ mhp_default_online_type = MMOP_ONLINE_KERNEL;
+ else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE))
+ mhp_default_online_type = MMOP_ONLINE_MOVABLE;
+ else
+ mhp_default_online_type = MMOP_OFFLINE;
+
+ return mhp_default_online_type;
+}
+
+void mhp_set_default_online_type(int online_type)
+{
+ mhp_default_online_type = online_type;
+}
static int __init setup_memhp_default_state(char *str)
{
@@ -650,6 +669,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
* this and the first chunk to online will be pageblock_nr_pages.
*/
for (pfn = start_pfn; pfn < end_pfn;) {
+ struct page *page = pfn_to_page(pfn);
int order;
/*
@@ -664,7 +684,14 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
else
order = MAX_PAGE_ORDER;
- (*online_page_callback)(pfn_to_page(pfn), order);
+ /*
+ * Exposing the page to the buddy by freeing can cause
+ * issues with debug_pagealloc enabled: some archs don't
+ * like double-unmappings. So treat them like any pages that
+ * were allocated from the buddy.
+ */
+ debug_pagealloc_map_pages(page, 1 << order);
+ (*online_page_callback)(page, order);
pfn += (1UL << order);
}
@@ -1320,7 +1347,7 @@ static int check_hotplug_memory_range(u64 start, u64 size)
static int online_memory_block(struct memory_block *mem, void *arg)
{
- mem->online_type = mhp_default_online_type;
+ mem->online_type = mhp_get_default_online_type();
return device_online(&mem->dev);
}
@@ -1567,7 +1594,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
merge_system_ram_resource(res);
/* online pages if requested */
- if (mhp_default_online_type != MMOP_OFFLINE)
+ if (mhp_get_default_online_type() != MMOP_OFFLINE)
walk_memory_blocks(start, size, NULL, online_memory_block);
return ret;
@@ -1729,12 +1756,10 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
{
unsigned long pfn;
- for (pfn = start; pfn < end; pfn++) {
+ for_each_valid_pfn(pfn, start, end) {
struct page *page;
struct folio *folio;
- if (!pfn_valid(pfn))
- continue;
page = pfn_to_page(pfn);
if (PageLRU(page))
goto found;
@@ -1778,43 +1803,32 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ for_each_valid_pfn(pfn, start_pfn, end_pfn) {
struct page *page;
- if (!pfn_valid(pfn))
- continue;
page = pfn_to_page(pfn);
folio = page_folio(page);
- /*
- * No reference or lock is held on the folio, so it might
- * be modified concurrently (e.g. split). As such,
- * folio_nr_pages() may read garbage. This is fine as the outer
- * loop will revisit the split folio later.
- */
+ if (!folio_try_get(folio))
+ continue;
+
+ if (unlikely(page_folio(page) != folio))
+ goto put_folio;
+
if (folio_test_large(folio))
pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
- /*
- * HWPoison pages have elevated reference counts so the migration would
- * fail on them. It also doesn't make any sense to migrate them in the
- * first place. Still try to unmap such a page in case it is still mapped
- * (keep the unmap as the catch all safety net).
- */
- if (folio_test_hwpoison(folio) ||
- (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
+ if (folio_contain_hwpoisoned_page(folio)) {
if (WARN_ON(folio_test_lru(folio)))
folio_isolate_lru(folio);
- if (folio_mapped(folio))
- unmap_poisoned_folio(folio, TTU_IGNORE_MLOCK);
- continue;
- }
-
- if (!folio_try_get(folio))
- continue;
+ if (folio_mapped(folio)) {
+ folio_lock(folio);
+ unmap_poisoned_folio(folio, pfn, false);
+ folio_unlock(folio);
+ }
- if (unlikely(page_folio(page) != folio))
goto put_folio;
+ }
if (!isolate_folio_to_list(folio, &source)) {
if (__ratelimit(&migrate_rs)) {
@@ -1830,7 +1844,7 @@ put_folio:
nodemask_t nmask = node_states[N_MEMORY];
struct migration_target_control mtc = {
.nmask = &nmask,
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .gfp_mask = GFP_KERNEL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
.reason = MR_MEMORY_HOTPLUG,
};
int ret;
@@ -1992,8 +2006,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
- MEMORY_OFFLINE | REPORT_FAILURE,
- GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
+ MEMORY_OFFLINE | REPORT_FAILURE);
if (ret) {
reason = "failure to isolate range";
goto failed_removal_pcplists_disabled;
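
The scan_movable_pages() and do_migrate_range() hunks above convert an open-coded "skip holes with pfn_valid()" walk into the for_each_valid_pfn() iterator. A sketch of the two equivalent loop shapes, taken from the removed and added lines (kernel C, bodies elided; illustration only, not part of the patch):

	/* Before: every pfn in the range is visited and holes skipped by hand. */
	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		/* ... operate on pfn_to_page(pfn) ... */
	}

	/* After: the iterator only yields pfns that have a struct page, so the
	 * pfn_valid() check disappears from the loop body. */
	for_each_valid_pfn(pfn, start_pfn, end_pfn) {
		/* ... operate on pfn_to_page(pfn) ... */
	}
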
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index bb37cd1a51d8..3b1dfd08338b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -109,10 +109,12 @@
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>
+#include <linux/gcd.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>
+#include <linux/memory.h>
#include "internal.h"
@@ -139,31 +141,138 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
/*
- * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
- * system-default value should be used. A NULL iw_table also denotes that
- * system-default values should be used. Until the system-default table
- * is implemented, the system-default is always 1.
- *
- * iw_table is RCU protected
+ * weightiness balances the tradeoff between small weights (cycles through nodes
+ * faster, more fair/even distribution) and large weights (smaller errors
+ * between actual bandwidth ratios and weight ratios). 32 is a number that has
+ * been found to strike a reasonable compromise between the two goals.
+ */
+static const int weightiness = 32;
+
+/*
+ * A null weighted_interleave_state is interpreted as having .mode="auto",
+ * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
+ */
+struct weighted_interleave_state {
+ bool mode_auto;
+ u8 iw_table[];
+};
+static struct weighted_interleave_state __rcu *wi_state;
+static unsigned int *node_bw_table;
+
+/*
+ * wi_state_lock protects both wi_state and node_bw_table.
+ * node_bw_table is only used by writers to update wi_state.
*/
-static u8 __rcu *iw_table;
-static DEFINE_MUTEX(iw_table_lock);
+static DEFINE_MUTEX(wi_state_lock);
static u8 get_il_weight(int node)
{
- u8 *table;
- u8 weight;
+ struct weighted_interleave_state *state;
+ u8 weight = 1;
rcu_read_lock();
- table = rcu_dereference(iw_table);
- /* if no iw_table, use system default */
- weight = table ? table[node] : 1;
- /* if value in iw_table is 0, use system default */
- weight = weight ? weight : 1;
+ state = rcu_dereference(wi_state);
+ if (state)
+ weight = state->iw_table[node];
rcu_read_unlock();
return weight;
}
+/*
+ * Convert bandwidth values into weighted interleave weights.
+ * Call with wi_state_lock held.
+ */
+static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
+{
+ u64 sum_bw = 0;
+ unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ sum_bw += bw[nid];
+
+ /* Scale bandwidths to whole numbers in the range [1, weightiness] */
+ for_each_node_state(nid, N_MEMORY) {
+ /*
+ * Try not to perform 64-bit division.
+ * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
+ * If sum_bw >= scaling_factor, then round the weight up to 1.
+ */
+ scaling_factor = weightiness * bw[nid];
+ if (bw[nid] && sum_bw < scaling_factor) {
+ cast_sum_bw = (unsigned int)sum_bw;
+ new_iw[nid] = scaling_factor / cast_sum_bw;
+ } else {
+ new_iw[nid] = 1;
+ }
+ if (!iw_gcd)
+ iw_gcd = new_iw[nid];
+ iw_gcd = gcd(iw_gcd, new_iw[nid]);
+ }
+
+ /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
+ for_each_node_state(nid, N_MEMORY)
+ new_iw[nid] /= iw_gcd;
+}
+
+int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
+{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ unsigned int *old_bw, *new_bw;
+ unsigned int bw_val;
+ int i;
+
+ bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
+ new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
+ if (!new_bw)
+ return -ENOMEM;
+
+ new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state) {
+ kfree(new_bw);
+ return -ENOMEM;
+ }
+ new_wi_state->mode_auto = true;
+ for (i = 0; i < nr_node_ids; i++)
+ new_wi_state->iw_table[i] = 1;
+
+ /*
+ * Update bandwidth info, even in manual mode. That way, when switching
+ * to auto mode in the future, iw_table can be overwritten using
+ * accurate bw data.
+ */
+ mutex_lock(&wi_state_lock);
+
+ old_bw = node_bw_table;
+ if (old_bw)
+ memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
+ new_bw[node] = bw_val;
+ node_bw_table = new_bw;
+
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (old_wi_state && !old_wi_state->mode_auto) {
+ /* Manual mode; skip reducing weights and updating wi_state */
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ goto out;
+ }
+
+ /* NULL wi_state assumes auto=true; reduce weights and update wi_state */
+ reduce_interleave_weights(new_bw, new_wi_state->iw_table);
+ rcu_assign_pointer(wi_state, new_wi_state);
+
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+out:
+ kfree(old_bw);
+ return 0;
+}
+
/**
* numa_nearest_node - Find nearest node by state
* @node: Node id to start the search
@@ -196,6 +305,37 @@ int numa_nearest_node(int node, unsigned int state)
}
EXPORT_SYMBOL_GPL(numa_nearest_node);
+/**
+ * nearest_node_nodemask - Find the node in @mask at the nearest distance
+ * from @node.
+ *
+ * @node: a valid node ID to start the search from.
+ * @mask: a pointer to a nodemask representing the allowed nodes.
+ *
+ * This function iterates over all nodes in @mask and calculates the
+ * distance from the starting @node, then it returns the node ID that is
+ * the closest to @node, or MAX_NUMNODES if no node is found.
+ *
+ * Note that @node must be a valid node ID usable with node_distance();
+ * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
+ * or unexpected behavior.
+ */
+int nearest_node_nodemask(int node, nodemask_t *mask)
+{
+ int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
+
+ for_each_node_mask(n, *mask) {
+ dist = node_distance(node, n);
+ if (dist < min_dist) {
+ min_dist = dist;
+ min_node = n;
+ }
+ }
+
+ return min_node;
+}
+EXPORT_SYMBOL_GPL(nearest_node_nodemask);
+
struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
@@ -535,6 +675,7 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
+ const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
struct vm_area_struct *vma = walk->vma;
struct folio *folio;
struct queue_pages *qp = walk->private;
@@ -542,6 +683,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
pte_t *pte, *mapped_pte;
pte_t ptent;
spinlock_t *ptl;
+ int max_nr, nr;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -555,7 +697,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
walk->action = ACTION_AGAIN;
return 0;
}
- for (; addr != end; pte++, addr += PAGE_SIZE) {
+ for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
+ max_nr = (end - addr) >> PAGE_SHIFT;
+ nr = 1;
ptent = ptep_get(pte);
if (pte_none(ptent))
continue;
@@ -567,6 +711,10 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
folio = vm_normal_folio(vma, addr, ptent);
if (!folio || folio_is_zone_device(folio))
continue;
+ if (folio_test_large(folio) && max_nr != 1)
+ nr = folio_pte_batch(folio, addr, pte, ptent,
+ max_nr, fpb_flags,
+ NULL, NULL, NULL);
/*
* vm_normal_folio() filters out zero pages, but there might
* still be reserved folios to skip, perhaps in a VDSO.
@@ -599,7 +747,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
!vma_migratable(vma) ||
!migrate_folio_add(folio, qp->pagelist, flags)) {
- qp->nr_failed++;
+ qp->nr_failed += nr;
if (strictly_unmovable(flags))
break;
}
@@ -642,12 +790,12 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
* Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
* Choosing not to migrate a shared folio is not counted as a failure.
*
- * See folio_likely_mapped_shared() on possible imprecision when we
+ * See folio_maybe_mapped_shared() on possible imprecision when we
* cannot easily detect if a folio is shared.
*/
if ((flags & MPOL_MF_MOVE_ALL) ||
- (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
- if (!isolate_hugetlb(folio, qp->pagelist))
+ (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
+ if (!folio_isolate_hugetlb(folio, qp->pagelist))
qp->nr_failed++;
unlock:
spin_unlock(ptl);
@@ -1033,10 +1181,10 @@ static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
* Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
* Choosing not to migrate a shared folio is not counted as a failure.
*
- * See folio_likely_mapped_shared() on possible imprecision when we
+ * See folio_maybe_mapped_shared() on possible imprecision when we
* cannot easily detect if a folio is shared.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || !folio_likely_mapped_shared(folio)) {
+ if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
if (folio_isolate_lru(folio)) {
list_add_tail(&folio->lru, foliolist);
node_stat_mod_folio(folio,
@@ -1080,6 +1228,10 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest,
mmap_read_lock(mm);
vma = find_vma(mm, 0);
+ if (unlikely(!vma)) {
+ mmap_read_unlock(mm);
+ return 0;
+ }
/*
* This does not migrate the range, but isolates all pages that
@@ -1979,26 +2131,28 @@ static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
+ struct weighted_interleave_state *state;
nodemask_t nodemask;
unsigned int target, nr_nodes;
- u8 *table;
+ u8 *table = NULL;
unsigned int weight_total = 0;
u8 weight;
- int nid;
+ int nid = 0;
nr_nodes = read_once_policy_nodemask(pol, &nodemask);
if (!nr_nodes)
return numa_node_id();
rcu_read_lock();
- table = rcu_dereference(iw_table);
+
+ state = rcu_dereference(wi_state);
+ /* Uninitialized wi_state means we should assume all weights are 1 */
+ if (state)
+ table = state->iw_table;
+
/* calculate the total weight */
- for_each_node_mask(nid, nodemask) {
- /* detect system default usage */
- weight = table ? table[nid] : 1;
- weight = weight ? weight : 1;
- weight_total += weight;
- }
+ for_each_node_mask(nid, nodemask)
+ weight_total += table ? table[nid] : 1;
/* Calculate the node offset based on totals */
target = ilx % weight_total;
@@ -2006,7 +2160,6 @@ static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
while (target) {
/* detect system default usage */
weight = table ? table[nid] : 1;
- weight = weight ? weight : 1;
if (target < weight)
break;
target -= weight;
@@ -2201,9 +2354,9 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
*/
preferred_gfp = gfp | __GFP_NOWARN;
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
- page = __alloc_pages_noprof(preferred_gfp, order, nid, nodemask);
+ page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
if (!page)
- page = __alloc_pages_noprof(gfp, order, nid, NULL);
+ page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
return page;
}
@@ -2218,7 +2371,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
*
* Return: The page on success or NULL if allocation fails.
*/
-struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
+static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
struct mempolicy *pol, pgoff_t ilx, int nid)
{
nodemask_t *nodemask;
@@ -2249,8 +2402,9 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
* First, try to allocate THP only on local node, but
* don't reclaim unnecessarily, just compact.
*/
- page = __alloc_pages_node_noprof(nid,
- gfp | __GFP_THISNODE | __GFP_NORETRY, order);
+ page = __alloc_frozen_pages_noprof(
+ gfp | __GFP_THISNODE | __GFP_NORETRY, order,
+ nid, NULL);
if (page || !(gfp & __GFP_DIRECT_RECLAIM))
return page;
/*
@@ -2262,9 +2416,10 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
}
}
- page = __alloc_pages_noprof(gfp, order, nid, nodemask);
+ page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
- if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) {
+ if (unlikely(pol->mode == MPOL_INTERLEAVE ||
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
if (static_branch_likely(&vm_numa_stat_key) &&
page_to_nid(page) == nid) {
@@ -2280,8 +2435,13 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
struct mempolicy *pol, pgoff_t ilx, int nid)
{
- return page_rmappable_folio(alloc_pages_mpol_noprof(gfp | __GFP_COMP,
- order, pol, ilx, nid));
+ struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
+ ilx, nid);
+ if (!page)
+ return NULL;
+
+ set_page_refcounted(page);
+ return page_rmappable_folio(page);
}
/**
@@ -2295,7 +2455,7 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
* NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
* VMA to prevent it from going away. Should be used for all allocations
* for folios that will be mapped into user space, excepting hugetlbfs, and
- * excepting where direct use of alloc_pages_mpol() is more appropriate.
+ * excepting where direct use of folio_alloc_mpol() is more appropriate.
*
* Return: The folio on success or NULL if allocation fails.
*/
@@ -2316,6 +2476,21 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct
}
EXPORT_SYMBOL(vma_alloc_folio_noprof);
+struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
+{
+ struct mempolicy *pol = &default_policy;
+
+ /*
+ * No reference counting needed for current->mempolicy
+ * nor system default_policy
+ */
+ if (!in_interrupt() && !(gfp & __GFP_THISNODE))
+ pol = get_task_policy(current);
+
+ return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
+ numa_node_id());
+}
+
/**
* alloc_pages - Allocate pages.
* @gfp: GFP flags.
@@ -2332,17 +2507,11 @@ EXPORT_SYMBOL(vma_alloc_folio_noprof);
*/
struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
{
- struct mempolicy *pol = &default_policy;
-
- /*
- * No reference counting needed for current->mempolicy
- * nor system default_policy
- */
- if (!in_interrupt() && !(gfp & __GFP_THISNODE))
- pol = get_task_policy(current);
+ struct page *page = alloc_frozen_pages_noprof(gfp, order);
- return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX,
- numa_node_id());
+ if (page)
+ set_page_refcounted(page);
+ return page;
}
EXPORT_SYMBOL(alloc_pages_noprof);
@@ -2352,7 +2521,7 @@ struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
}
EXPORT_SYMBOL(folio_alloc_noprof);
-static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
+static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
@@ -2371,13 +2540,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
if (delta) {
nr_allocated = alloc_pages_bulk_noprof(gfp,
interleave_nodes(pol), NULL,
- nr_pages_per_node + 1, NULL,
+ nr_pages_per_node + 1,
page_array);
delta--;
} else {
nr_allocated = alloc_pages_bulk_noprof(gfp,
interleave_nodes(pol), NULL,
- nr_pages_per_node, NULL, page_array);
+ nr_pages_per_node, page_array);
}
page_array += nr_allocated;
@@ -2387,17 +2556,18 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
return total_allocated;
}
-static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
+static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
+ struct weighted_interleave_state *state;
struct task_struct *me = current;
unsigned int cpuset_mems_cookie;
unsigned long total_allocated = 0;
unsigned long nr_allocated = 0;
unsigned long rounds;
unsigned long node_pages, delta;
- u8 *table, *weights, weight;
+ u8 *weights, weight;
unsigned int weight_total = 0;
unsigned long rem_pages = nr_pages;
nodemask_t nodes;
@@ -2426,7 +2596,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
if (weight && node_isset(node, nodes)) {
node_pages = min(rem_pages, weight);
nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
- NULL, page_array);
+ page_array);
page_array += nr_allocated;
total_allocated += nr_allocated;
/* if that's all the pages, no need to interleave */
@@ -2447,17 +2617,19 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
return total_allocated;
rcu_read_lock();
- table = rcu_dereference(iw_table);
- if (table)
- memcpy(weights, table, nr_node_ids);
- rcu_read_unlock();
+ state = rcu_dereference(wi_state);
+ if (state) {
+ memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
+ rcu_read_unlock();
+ } else {
+ rcu_read_unlock();
+ for (i = 0; i < nr_node_ids; i++)
+ weights[i] = 1;
+ }
/* calculate total, detect system default usage */
- for_each_node_mask(node, nodes) {
- if (!weights[node])
- weights[node] = 1;
+ for_each_node_mask(node, nodes)
weight_total += weights[node];
- }
/*
* Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
@@ -2489,7 +2661,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
if (!node_pages)
break;
nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
- NULL, page_array);
+ page_array);
page_array += nr_allocated;
total_allocated += nr_allocated;
if (total_allocated == nr_pages)
@@ -2502,7 +2674,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
return total_allocated;
}
-static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
+static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
@@ -2513,11 +2685,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
- nr_pages, NULL, page_array);
+ nr_pages, page_array);
if (nr_allocated < nr_pages)
nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
- nr_pages - nr_allocated, NULL,
+ nr_pages - nr_allocated,
page_array + nr_allocated);
return nr_allocated;
}
@@ -2528,7 +2700,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
* It can accelerate memory allocation especially interleaving
* allocate memory.
*/
-unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
+unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
unsigned long nr_pages, struct page **page_array)
{
struct mempolicy *pol = &default_policy;
@@ -2539,21 +2711,21 @@ unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
pol = get_task_policy(current);
if (pol->mode == MPOL_INTERLEAVE)
- return alloc_pages_bulk_array_interleave(gfp, pol,
+ return alloc_pages_bulk_interleave(gfp, pol,
nr_pages, page_array);
if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
- return alloc_pages_bulk_array_weighted_interleave(
+ return alloc_pages_bulk_weighted_interleave(
gfp, pol, nr_pages, page_array);
if (pol->mode == MPOL_PREFERRED_MANY)
- return alloc_pages_bulk_array_preferred_many(gfp,
+ return alloc_pages_bulk_preferred_many(gfp,
numa_node_id(), pol, nr_pages, page_array);
nid = numa_node_id();
nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
return alloc_pages_bulk_noprof(gfp, nid, nodemask,
- nr_pages, NULL, page_array);
+ nr_pages, page_array);
}
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
@@ -3368,6 +3540,14 @@ struct iw_node_attr {
int nid;
};
+struct sysfs_wi_group {
+ struct kobject wi_kobj;
+ struct mutex kobj_lock;
+ struct iw_node_attr *nattrs[];
+};
+
+static struct sysfs_wi_group *wi_group;
+
static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -3382,177 +3562,316 @@ static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
struct iw_node_attr *node_attr;
- u8 *new;
- u8 *old;
u8 weight = 0;
+ int i;
node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
- if (count == 0 || sysfs_streq(buf, ""))
- weight = 0;
- else if (kstrtou8(buf, 0, &weight))
+ if (count == 0 || sysfs_streq(buf, "") ||
+ kstrtou8(buf, 0, &weight) || weight == 0)
return -EINVAL;
- new = kzalloc(nr_node_ids, GFP_KERNEL);
- if (!new)
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state)
return -ENOMEM;
- mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
- lockdep_is_held(&iw_table_lock));
- if (old)
- memcpy(new, old, nr_node_ids);
- new[node_attr->nid] = weight;
- rcu_assign_pointer(iw_table, new);
- mutex_unlock(&iw_table_lock);
- synchronize_rcu();
- kfree(old);
+ mutex_lock(&wi_state_lock);
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (old_wi_state) {
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
+ } else {
+ for (i = 0; i < nr_node_ids; i++)
+ new_wi_state->iw_table[i] = 1;
+ }
+ new_wi_state->iw_table[node_attr->nid] = weight;
+ new_wi_state->mode_auto = false;
+
+ rcu_assign_pointer(wi_state, new_wi_state);
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
return count;
}
-static struct iw_node_attr **node_attrs;
-
-static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
- struct kobject *parent)
+static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
- if (!node_attr)
- return;
- sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
- kfree(node_attr->kobj_attr.attr.name);
- kfree(node_attr);
+ struct weighted_interleave_state *state;
+ bool wi_auto = true;
+
+ rcu_read_lock();
+ state = rcu_dereference(wi_state);
+ if (state)
+ wi_auto = state->mode_auto;
+ rcu_read_unlock();
+
+ return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
}
-static void sysfs_wi_release(struct kobject *wi_kobj)
+static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ unsigned int *bw;
+ bool input;
int i;
+ if (kstrtobool(buf, &input))
+ return -EINVAL;
+
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state)
+ return -ENOMEM;
for (i = 0; i < nr_node_ids; i++)
- sysfs_wi_node_release(node_attrs[i], wi_kobj);
- kobject_put(wi_kobj);
+ new_wi_state->iw_table[i] = 1;
+
+ mutex_lock(&wi_state_lock);
+ if (!input) {
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (!old_wi_state)
+ goto update_wi_state;
+ if (input == old_wi_state->mode_auto) {
+ mutex_unlock(&wi_state_lock);
+ return count;
+ }
+
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
+ goto update_wi_state;
+ }
+
+ bw = node_bw_table;
+ if (!bw) {
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ return -ENODEV;
+ }
+
+ new_wi_state->mode_auto = true;
+ reduce_interleave_weights(bw, new_wi_state->iw_table);
+
+update_wi_state:
+ rcu_assign_pointer(wi_state, new_wi_state);
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+ return count;
+}
+
+static void sysfs_wi_node_delete(int nid)
+{
+ struct iw_node_attr *attr;
+
+ if (nid < 0 || nid >= nr_node_ids)
+ return;
+
+ mutex_lock(&wi_group->kobj_lock);
+ attr = wi_group->nattrs[nid];
+ if (!attr) {
+ mutex_unlock(&wi_group->kobj_lock);
+ return;
+ }
+
+ wi_group->nattrs[nid] = NULL;
+ mutex_unlock(&wi_group->kobj_lock);
+
+ sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
+ kfree(attr->kobj_attr.attr.name);
+ kfree(attr);
+}
+
+static void sysfs_wi_node_delete_all(void)
+{
+ int nid;
+
+ for (nid = 0; nid < nr_node_ids; nid++)
+ sysfs_wi_node_delete(nid);
+}
+
+static void wi_state_free(void)
+{
+ struct weighted_interleave_state *old_wi_state;
+
+ mutex_lock(&wi_state_lock);
+
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (!old_wi_state) {
+ mutex_unlock(&wi_state_lock);
+ return;
+ }
+
+ rcu_assign_pointer(wi_state, NULL);
+ mutex_unlock(&wi_state_lock);
+ synchronize_rcu();
+ kfree(old_wi_state);
+}
+
+static struct kobj_attribute wi_auto_attr =
+ __ATTR(auto, 0664, weighted_interleave_auto_show,
+ weighted_interleave_auto_store);
+
+static void wi_cleanup(void)
+{
+ sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
+ sysfs_wi_node_delete_all();
+ wi_state_free();
+}
+
+static void wi_kobj_release(struct kobject *wi_kobj)
+{
+ kfree(wi_group);
}
static const struct kobj_type wi_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
- .release = sysfs_wi_release,
+ .release = wi_kobj_release,
};
-static int add_weight_node(int nid, struct kobject *wi_kobj)
+static int sysfs_wi_node_add(int nid)
{
- struct iw_node_attr *node_attr;
+ int ret;
char *name;
+ struct iw_node_attr *new_attr;
- node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
- if (!node_attr)
+ if (nid < 0 || nid >= nr_node_ids) {
+ pr_err("invalid node id: %d\n", nid);
+ return -EINVAL;
+ }
+
+ new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
+ if (!new_attr)
return -ENOMEM;
name = kasprintf(GFP_KERNEL, "node%d", nid);
if (!name) {
- kfree(node_attr);
+ kfree(new_attr);
return -ENOMEM;
}
- sysfs_attr_init(&node_attr->kobj_attr.attr);
- node_attr->kobj_attr.attr.name = name;
- node_attr->kobj_attr.attr.mode = 0644;
- node_attr->kobj_attr.show = node_show;
- node_attr->kobj_attr.store = node_store;
- node_attr->nid = nid;
+ sysfs_attr_init(&new_attr->kobj_attr.attr);
+ new_attr->kobj_attr.attr.name = name;
+ new_attr->kobj_attr.attr.mode = 0644;
+ new_attr->kobj_attr.show = node_show;
+ new_attr->kobj_attr.store = node_store;
+ new_attr->nid = nid;
- if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
- kfree(node_attr->kobj_attr.attr.name);
- kfree(node_attr);
- pr_err("failed to add attribute to weighted_interleave\n");
- return -ENOMEM;
+ mutex_lock(&wi_group->kobj_lock);
+ if (wi_group->nattrs[nid]) {
+ mutex_unlock(&wi_group->kobj_lock);
+ ret = -EEXIST;
+ goto out;
}
- node_attrs[nid] = node_attr;
+ ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
+ if (ret) {
+ mutex_unlock(&wi_group->kobj_lock);
+ goto out;
+ }
+ wi_group->nattrs[nid] = new_attr;
+ mutex_unlock(&wi_group->kobj_lock);
return 0;
+
+out:
+ kfree(new_attr->kobj_attr.attr.name);
+ kfree(new_attr);
+ return ret;
}
-static int add_weighted_interleave_group(struct kobject *root_kobj)
+static int wi_node_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ int err;
+ struct memory_notify *arg = data;
+ int nid = arg->status_change_nid;
+
+ if (nid < 0)
+ return NOTIFY_OK;
+
+ switch (action) {
+ case MEM_ONLINE:
+ err = sysfs_wi_node_add(nid);
+ if (err)
+ pr_err("failed to add sysfs for node%d during hotplug: %d\n",
+ nid, err);
+ break;
+ case MEM_OFFLINE:
+ sysfs_wi_node_delete(nid);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
{
- struct kobject *wi_kobj;
int nid, err;
- wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (!wi_kobj)
+ wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
+ GFP_KERNEL);
+ if (!wi_group)
return -ENOMEM;
+ mutex_init(&wi_group->kobj_lock);
- err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
+ err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
"weighted_interleave");
- if (err) {
- kfree(wi_kobj);
- return err;
- }
+ if (err)
+ goto err_put_kobj;
- for_each_node_state(nid, N_POSSIBLE) {
- err = add_weight_node(nid, wi_kobj);
+ err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
+ if (err)
+ goto err_put_kobj;
+
+ for_each_online_node(nid) {
+ if (!node_state(nid, N_MEMORY))
+ continue;
+
+ err = sysfs_wi_node_add(nid);
if (err) {
- pr_err("failed to add sysfs [node%d]\n", nid);
- break;
+ pr_err("failed to add sysfs for node%d during init: %d\n",
+ nid, err);
+ goto err_cleanup_kobj;
}
}
- if (err)
- kobject_put(wi_kobj);
- return 0;
-}
-static void mempolicy_kobj_release(struct kobject *kobj)
-{
- u8 *old;
+ hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
+ return 0;
- mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
- lockdep_is_held(&iw_table_lock));
- rcu_assign_pointer(iw_table, NULL);
- mutex_unlock(&iw_table_lock);
- synchronize_rcu();
- kfree(old);
- kfree(node_attrs);
- kfree(kobj);
+err_cleanup_kobj:
+ wi_cleanup();
+ kobject_del(&wi_group->wi_kobj);
+err_put_kobj:
+ kobject_put(&wi_group->wi_kobj);
+ return err;
}
-static const struct kobj_type mempolicy_ktype = {
- .release = mempolicy_kobj_release
-};
-
static int __init mempolicy_sysfs_init(void)
{
int err;
static struct kobject *mempolicy_kobj;
- mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
- if (!mempolicy_kobj) {
- err = -ENOMEM;
- goto err_out;
- }
-
- node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
- GFP_KERNEL);
- if (!node_attrs) {
- err = -ENOMEM;
- goto mempol_out;
- }
+ mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
+ if (!mempolicy_kobj)
+ return -ENOMEM;
- err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
- "mempolicy");
+ err = add_weighted_interleave_group(mempolicy_kobj);
if (err)
- goto node_out;
+ goto err_kobj;
- err = add_weighted_interleave_group(mempolicy_kobj);
- if (err) {
- pr_err("mempolicy sysfs structure failed to initialize\n");
- kobject_put(mempolicy_kobj);
- return err;
- }
+ return 0;
- return err;
-node_out:
- kfree(node_attrs);
-mempol_out:
- kfree(mempolicy_kobj);
-err_out:
- pr_err("failed to add mempolicy kobject to the system\n");
+err_kobj:
+ kobject_del(mempolicy_kobj);
+ kobject_put(mempolicy_kobj);
return err;
}
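
To make the bandwidth-to-weight reduction performed by reduce_interleave_weights() concrete, here is a standalone userspace sketch with made-up numbers (the three-node layout and the bw[] values are assumptions for illustration; the zero-bandwidth and 64-bit-division corner cases of the kernel helper are simplified away):

#include <stdio.h>

static unsigned int gcd(unsigned int a, unsigned int b)
{
	while (b) {
		unsigned int t = a % b;

		a = b;
		b = t;
	}
	return a;
}

int main(void)
{
	const unsigned int weightiness = 32;
	unsigned int bw[] = { 256, 128, 128 };	/* hypothetical node bandwidths */
	unsigned int iw[3], i, sum = 0, g = 0;

	for (i = 0; i < 3; i++)
		sum += bw[i];

	/* Scale bandwidths to the range [1, weightiness], as in the kernel. */
	for (i = 0; i < 3; i++) {
		iw[i] = bw[i] ? (weightiness * bw[i]) / sum : 1;
		if (!iw[i])
			iw[i] = 1;
		g = g ? gcd(g, iw[i]) : iw[i];
	}

	/* Reduce by the weights' GCD: 16:8:8 becomes 2:1:1. */
	for (i = 0; i < 3; i++)
		printf("node%u weight: %u\n", i, iw[i] / g);
	return 0;
}

With bandwidths of 256, 128 and 128 the raw weights come out as 16:8:8 and reduce by their GCD to 2:1:1, which is the ratio weighted interleave then distributes allocations by.
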
diff --git a/mm/memremap.c b/mm/memremap.c
index 40d4547ce514..c417c843e9b1 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -130,7 +130,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
}
mem_hotplug_done();
- untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true);
+ pfnmap_untrack(PHYS_PFN(range->start), range_len(range));
pgmap_array_delete(range);
}
@@ -211,8 +211,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
if (nid < 0)
nid = numa_mem_id();
- error = track_pfn_remap(NULL, &params->pgprot, PHYS_PFN(range->start), 0,
- range_len(range));
+ error = pfnmap_track(PHYS_PFN(range->start), range_len(range),
+ &params->pgprot);
if (error)
goto err_pfn_remap;
@@ -277,7 +277,7 @@ err_add_memory:
if (!is_private)
kasan_remove_zero_shadow(__va(range->start), range_len(range));
err_kasan:
- untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true);
+ pfnmap_untrack(PHYS_PFN(range->start), range_len(range));
err_pfn_remap:
pgmap_array_delete(range);
return error;
@@ -458,8 +458,9 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
void free_zone_device_folio(struct folio *folio)
{
- if (WARN_ON_ONCE(!folio->page.pgmap->ops ||
- !folio->page.pgmap->ops->page_free))
+ struct dev_pagemap *pgmap = folio->pgmap;
+
+ if (WARN_ON_ONCE(!pgmap))
return;
mem_cgroup_uncharge(folio);
@@ -484,19 +485,42 @@ void free_zone_device_folio(struct folio *folio)
* For other types of ZONE_DEVICE pages, migration is either
* handled differently or not done at all, so there is no need
* to clear folio->mapping.
+ *
+ * FS DAX pages clear the mapping when the folio->share count hits
+ * zero, which indicates the page has been removed from the file
+ * system mapping.
*/
- folio->mapping = NULL;
- folio->page.pgmap->ops->page_free(folio_page(folio, 0));
+ if (pgmap->type != MEMORY_DEVICE_FS_DAX &&
+ pgmap->type != MEMORY_DEVICE_GENERIC)
+ folio->mapping = NULL;
+
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_COHERENT:
+ if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free))
+ break;
+ pgmap->ops->page_free(folio_page(folio, 0));
+ put_dev_pagemap(pgmap);
+ break;
- if (folio->page.pgmap->type != MEMORY_DEVICE_PRIVATE &&
- folio->page.pgmap->type != MEMORY_DEVICE_COHERENT)
+ case MEMORY_DEVICE_GENERIC:
/*
* Reset the refcount to 1 to prepare for handing out the page
* again.
*/
folio_set_count(folio, 1);
- else
- put_dev_pagemap(folio->page.pgmap);
+ break;
+
+ case MEMORY_DEVICE_FS_DAX:
+ wake_up_var(&folio->page);
+ break;
+
+ case MEMORY_DEVICE_PCI_P2PDMA:
+ if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free))
+ break;
+ pgmap->ops->page_free(folio_page(folio, 0));
+ break;
+ }
}
void zone_device_page_init(struct page *page)
@@ -505,26 +529,8 @@ void zone_device_page_init(struct page *page)
* Drivers shouldn't be allocating pages after calling
* memunmap_pages().
*/
- WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref));
+ WARN_ON_ONCE(!percpu_ref_tryget_live(&page_pgmap(page)->ref));
set_page_count(page, 1);
lock_page(page);
}
EXPORT_SYMBOL_GPL(zone_device_page_init);
-
-#ifdef CONFIG_FS_DAX
-bool __put_devmap_managed_folio_refs(struct folio *folio, int refs)
-{
- if (folio->page.pgmap->type != MEMORY_DEVICE_FS_DAX)
- return false;
-
- /*
- * fsdax page refcounts are 1-based, rather than 0-based: if
- * refcount is 1, then the page is free and the refcount is
- * stable because nobody holds a reference on the page.
- */
- if (folio_ref_sub_return(folio, refs) == 1)
- wake_up_var(&folio->_refcount);
- return true;
-}
-EXPORT_SYMBOL(__put_devmap_managed_folio_refs);
-#endif /* CONFIG_FS_DAX */
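
free_zone_device_folio() above now dispatches on pgmap->type and, for device-private and device-coherent memory, calls the driver's ->page_free() callback; the do_swap_page() hunk in mm/memory.c likewise calls ->migrate_to_ram() with the device page already locked. A hedged sketch of the driver side those calls land in (the my_devmem_* names and the empty bodies are hypothetical; only the two callback signatures in struct dev_pagemap_ops come from include/linux/memremap.h):

#include <linux/memremap.h>
#include <linux/mm_types.h>

/* Invoked from free_zone_device_folio() when the last reference to a
 * MEMORY_DEVICE_PRIVATE page is dropped. */
static void my_devmem_page_free(struct page *page)
{
	/* Return the device block backing @page to the driver's allocator. */
}

/* Invoked from do_swap_page() when a CPU touches a device-private entry;
 * expected to migrate the data back to system RAM, typically with the
 * migrate_vma_setup()/migrate_vma_pages()/migrate_vma_finalize() helpers. */
static vm_fault_t my_devmem_migrate_to_ram(struct vm_fault *vmf)
{
	return 0;
}

static const struct dev_pagemap_ops my_devmem_ops = {
	.page_free	= my_devmem_page_free,
	.migrate_to_ram	= my_devmem_migrate_to_ram,
};
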
diff --git a/mm/migrate.c b/mm/migrate.c
index 2ce6b4b814df..8cf0f9c9599d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -50,6 +50,7 @@
#include <trace/events/migrate.h>
#include "internal.h"
+#include "swap.h"
bool isolate_movable_page(struct page *page, isolate_mode_t mode)
{
@@ -68,10 +69,6 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode)
if (!folio)
goto out;
- if (unlikely(folio_test_slab(folio)))
- goto out_putfolio;
- /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */
- smp_rmb();
/*
* Check movable flag before taking the page lock because
* we use non-atomic bitops on newly allocated page flags so
@@ -79,10 +76,6 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode)
*/
if (unlikely(!__folio_test_movable(folio)))
goto out_putfolio;
- /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */
- smp_rmb();
- if (unlikely(folio_test_slab(folio)))
- goto out_putfolio;
/*
* As movable pages are not isolated from LRU lists, concurrent
@@ -136,7 +129,7 @@ static void putback_movable_folio(struct folio *folio)
*
* This function shall be used whenever the isolated pageset has been
* built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
- * and isolate_hugetlb().
+ * and folio_isolate_hugetlb().
*/
void putback_movable_pages(struct list_head *l)
{
@@ -145,7 +138,7 @@ void putback_movable_pages(struct list_head *l)
list_for_each_entry_safe(folio, folio2, l, lru) {
if (unlikely(folio_test_hugetlb(folio))) {
- folio_putback_active_hugetlb(folio);
+ folio_putback_hugetlb(folio);
continue;
}
list_del(&folio->lru);
@@ -177,7 +170,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
bool isolated, lru;
if (folio_test_hugetlb(folio))
- return isolate_hugetlb(folio, list);
+ return folio_isolate_hugetlb(folio, list);
lru = !__folio_test_movable(folio);
if (lru)
@@ -210,7 +203,7 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
return false;
VM_BUG_ON_PAGE(!PageAnon(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
+ VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
mm_forbids_zeropage(pvmw->vma->vm_mm))
@@ -336,7 +329,7 @@ static bool remove_migration_pte(struct folio *folio,
folio_add_file_rmap_pte(folio, new, vma);
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
}
- if (vma->vm_flags & VM_LOCKED)
+ if (READ_ONCE(vma->vm_flags) & VM_LOCKED)
mlock_drain_local();
trace_remove_migration_pte(pvmw.address, pte_val(pte),
@@ -453,20 +446,6 @@ unlock:
}
#endif
-static int folio_expected_refs(struct address_space *mapping,
- struct folio *folio)
-{
- int refs = 1;
- if (!mapping)
- return refs;
-
- refs += folio_nr_pages(folio);
- if (folio_test_private(folio))
- refs++;
-
- return refs;
-}
-
/*
* Replace the folio in the mapping.
*
@@ -526,15 +505,13 @@ static int __folio_migrate_mapping(struct address_space *mapping,
if (folio_test_anon(folio) && folio_test_large(folio))
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
folio_ref_add(newfolio, nr); /* add cache reference */
- if (folio_test_swapbacked(folio)) {
+ if (folio_test_swapbacked(folio))
__folio_set_swapbacked(newfolio);
- if (folio_test_swapcache(folio)) {
- folio_set_swapcache(newfolio);
- newfolio->private = folio_get_private(folio);
- }
+ if (folio_test_swapcache(folio)) {
+ folio_set_swapcache(newfolio);
+ newfolio->private = folio_get_private(folio);
entries = nr;
} else {
- VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
entries = 1;
}
@@ -611,7 +588,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
int folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int extra_count)
{
- int expected_count = folio_expected_refs(mapping, folio) + extra_count;
+ int expected_count = folio_expected_ref_count(folio) + extra_count + 1;
if (folio_ref_count(folio) != expected_count)
return -EAGAIN;
@@ -628,7 +605,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
struct folio *dst, struct folio *src)
{
XA_STATE(xas, &mapping->i_pages, folio_index(src));
- int rc, expected_count = folio_expected_refs(mapping, src);
+ int rc, expected_count = folio_expected_ref_count(src) + 1;
if (folio_ref_count(src) != expected_count)
return -EAGAIN;
@@ -745,7 +722,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
folio_set_readahead(newfolio);
folio_copy_owner(newfolio, folio);
- pgalloc_tag_copy(newfolio, folio);
+ pgalloc_tag_swap(newfolio, folio);
mem_cgroup_migrate(folio, newfolio);
}
@@ -759,7 +736,7 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst,
struct folio *src, void *src_private,
enum migrate_mode mode)
{
- int rc, expected_count = folio_expected_refs(mapping, src);
+ int rc, expected_count = folio_expected_ref_count(src) + 1;
/* Check whether src does not have extra refs before we do more work */
if (folio_ref_count(src) != expected_count)
@@ -847,7 +824,7 @@ static int __buffer_migrate_folio(struct address_space *mapping,
return migrate_folio(mapping, dst, src, mode);
/* Check whether page does not have extra refs before we do more work */
- expected_count = folio_expected_refs(mapping, src);
+ expected_count = folio_expected_ref_count(src) + 1;
if (folio_ref_count(src) != expected_count)
return -EAGAIN;
@@ -855,9 +832,11 @@ static int __buffer_migrate_folio(struct address_space *mapping,
return -EAGAIN;
if (check_refs) {
- bool busy;
+ bool busy, migrating;
bool invalidated = false;
+ migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state);
+ VM_WARN_ON_ONCE(migrating);
recheck_buffers:
busy = false;
spin_lock(&mapping->i_private_lock);
@@ -869,12 +848,12 @@ recheck_buffers:
}
bh = bh->b_this_page;
} while (bh != head);
+ spin_unlock(&mapping->i_private_lock);
if (busy) {
if (invalidated) {
rc = -EAGAIN;
goto unlock_buffers;
}
- spin_unlock(&mapping->i_private_lock);
invalidate_bh_lrus();
invalidated = true;
goto recheck_buffers;
@@ -893,7 +872,7 @@ recheck_buffers:
unlock_buffers:
if (check_refs)
- spin_unlock(&mapping->i_private_lock);
+ clear_bit_unlock(BH_Migrate, &head->b_state);
bh = head;
do {
unlock_buffer(bh);
@@ -955,66 +934,20 @@ int filemap_migrate_folio(struct address_space *mapping,
EXPORT_SYMBOL_GPL(filemap_migrate_folio);
/*
- * Writeback a folio to clean the dirty state
- */
-static int writeout(struct address_space *mapping, struct folio *folio)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = 1,
- .range_start = 0,
- .range_end = LLONG_MAX,
- .for_reclaim = 1
- };
- int rc;
-
- if (!mapping->a_ops->writepage)
- /* No write method for the address space */
- return -EINVAL;
-
- if (!folio_clear_dirty_for_io(folio))
- /* Someone else already triggered a write */
- return -EAGAIN;
-
- /*
- * A dirty folio may imply that the underlying filesystem has
- * the folio on some queue. So the folio must be clean for
- * migration. Writeout may mean we lose the lock and the
- * folio state is no longer what we checked for earlier.
- * At this point we know that the migration attempt cannot
- * be successful.
- */
- remove_migration_ptes(folio, folio, 0);
-
- rc = mapping->a_ops->writepage(&folio->page, &wbc);
-
- if (rc != AOP_WRITEPAGE_ACTIVATE)
- /* unlocked. Relock */
- folio_lock(folio);
-
- return (rc < 0) ? -EIO : -EAGAIN;
-}
-
-/*
* Default handling if a filesystem does not provide a migration function.
*/
static int fallback_migrate_folio(struct address_space *mapping,
struct folio *dst, struct folio *src, enum migrate_mode mode)
{
- if (folio_test_dirty(src)) {
- /* Only writeback folios in full synchronous migration */
- switch (mode) {
- case MIGRATE_SYNC:
- break;
- default:
- return -EBUSY;
- }
- return writeout(mapping, src);
- }
+ WARN_ONCE(mapping->a_ops->writepages,
+ "%ps does not implement migrate_folio\n",
+ mapping->a_ops);
+ if (folio_test_dirty(src))
+ return -EBUSY;
/*
- * Buffers may be managed in a filesystem specific way.
- * We must have no buffers or drop them.
+ * Filesystem may have private data at folio->private that we
+ * can't migrate automatically.
*/
if (!filemap_release_folio(src, GFP_KERNEL))
return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
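With writeout() gone, fallback_migrate_folio() no longer writes dirty folios back as part of migration: a filesystem that implements ->writepages but not ->migrate_folio now trips the WARN_ONCE above, and its dirty folios simply fail with -EBUSY. A hedged sketch of the expected filesystem-side fix for an address space with no special folio->private handling; example_writepages is hypothetical, and the .migrate_folio line is the only point being made:

/* Sketch only (assumed fs context): wire up the generic migration helper. */
static const struct address_space_operations example_aops = {
	.dirty_folio	= filemap_dirty_folio,
	.writepages	= example_writepages,		/* hypothetical */
	.migrate_folio	= filemap_migrate_folio,	/* exported just above */
};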
@@ -1459,7 +1392,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
if (folio_ref_count(src) == 1) {
/* page was freed from under us. So we are done. */
- folio_putback_active_hugetlb(src);
+ folio_putback_hugetlb(src);
return MIGRATEPAGE_SUCCESS;
}
@@ -1542,19 +1475,19 @@ out_unlock:
folio_unlock(src);
out:
if (rc == MIGRATEPAGE_SUCCESS)
- folio_putback_active_hugetlb(src);
+ folio_putback_hugetlb(src);
else if (rc != -EAGAIN)
list_move_tail(&src->lru, ret);
/*
- * If migration was not successful and there's a freeing callback, use
- * it. Otherwise, put_page() will drop the reference grabbed during
- * isolation.
+ * If migration was not successful and there's a freeing callback,
+ * return the folio to that special allocator. Otherwise, simply drop
+ * our additional reference.
*/
if (put_new_folio)
put_new_folio(dst, private);
else
- folio_putback_active_hugetlb(dst);
+ folio_put(dst);
return rc;
}
@@ -1695,6 +1628,81 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
return nr_failed;
}
+static void migrate_folios_move(struct list_head *src_folios,
+ struct list_head *dst_folios,
+ free_folio_t put_new_folio, unsigned long private,
+ enum migrate_mode mode, int reason,
+ struct list_head *ret_folios,
+ struct migrate_pages_stats *stats,
+ int *retry, int *thp_retry, int *nr_failed,
+ int *nr_retry_pages)
+{
+ struct folio *folio, *folio2, *dst, *dst2;
+ bool is_thp;
+ int nr_pages;
+ int rc;
+
+ dst = list_first_entry(dst_folios, struct folio, lru);
+ dst2 = list_next_entry(dst, lru);
+ list_for_each_entry_safe(folio, folio2, src_folios, lru) {
+ is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
+ nr_pages = folio_nr_pages(folio);
+
+ cond_resched();
+
+ rc = migrate_folio_move(put_new_folio, private,
+ folio, dst, mode,
+ reason, ret_folios);
+ /*
+ * The rules are:
+ * Success: folio will be freed
+ * -EAGAIN: stay on the unmap_folios list
+ * Other errno: put on ret_folios list
+ */
+ switch (rc) {
+ case -EAGAIN:
+ *retry += 1;
+ *thp_retry += is_thp;
+ *nr_retry_pages += nr_pages;
+ break;
+ case MIGRATEPAGE_SUCCESS:
+ stats->nr_succeeded += nr_pages;
+ stats->nr_thp_succeeded += is_thp;
+ break;
+ default:
+ *nr_failed += 1;
+ stats->nr_thp_failed += is_thp;
+ stats->nr_failed_pages += nr_pages;
+ break;
+ }
+ dst = dst2;
+ dst2 = list_next_entry(dst, lru);
+ }
+}
+
+static void migrate_folios_undo(struct list_head *src_folios,
+ struct list_head *dst_folios,
+ free_folio_t put_new_folio, unsigned long private,
+ struct list_head *ret_folios)
+{
+ struct folio *folio, *folio2, *dst, *dst2;
+
+ dst = list_first_entry(dst_folios, struct folio, lru);
+ dst2 = list_next_entry(dst, lru);
+ list_for_each_entry_safe(folio, folio2, src_folios, lru) {
+ int old_page_state = 0;
+ struct anon_vma *anon_vma = NULL;
+
+ __migrate_folio_extract(dst, &old_page_state, &anon_vma);
+ migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
+ anon_vma, true, ret_folios);
+ list_del(&dst->lru);
+ migrate_folio_undo_dst(dst, true, put_new_folio, private);
+ dst = dst2;
+ dst2 = list_next_entry(dst, lru);
+ }
+}
+
/*
* migrate_pages_batch() first unmaps folios in the from list as many as
* possible, then move the unmapped folios.
@@ -1717,7 +1725,7 @@ static int migrate_pages_batch(struct list_head *from,
int pass = 0;
bool is_thp = false;
bool is_large = false;
- struct folio *folio, *folio2, *dst = NULL, *dst2;
+ struct folio *folio, *folio2, *dst = NULL;
int rc, rc_saved = 0, nr_pages;
LIST_HEAD(unmap_folios);
LIST_HEAD(dst_folios);
@@ -1888,42 +1896,11 @@ move:
thp_retry = 0;
nr_retry_pages = 0;
- dst = list_first_entry(&dst_folios, struct folio, lru);
- dst2 = list_next_entry(dst, lru);
- list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
- is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
- nr_pages = folio_nr_pages(folio);
-
- cond_resched();
-
- rc = migrate_folio_move(put_new_folio, private,
- folio, dst, mode,
- reason, ret_folios);
- /*
- * The rules are:
- * Success: folio will be freed
- * -EAGAIN: stay on the unmap_folios list
- * Other errno: put on ret_folios list
- */
- switch(rc) {
- case -EAGAIN:
- retry++;
- thp_retry += is_thp;
- nr_retry_pages += nr_pages;
- break;
- case MIGRATEPAGE_SUCCESS:
- stats->nr_succeeded += nr_pages;
- stats->nr_thp_succeeded += is_thp;
- break;
- default:
- nr_failed++;
- stats->nr_thp_failed += is_thp;
- stats->nr_failed_pages += nr_pages;
- break;
- }
- dst = dst2;
- dst2 = list_next_entry(dst, lru);
- }
+ /* Move the unmapped folios */
+ migrate_folios_move(&unmap_folios, &dst_folios,
+ put_new_folio, private, mode, reason,
+ ret_folios, stats, &retry, &thp_retry,
+ &nr_failed, &nr_retry_pages);
}
nr_failed += retry;
stats->nr_thp_failed += thp_retry;
@@ -1932,20 +1909,8 @@ move:
rc = rc_saved ? : nr_failed;
out:
/* Cleanup remaining folios */
- dst = list_first_entry(&dst_folios, struct folio, lru);
- dst2 = list_next_entry(dst, lru);
- list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
- int old_page_state = 0;
- struct anon_vma *anon_vma = NULL;
-
- __migrate_folio_extract(dst, &old_page_state, &anon_vma);
- migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
- anon_vma, true, ret_folios);
- list_del(&dst->lru);
- migrate_folio_undo_dst(dst, true, put_new_folio, private);
- dst = dst2;
- dst2 = list_next_entry(dst, lru);
- }
+ migrate_folios_undo(&unmap_folios, &dst_folios,
+ put_new_folio, private, ret_folios);
return rc;
}
@@ -2204,11 +2169,11 @@ static int __add_folio_for_migration(struct folio *folio, int node,
if (folio_nid(folio) == node)
return 0;
- if (folio_likely_mapped_shared(folio) && !migrate_all)
+ if (folio_maybe_mapped_shared(folio) && !migrate_all)
return -EACCES;
if (folio_test_hugetlb(folio)) {
- if (isolate_hugetlb(folio, pagelist))
+ if (folio_isolate_hugetlb(folio, pagelist))
return 1;
} else if (folio_isolate_lru(folio)) {
list_add_tail(&folio->lru, pagelist);
@@ -2629,11 +2594,10 @@ int migrate_misplaced_folio_prepare(struct folio *folio,
* processes with execute permissions as they are probably
* shared libraries.
*
- * See folio_likely_mapped_shared() on possible imprecision
+ * See folio_maybe_mapped_shared() on possible imprecision
* when we cannot easily detect if a folio is shared.
*/
- if ((vma->vm_flags & VM_EXEC) &&
- folio_likely_mapped_shared(folio))
+ if ((vma->vm_flags & VM_EXEC) && folio_maybe_mapped_shared(folio))
return -EACCES;
/*
@@ -2683,8 +2647,7 @@ int migrate_misplaced_folio_prepare(struct folio *folio,
* elevated reference count on the folio. This function will un-isolate the
* folio, dereferencing the folio before returning.
*/
-int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
- int node)
+int migrate_misplaced_folio(struct folio *folio, int node)
{
pg_data_t *pgdat = NODE_DATA(node);
int nr_remaining;
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 9cf26592ac93..3158afe7eb23 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
struct mm_walk *walk)
{
struct migrate_vma *migrate = walk->private;
+ struct folio *fault_folio = migrate->fault_page ?
+ page_folio(migrate->fault_page) : NULL;
struct vm_area_struct *vma = walk->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = start, unmapped = 0;
@@ -88,11 +90,16 @@ again:
folio_get(folio);
spin_unlock(ptl);
+ /* FIXME: we don't expect THP for fault_folio */
+ if (WARN_ON_ONCE(fault_folio == folio))
+ return migrate_vma_collect_skip(start, end,
+ walk);
if (unlikely(!folio_trylock(folio)))
return migrate_vma_collect_skip(start, end,
walk);
ret = split_folio(folio);
- folio_unlock(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
folio_put(folio);
if (ret)
return migrate_vma_collect_skip(start, end,
@@ -106,6 +113,7 @@ again:
arch_enter_lazy_mmu_mode();
for (; addr < end; addr += PAGE_SIZE, ptep++) {
+ struct dev_pagemap *pgmap;
unsigned long mpfn = 0, pfn;
struct folio *folio;
struct page *page;
@@ -133,9 +141,10 @@ again:
goto next;
page = pfn_swap_entry_to_page(entry);
+ pgmap = page_pgmap(page);
if (!(migrate->flags &
MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
- page->pgmap->owner != migrate->pgmap_owner)
+ pgmap->owner != migrate->pgmap_owner)
goto next;
mpfn = migrate_pfn(page_to_pfn(page)) |
@@ -152,12 +161,16 @@ again:
}
page = vm_normal_page(migrate->vma, addr, pte);
if (page && !is_zone_device_page(page) &&
- !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
- goto next;
- else if (page && is_device_coherent_page(page) &&
- (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
- page->pgmap->owner != migrate->pgmap_owner))
+ !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
goto next;
+ } else if (page && is_device_coherent_page(page)) {
+ pgmap = page_pgmap(page);
+
+ if (!(migrate->flags &
+ MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+ pgmap->owner != migrate->pgmap_owner)
+ goto next;
+ }
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
@@ -192,7 +205,7 @@ again:
* optimisation to avoid walking the rmap later with
* try_to_migrate().
*/
- if (folio_trylock(folio)) {
+ if (fault_folio == folio || folio_trylock(folio)) {
bool anon_exclusive;
pte_t swp_pte;
@@ -204,7 +217,8 @@ again:
if (folio_try_share_anon_rmap_pte(folio, page)) {
set_pte_at(mm, addr, ptep, pte);
- folio_unlock(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
folio_put(folio);
mpfn = 0;
goto next;
@@ -363,6 +377,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
unsigned long npages,
struct page *fault_page)
{
+ struct folio *fault_folio = fault_page ?
+ page_folio(fault_page) : NULL;
unsigned long i, restore = 0;
bool allow_drain = true;
unsigned long unmapped = 0;
@@ -427,7 +443,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
remove_migration_ptes(folio, folio, 0);
src_pfns[i] = 0;
- folio_unlock(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
folio_put(folio);
restore--;
}
@@ -536,6 +553,8 @@ int migrate_vma_setup(struct migrate_vma *args)
return -EINVAL;
if (args->fault_page && !is_device_private_page(args->fault_page))
return -EINVAL;
+ if (args->fault_page && !PageLocked(args->fault_page))
+ return -EINVAL;
memset(args->src, 0, sizeof(*args->src) * nr_pages);
args->cpages = 0;
@@ -799,19 +818,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
}
EXPORT_SYMBOL(migrate_vma_pages);
-/*
- * migrate_device_finalize() - complete page migration
- * @src_pfns: src_pfns returned from migrate_device_range()
- * @dst_pfns: array of pfns allocated by the driver to migrate memory to
- * @npages: number of pages in the range
- *
- * Completes migration of the page by removing special migration entries.
- * Drivers must ensure copying of page data is complete and visible to the CPU
- * before calling this.
- */
-void migrate_device_finalize(unsigned long *src_pfns,
- unsigned long *dst_pfns, unsigned long npages)
+static void __migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns,
+ unsigned long npages,
+ struct page *fault_page)
{
+ struct folio *fault_folio = fault_page ?
+ page_folio(fault_page) : NULL;
unsigned long i;
for (i = 0; i < npages; i++) {
@@ -824,6 +837,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
if (!page) {
if (dst) {
+ WARN_ON_ONCE(fault_folio == dst);
folio_unlock(dst);
folio_put(dst);
}
@@ -834,29 +848,43 @@ void migrate_device_finalize(unsigned long *src_pfns,
if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
if (dst) {
+ WARN_ON_ONCE(fault_folio == dst);
folio_unlock(dst);
folio_put(dst);
}
dst = src;
}
+ if (!folio_is_zone_device(dst))
+ folio_add_lru(dst);
remove_migration_ptes(src, dst, 0);
- folio_unlock(src);
-
- if (folio_is_zone_device(src))
- folio_put(src);
- else
- folio_putback_lru(src);
+ if (fault_folio != src)
+ folio_unlock(src);
+ folio_put(src);
if (dst != src) {
+ WARN_ON_ONCE(fault_folio == dst);
folio_unlock(dst);
- if (folio_is_zone_device(dst))
- folio_put(dst);
- else
- folio_putback_lru(dst);
+ folio_put(dst);
}
}
}
+
+/*
+ * migrate_device_finalize() - complete page migration
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
+ *
+ * Completes migration of the page by removing special migration entries.
+ * Drivers must ensure copying of page data is complete and visible to the CPU
+ * before calling this.
+ */
+void migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages)
+{
+ return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
+}
EXPORT_SYMBOL(migrate_device_finalize);
/**
@@ -872,10 +900,27 @@ EXPORT_SYMBOL(migrate_device_finalize);
*/
void migrate_vma_finalize(struct migrate_vma *migrate)
{
- migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
+ __migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
+ migrate->fault_page);
}
EXPORT_SYMBOL(migrate_vma_finalize);
+static unsigned long migrate_device_pfn_lock(unsigned long pfn)
+{
+ struct folio *folio;
+
+ folio = folio_get_nontail_page(pfn_to_page(pfn));
+ if (!folio)
+ return 0;
+
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ return 0;
+ }
+
+ return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+}
+
/**
* migrate_device_range() - migrate device private pfns to normal memory.
* @src_pfns: array large enough to hold migrating source device private pfns.
@@ -900,29 +945,35 @@ int migrate_device_range(unsigned long *src_pfns, unsigned long start,
{
unsigned long i, pfn;
- for (pfn = start, i = 0; i < npages; pfn++, i++) {
- struct folio *folio;
+ for (pfn = start, i = 0; i < npages; pfn++, i++)
+ src_pfns[i] = migrate_device_pfn_lock(pfn);
- folio = folio_get_nontail_page(pfn_to_page(pfn));
- if (!folio) {
- src_pfns[i] = 0;
- continue;
- }
+ migrate_device_unmap(src_pfns, npages, NULL);
- if (!folio_trylock(folio)) {
- src_pfns[i] = 0;
- folio_put(folio);
- continue;
- }
+ return 0;
+}
+EXPORT_SYMBOL(migrate_device_range);
- src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
- }
+/**
+ * migrate_device_pfns() - migrate device private pfns to normal memory.
+ * @src_pfns: pre-populated array of source device private pfns to migrate.
+ * @npages: number of pages to migrate.
+ *
+ * Similar to migrate_device_range() but supports non-contiguous pre-populated
+ * array of device pages to migrate.
+ */
+int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages)
+{
+ unsigned long i;
+
+ for (i = 0; i < npages; i++)
+ src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]);
migrate_device_unmap(src_pfns, npages, NULL);
return 0;
}
-EXPORT_SYMBOL(migrate_device_range);
+EXPORT_SYMBOL(migrate_device_pfns);
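migrate_device_pfns() accepts an already-populated, possibly non-contiguous list of device-private pfns, locks each one via migrate_device_pfn_lock() and unmaps it, after which a driver proceeds exactly as it would after migrate_device_range(). A rough driver-side sketch of that sequence under an assumed driver context; example_copy_from_device() and the GFP choice are placeholders, not part of this patch:

/* Sketch: evict a scattered set of device-private pages to system memory. */
static void example_evict(unsigned long *src_pfns, unsigned long *dst_pfns,
			  unsigned long npages)
{
	unsigned long i;

	migrate_device_pfns(src_pfns, npages);	/* lock + unmap the sources */

	for (i = 0; i < npages; i++) {
		struct page *spage = migrate_pfn_to_page(src_pfns[i]);
		struct page *dpage;

		if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = alloc_page(GFP_HIGHUSER);
		if (!dpage)
			continue;
		lock_page(dpage);
		example_copy_from_device(dpage, spage);	/* hypothetical helper */
		dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
	}

	migrate_device_pages(src_pfns, dst_pfns, npages);
	migrate_device_finalize(src_pfns, dst_pfns, npages);
}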
/*
* Migrate a device coherent folio back to normal memory. The caller should have
diff --git a/mm/mincore.c b/mm/mincore.c
index d6bd19e520fc..42d6c9c8da86 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -21,6 +21,7 @@
#include <linux/uaccess.h>
#include "swap.h"
+#include "internal.h"
static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -105,6 +106,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *ptep;
unsigned char *vec = walk->private;
int nr = (end - addr) >> PAGE_SHIFT;
+ int step, i;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -118,16 +120,26 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
walk->action = ACTION_AGAIN;
return 0;
}
- for (; addr != end; ptep++, addr += PAGE_SIZE) {
+ for (; addr != end; ptep += step, addr += step * PAGE_SIZE) {
pte_t pte = ptep_get(ptep);
+ step = 1;
/* We need to do cache lookup too for pte markers */
if (pte_none_mostly(pte))
__mincore_unmapped_range(addr, addr + PAGE_SIZE,
vma, vec);
- else if (pte_present(pte))
- *vec = 1;
- else { /* pte is a swap entry */
+ else if (pte_present(pte)) {
+ unsigned int batch = pte_batch_hint(ptep, pte);
+
+ if (batch > 1) {
+ unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
+
+ step = min_t(unsigned int, batch, max_nr);
+ }
+
+ for (i = 0; i < step; i++)
+ vec[i] = 1;
+ } else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte);
if (non_swap_entry(entry)) {
@@ -146,7 +158,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
#endif
}
}
- vec++;
+ vec += step;
}
pte_unmap_unlock(ptep - 1, ptl);
out:
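The loop above now consumes pte_batch_hint() so that a contiguously mapped range fills several vector entries per PTE lookup; the userspace-visible semantics of mincore(2) do not change. A small standalone program showing what the residency vector reports (only the output formatting assumes a sane sysconf page size):

/* Standalone mincore(2) demo; the kernel-side batching is invisible here. */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	size_t len = 16 * (size_t)psize;
	unsigned char *vec = malloc(len / psize);
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (!vec || buf == MAP_FAILED)
		return 1;

	memset(buf, 1, 4 * (size_t)psize);	/* fault in the first four pages */

	if (mincore(buf, len, vec))
		return 1;

	for (size_t i = 0; i < len / psize; i++)
		printf("page %2zu: %s\n", i,
		       (vec[i] & 1) ? "resident" : "not resident");
	return 0;
}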
@@ -239,7 +251,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
start = untagged_addr(start);
/* Check the start address: needs to be page-aligned.. */
- if (start & ~PAGE_MASK)
+ if (unlikely(start & ~PAGE_MASK))
return -EINVAL;
/* ..and we need to be passed a valid user-space range */
diff --git a/mm/mlock.c b/mm/mlock.c
index cde076fa7d5e..3cb72b579ffd 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -368,6 +368,8 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
if (is_huge_zero_pmd(*pmd))
goto out;
folio = pmd_folio(*pmd);
+ if (folio_is_zone_device(folio))
+ goto out;
if (vma->vm_flags & VM_LOCKED)
mlock_folio(folio);
else
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 24b68b425afb..f2944748f526 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -30,12 +30,29 @@
#include <linux/crash_dump.h>
#include <linux/execmem.h>
#include <linux/vmstat.h>
+#include <linux/kexec_handover.h>
+#include <linux/hugetlb.h>
#include "internal.h"
#include "slab.h"
#include "shuffle.h"
#include <asm/setup.h>
+#ifndef CONFIG_NUMA
+unsigned long max_mapnr;
+EXPORT_SYMBOL(max_mapnr);
+
+struct page *mem_map;
+EXPORT_SYMBOL(mem_map);
+#endif
+
+/*
+ * high_memory defines the upper bound on direct map memory, the end
+ * of ZONE_NORMAL.
+ */
+void *high_memory;
+EXPORT_SYMBOL(high_memory);
+
#ifdef CONFIG_DEBUG_MEMORY_INIT
int __meminitdata mminit_loglevel;
@@ -438,7 +455,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
* was requested by the user
*/
required_movablecore =
- roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+ round_up(required_movablecore, MAX_ORDER_NR_PAGES);
required_movablecore = min(totalpages, required_movablecore);
corepages = totalpages - required_movablecore;
@@ -545,11 +562,11 @@ restart:
out2:
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
- for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ for_each_node_state(nid, N_MEMORY) {
unsigned long start_pfn, end_pfn;
zone_movable_pfn[nid] =
- roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+ round_up(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
if (zone_movable_pfn[nid] >= end_pfn)
@@ -649,6 +666,28 @@ static inline void fixup_hashdist(void)
static inline void fixup_hashdist(void) {}
#endif /* CONFIG_NUMA */
+/*
+ * Initialize a reserved page unconditionally, finding its zone first.
+ */
+void __meminit __init_page_from_nid(unsigned long pfn, int nid)
+{
+ pg_data_t *pgdat;
+ int zid;
+
+ pgdat = NODE_DATA(nid);
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct zone *zone = &pgdat->node_zones[zid];
+
+ if (zone_spans_pfn(zone, pfn))
+ break;
+ }
+ __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
+
+ if (pageblock_aligned(pfn))
+ set_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE);
+}
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
{
@@ -705,26 +744,12 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static void __meminit init_reserved_page(unsigned long pfn, int nid)
+static void __meminit __init_deferred_page(unsigned long pfn, int nid)
{
- pg_data_t *pgdat;
- int zid;
-
if (early_page_initialised(pfn, nid))
return;
- pgdat = NODE_DATA(nid);
-
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- struct zone *zone = &pgdat->node_zones[zid];
-
- if (zone_spans_pfn(zone, pfn))
- break;
- }
- __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
-
- if (pageblock_aligned(pfn))
- set_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE);
+ __init_page_from_nid(pfn, nid);
}
#else
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
@@ -739,11 +764,16 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static inline void init_reserved_page(unsigned long pfn, int nid)
+static inline void __init_deferred_page(unsigned long pfn, int nid)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+void __meminit init_deferred_page(unsigned long pfn, int nid)
+{
+ __init_deferred_page(pfn, nid);
+}
+
/*
* Initialised pages do not have PageReserved set. This function is
* called for each range allocated by the bootmem allocator and
@@ -753,22 +783,19 @@ static inline void init_reserved_page(unsigned long pfn, int nid)
void __meminit reserve_bootmem_region(phys_addr_t start,
phys_addr_t end, int nid)
{
- unsigned long start_pfn = PFN_DOWN(start);
- unsigned long end_pfn = PFN_UP(end);
+ unsigned long pfn;
- for (; start_pfn < end_pfn; start_pfn++) {
- if (pfn_valid(start_pfn)) {
- struct page *page = pfn_to_page(start_pfn);
+ for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) {
+ struct page *page = pfn_to_page(pfn);
- init_reserved_page(start_pfn, nid);
+ __init_deferred_page(pfn, nid);
- /*
- * no need for atomic set_bit because the struct
- * page is not visible yet so nobody should
- * access it yet.
- */
- __SetPageReserved(page);
- }
+ /*
+ * no need for atomic set_bit because the struct
+ * page is not visible yet so nobody should
+ * access it yet.
+ */
+ __SetPageReserved(page);
}
}
@@ -804,7 +831,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
* - physical memory bank size is not necessarily the exact multiple of the
* arbitrary section size
* - early reserved memory may not be listed in memblock.memory
- * - non-memory regions covered by the contigious flatmem mapping
+ * - non-memory regions covered by the contiguous flatmem mapping
* - memory layouts defined with memmap= kernel parameter may not align
* nicely with memmap sections
*
@@ -824,11 +851,7 @@ static void __init init_unavailable_range(unsigned long spfn,
unsigned long pfn;
u64 pgcnt = 0;
- for (pfn = spfn; pfn < epfn; pfn++) {
- if (!pfn_valid(pageblock_start_pfn(pfn))) {
- pfn = pageblock_end_pfn(pfn) - 1;
- continue;
- }
+ for_each_valid_pfn(pfn, spfn, epfn) {
__init_single_page(pfn_to_page(pfn), pfn, zone, node);
__SetPageReserved(pfn_to_page(pfn));
pgcnt++;
@@ -960,19 +983,19 @@ static void __init memmap_init(void)
}
}
-#ifdef CONFIG_SPARSEMEM
/*
* Initialize the memory map for hole in the range [memory_end,
- * section_end].
+ * section_end] for SPARSEMEM and in the range [memory_end, memmap_end]
+ * for FLATMEM.
* Append the pages in this hole to the highest zone in the last
* node.
- * The call to init_unavailable_range() is outside the ifdef to
- * silence the compiler warining about zone_id set but not used;
- * for FLATMEM it is a nop anyway
*/
+#ifdef CONFIG_SPARSEMEM
end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
- if (hole_pfn < end_pfn)
+#else
+ end_pfn = round_up(end_pfn, MAX_ORDER_NR_PAGES);
#endif
+ if (hole_pfn < end_pfn)
init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}
@@ -998,7 +1021,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
* and zone_device_data. It is a bug if a ZONE_DEVICE page is
* ever freed or placed on a driver-private list.
*/
- page->pgmap = pgmap;
+ page_folio(page)->pgmap = pgmap;
page->zone_device_data = NULL;
/*
@@ -1017,12 +1040,25 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
}
/*
- * ZONE_DEVICE pages are released directly to the driver page allocator
- * which will set the page count to 1 when allocating the page.
+ * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC are released
+ * directly to the driver page allocator which will set the page count
+ * to 1 when allocating the page.
+ *
+ * MEMORY_TYPE_GENERIC and MEMORY_TYPE_FS_DAX pages automatically have
+ * their refcount reset to one whenever they are freed (ie. after
+ * their refcount drops to 0).
*/
- if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
- pgmap->type == MEMORY_DEVICE_COHERENT)
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_FS_DAX:
+ case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_COHERENT:
+ case MEMORY_DEVICE_PCI_P2PDMA:
set_page_count(page, 0);
+ break;
+
+ case MEMORY_DEVICE_GENERIC:
+ break;
+ }
}
/*
@@ -1431,7 +1467,7 @@ void __meminit init_currently_empty_zone(struct zone *zone,
#ifndef CONFIG_SPARSEMEM
/*
- * Calculate the size of the zone->blockflags rounded to an unsigned long
+ * Calculate the size of the zone->pageblock_flags rounded to an unsigned long
* Start by making sure zonesize is a multiple of pageblock_order by rounding
* up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
* round what is now in bits to nearest long in bits, then return it in
@@ -1442,10 +1478,10 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l
unsigned long usemapsize;
zonesize += zone_start_pfn & (pageblock_nr_pages-1);
- usemapsize = roundup(zonesize, pageblock_nr_pages);
+ usemapsize = round_up(zonesize, pageblock_nr_pages);
usemapsize = usemapsize >> pageblock_order;
usemapsize *= NR_PAGEBLOCK_BITS;
- usemapsize = roundup(usemapsize, BITS_PER_LONG);
+ usemapsize = round_up(usemapsize, BITS_PER_LONG);
return usemapsize / BITS_PER_BYTE;
}
@@ -1473,7 +1509,7 @@ static inline void setup_usemap(struct zone *zone) {}
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
void __init set_pageblock_order(void)
{
- unsigned int order = MAX_PAGE_ORDER;
+ unsigned int order = PAGE_BLOCK_ORDER;
/* Check that pageblock_nr_pages has not already been setup */
if (pageblock_order)
@@ -1585,13 +1621,17 @@ void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
{
void *ptr;
+ /*
+ * Kmemleak will explicitly scan mem_map by traversing all valid
+ * `struct page`s, so memblock does not need to be added to the scan list.
+ */
if (exact_nid)
ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
- MEMBLOCK_ALLOC_ACCESSIBLE,
+ MEMBLOCK_ALLOC_NOLEAKTRACE,
nid);
else
ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
- MEMBLOCK_ALLOC_ACCESSIBLE,
+ MEMBLOCK_ALLOC_NOLEAKTRACE,
nid);
if (ptr && size > 0)
@@ -1613,7 +1653,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
offset = pgdat->node_start_pfn - start;
/*
- * The zone's endpoints aren't required to be MAX_PAGE_ORDER
+ * The zone's endpoints aren't required to be MAX_PAGE_ORDER
* aligned but the node_mem_map endpoints must be in order
* for the buddy allocator to function correctly.
*/
@@ -1629,14 +1669,15 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
__func__, pgdat->node_id, (unsigned long)pgdat,
(unsigned long)pgdat->node_mem_map);
-#ifndef CONFIG_NUMA
+
/* the global mem_map is just set as node 0's */
- if (pgdat == NODE_DATA(0)) {
- mem_map = NODE_DATA(0)->node_mem_map;
- if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
- mem_map -= offset;
- }
-#endif
+ WARN_ON(pgdat != NODE_DATA(0));
+
+ mem_map = pgdat->node_mem_map;
+ if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
+ mem_map -= offset;
+
+ max_mapnr = end - start;
}
#else
static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
@@ -1743,6 +1784,27 @@ static bool arch_has_descending_max_zone_pfns(void)
return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
}
+static void __init set_high_memory(void)
+{
+ phys_addr_t highmem = memblock_end_of_DRAM();
+
+ /*
+ * Some architectures (e.g. ARM) set high_memory very early and
+ * use it in arch setup code.
+ * If an architecture already set high_memory, don't overwrite it.
+ */
+ if (high_memory)
+ return;
+
+#ifdef CONFIG_HIGHMEM
+ if (arch_has_descending_max_zone_pfns() ||
+ highmem > PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]))
+ highmem = PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]);
+#endif
+
+ high_memory = phys_to_virt(highmem - 1) + 1;
+}
+
/**
* free_area_init - Initialise all pg_data_t and zone data
* @max_zone_pfn: an array of max PFNs for each zone
@@ -1844,7 +1906,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
free_area_init_node(nid);
/*
- * No sysfs hierarcy will be created via register_one_node()
+ * No sysfs hierarchy will be created via register_one_node()
*for memory-less node because here it's not marked as N_MEMORY
*and won't be set online later. The benefit is userspace
*program won't be confused by sysfs files/directories of
@@ -1857,11 +1919,16 @@ void __init free_area_init(unsigned long *max_zone_pfn)
}
}
+ for_each_node_state(nid, N_MEMORY)
+ sparse_vmemmap_init_nid_late(nid);
+
calc_nr_kernel_pages();
memmap_init();
/* disable hash distribution for systems with a single node */
fixup_hashdist();
+
+ set_high_memory();
}
/**
@@ -2247,6 +2314,15 @@ void __init init_cma_reserved_pageblock(struct page *page)
adjust_managed_page_count(page, pageblock_nr_pages);
page_zone(page)->cma_pages += pageblock_nr_pages;
}
+/*
+ * Similar to above, but only set the migrate type and stats.
+ */
+void __init init_cma_pageblock(struct page *page)
+{
+ set_pageblock_migratetype(page, MIGRATE_CMA);
+ adjust_managed_page_count(page, pageblock_nr_pages);
+ page_zone(page)->cma_pages += pageblock_nr_pages;
+}
#endif
void set_zone_contiguous(struct zone *zone)
@@ -2271,6 +2347,31 @@ void set_zone_contiguous(struct zone *zone)
zone->contiguous = true;
}
+/*
+ * Check if a PFN range intersects multiple zones on one or more
+ * NUMA nodes. Specify the @nid argument if it is known that this
+ * PFN range is on one node, NUMA_NO_NODE otherwise.
+ */
+bool pfn_range_intersects_zones(int nid, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ struct zone *zone, *izone = NULL;
+
+ for_each_zone(zone) {
+ if (nid != NUMA_NO_NODE && zone_to_nid(zone) != nid)
+ continue;
+
+ if (zone_intersects(zone, start_pfn, nr_pages)) {
+ if (izone != NULL)
+ return true;
+ izone = zone;
+ }
+
+ }
+
+ return false;
+}
+
static void __init mem_init_print_info(void);
void __init page_alloc_init_late(void)
{
@@ -2565,12 +2666,6 @@ static void __init report_meminit(void)
stack = "all(pattern)";
else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
stack = "all(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
- stack = "byref_all(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
- stack = "byref(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
- stack = "__user(zero)";
else
stack = "off";
@@ -2632,11 +2727,22 @@ static void __init mem_init_print_info(void)
);
}
+void __init __weak arch_mm_preinit(void)
+{
+}
+
+void __init __weak mem_init(void)
+{
+}
+
/*
* Set up kernel memory allocators
*/
void __init mm_core_init(void)
{
+ arch_mm_preinit();
+ hugetlb_bootmem_alloc();
+
/* Initializations relying on SMP setup */
BUILD_BUG_ON(MAX_ZONELISTS > 2);
build_all_zonelists(NULL);
@@ -2652,6 +2758,14 @@ void __init mm_core_init(void)
report_meminit();
kmsan_init_shadow();
stack_depot_early_init();
+
+ /*
+ * KHO memory setup must happen while memblock is still active, but
+ * as close as possible to buddy initialization
+ */
+ kho_memory_init();
+
+ memblock_free_all();
mem_init();
kmem_cache_init();
/*
diff --git a/mm/mmap.c b/mm/mmap.c
index 386429f7db5a..09c563c95112 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -47,6 +47,7 @@
#include <linux/oom.h>
#include <linux/sched/mm.h>
#include <linux/ksm.h>
+#include <linux/memfd.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -110,8 +111,7 @@ static int check_brk_limits(unsigned long addr, unsigned long len)
return mlock_future_ok(current->mm, current->mm->def_flags, len)
? 0 : -EAGAIN;
}
-static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma,
- unsigned long addr, unsigned long request, unsigned long flags);
+
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long newbrk, oldbrk, origbrk;
@@ -277,8 +277,62 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode,
return true;
}
-/*
+/**
+ * do_mmap() - Perform a userland memory mapping into the current process
+ * address space of length @len with protection bits @prot, mmap flags @flags
+ * (from which VMA flags will be inferred), and any additional VMA flags to
+ * apply @vm_flags. If this is a file-backed mapping then the file is specified
+ * in @file and page offset into the file via @pgoff.
+ *
+ * This function does not perform security checks on the file and assumes, if
+ * @uf is non-NULL, the caller has provided a list head to track unmap events
+ * for userfaultfd @uf.
+ *
+ * It also simply indicates whether memory population is required by setting
+ * @populate, which must be non-NULL, expecting the caller to actually perform
+ * this task itself if appropriate.
+ *
+ * This function will invoke architecture-specific (and if provided and
+ * relevant, file system-specific) logic to determine the most appropriate
+ * unmapped area in which to place the mapping if not MAP_FIXED.
+ *
+ * Callers which require userland mmap() behaviour should invoke vm_mmap(),
+ * which is also exported for module use.
+ *
+ * Callers which require this behaviour, minus the security checks, userfaultfd
+ * and populate handling, and which handle the mmap write lock themselves,
+ * should call this function.
+ *
+ * Note that the returned address may reside within a merged VMA if an
+ * appropriate merge were to take place, so it doesn't necessarily specify the
+ * start of a VMA, rather only the start of a valid mapped range of length
+ * @len bytes, rounded down to the nearest page size.
+ *
* The caller must write-lock current->mm->mmap_lock.
+ *
+ * @file: An optional struct file pointer describing the file which is to be
+ * mapped, if a file-backed mapping.
+ * @addr: If non-zero, hints at (or if @flags has MAP_FIXED set, specifies) the
+ * address at which to perform this mapping. See mmap (2) for details. Must be
+ * page-aligned.
+ * @len: The length of the mapping. Will be page-aligned and must be at least 1
+ * page in size.
+ * @prot: Protection bits describing access required to the mapping. See mmap
+ * (2) for details.
+ * @flags: Flags specifying how the mapping should be performed, see mmap (2)
+ * for details.
+ * @vm_flags: VMA flags which should be set by default, or 0 otherwise.
+ * @pgoff: Page offset into the @file if file-backed, should be 0 otherwise.
+ * @populate: A pointer to a value which will be set to 0 if no population of
+ * the range is required, or the number of bytes to populate if it is. Must be
+ * non-NULL. See mmap (2) for details as to under what circumstances population
+ * of the range occurs.
+ * @uf: An optional pointer to a list head to track userfaultfd unmap events
+ * should unmapping events arise. If provided, it is up to the caller to manage
+ * this.
+ *
+ * Returns: Either an error, or the address at which the requested mapping has
+ * been performed.
*/
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
@@ -291,6 +345,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
*populate = 0;
+ mmap_assert_write_locked(mm);
+
if (!len)
return -EINVAL;
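The new kerneldoc above describes the in-kernel do_mmap() contract; from userland the same page rounding and population rules surface through mmap(2) and MAP_POPULATE. A minimal, runnable illustration (the length is deliberately not page-aligned; the kernel rounds it up to whole pages):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	size_t len = 3 * (size_t)psize + 123;	/* mapped as 4 full pages */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* The tail of the fourth page is accessible even though len is smaller. */
	p[4 * psize - 1] = 0;
	printf("mapped at %p, page size %ld\n", (void *)p, psize);
	return munmap(p, len);
}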
@@ -369,6 +425,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (file) {
struct inode *inode = file_inode(file);
unsigned long flags_mask;
+ int err;
if (!file_mmap_ok(file, inode, pgoff, len))
return -EOVERFLOW;
@@ -418,7 +475,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags &= ~VM_MAYEXEC;
}
- if (!file->f_op->mmap)
+ if (!file_has_valid_mmap_hooks(file))
return -ENODEV;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
@@ -427,6 +484,14 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
default:
return -EINVAL;
}
+
+ /*
+ * Check to see if we are violating any seals and update VMA
+ * flags if necessary to avoid future seal violations.
+ */
+ err = memfd_check_seals_mmap(file, &vm_flags);
+ if (err)
+ return (unsigned long)err;
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
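memfd_check_seals_mmap() is what rejects writable MAP_SHARED mappings of a write-sealed memfd at mmap time and, as the comment above says, adjusts the VMA flags to head off future seal violations. The effect is easy to observe from userspace; a small runnable check, assuming a glibc that provides memfd_create(2):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = memfd_create("sealed", MFD_ALLOW_SEALING);
	void *p;

	if (fd < 0 || ftruncate(fd, 4096) < 0 ||
	    fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) < 0)
		return 1;

	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	printf("writable MAP_SHARED: %s\n",
	       p == MAP_FAILED ? strerror(errno) : "unexpectedly succeeded");

	/* A read-only shared mapping of the sealed memfd is still allowed. */
	p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	printf("read-only MAP_SHARED: %s\n",
	       p == MAP_FAILED ? strerror(errno) : "ok");
	return 0;
}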
@@ -577,115 +642,6 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
-/**
- * unmapped_area() - Find an area between the low_limit and the high_limit with
- * the correct alignment and offset, all from @info. Note: current->mm is used
- * for the search.
- *
- * @info: The unmapped area information including the range [low_limit -
- * high_limit), the alignment offset and mask.
- *
- * Return: A memory address or -ENOMEM.
- */
-static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
-{
- unsigned long length, gap;
- unsigned long low_limit, high_limit;
- struct vm_area_struct *tmp;
- VMA_ITERATOR(vmi, current->mm, 0);
-
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask + info->start_gap;
- if (length < info->length)
- return -ENOMEM;
-
- low_limit = info->low_limit;
- if (low_limit < mmap_min_addr)
- low_limit = mmap_min_addr;
- high_limit = info->high_limit;
-retry:
- if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
- return -ENOMEM;
-
- /*
- * Adjust for the gap first so it doesn't interfere with the
- * later alignment. The first step is the minimum needed to
- * fulill the start gap, the next steps is the minimum to align
- * that. It is the minimum needed to fulill both.
- */
- gap = vma_iter_addr(&vmi) + info->start_gap;
- gap += (info->align_offset - gap) & info->align_mask;
- tmp = vma_next(&vmi);
- if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
- if (vm_start_gap(tmp) < gap + length - 1) {
- low_limit = tmp->vm_end;
- vma_iter_reset(&vmi);
- goto retry;
- }
- } else {
- tmp = vma_prev(&vmi);
- if (tmp && vm_end_gap(tmp) > gap) {
- low_limit = vm_end_gap(tmp);
- vma_iter_reset(&vmi);
- goto retry;
- }
- }
-
- return gap;
-}
-
-/**
- * unmapped_area_topdown() - Find an area between the low_limit and the
- * high_limit with the correct alignment and offset at the highest available
- * address, all from @info. Note: current->mm is used for the search.
- *
- * @info: The unmapped area information including the range [low_limit -
- * high_limit), the alignment offset and mask.
- *
- * Return: A memory address or -ENOMEM.
- */
-static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
-{
- unsigned long length, gap, gap_end;
- unsigned long low_limit, high_limit;
- struct vm_area_struct *tmp;
- VMA_ITERATOR(vmi, current->mm, 0);
-
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask + info->start_gap;
- if (length < info->length)
- return -ENOMEM;
-
- low_limit = info->low_limit;
- if (low_limit < mmap_min_addr)
- low_limit = mmap_min_addr;
- high_limit = info->high_limit;
-retry:
- if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
- return -ENOMEM;
-
- gap = vma_iter_end(&vmi) - info->length;
- gap -= (gap - info->align_offset) & info->align_mask;
- gap_end = vma_iter_end(&vmi);
- tmp = vma_next(&vmi);
- if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
- if (vm_start_gap(tmp) < gap_end) {
- high_limit = vm_start_gap(tmp);
- vma_iter_reset(&vmi);
- goto retry;
- }
- } else {
- tmp = vma_prev(&vmi);
- if (tmp && vm_end_gap(tmp) > gap) {
- high_limit = tmp->vm_start;
- vma_iter_reset(&vmi);
- goto retry;
- }
- }
-
- return gap;
-}
-
/*
* Determine if the allocation needs to ensure that there is no
* existing mapping within it's guard gaps, for use as start_gap.
@@ -888,7 +844,8 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
if (get_area) {
addr = get_area(file, addr, len, pgoff, flags);
- } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)
+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file
+ && !addr /* no hint */
&& IS_ALIGNED(len, PMD_SIZE)) {
/* Ensures that larger anonymous mappings are THP aligned. */
addr = thp_get_unmapped_area_vmflags(file, addr, len,
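After this hunk the implicit PMD alignment is applied only to anonymous requests with no file, no address hint and a PMD-multiple length, so file mappings and hinted mappings keep their previous placement. A quick userspace probe; the alignment is opportunistic, so the answer may legitimately be "no" on kernels or configurations without transparent hugepages:

#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>

#define SZ_2M	(2UL * 1024 * 1024)

int main(void)
{
	/* No hint, anonymous, length is a PMD multiple (assuming 2 MiB PMDs). */
	void *p = mmap(NULL, 4 * SZ_2M, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	printf("addr %p, 2 MiB aligned: %s\n", p,
	       ((uintptr_t)p % SZ_2M) ? "no" : "yes");
	return munmap(p, 4 * SZ_2M);
}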
@@ -984,211 +941,6 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
return vma;
}
-/*
- * Verify that the stack growth is acceptable and
- * update accounting. This is shared with both the
- * grow-up and grow-down cases.
- */
-static int acct_stack_growth(struct vm_area_struct *vma,
- unsigned long size, unsigned long grow)
-{
- struct mm_struct *mm = vma->vm_mm;
- unsigned long new_start;
-
- /* address space limit tests */
- if (!may_expand_vm(mm, vma->vm_flags, grow))
- return -ENOMEM;
-
- /* Stack limit test */
- if (size > rlimit(RLIMIT_STACK))
- return -ENOMEM;
-
- /* mlock limit tests */
- if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
- return -ENOMEM;
-
- /* Check to ensure the stack will not grow into a hugetlb-only region */
- new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
- vma->vm_end - size;
- if (is_hugepage_only_range(vma->vm_mm, new_start, size))
- return -EFAULT;
-
- /*
- * Overcommit.. This must be the final test, as it will
- * update security statistics.
- */
- if (security_vm_enough_memory_mm(mm, grow))
- return -ENOMEM;
-
- return 0;
-}
-
-#if defined(CONFIG_STACK_GROWSUP)
-/*
- * PA-RISC uses this for its stack.
- * vma is the last one with address > vma->vm_end. Have to extend vma.
- */
-static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
-{
- struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *next;
- unsigned long gap_addr;
- int error = 0;
- VMA_ITERATOR(vmi, mm, vma->vm_start);
-
- if (!(vma->vm_flags & VM_GROWSUP))
- return -EFAULT;
-
- mmap_assert_write_locked(mm);
-
- /* Guard against exceeding limits of the address space. */
- address &= PAGE_MASK;
- if (address >= (TASK_SIZE & PAGE_MASK))
- return -ENOMEM;
- address += PAGE_SIZE;
-
- /* Enforce stack_guard_gap */
- gap_addr = address + stack_guard_gap;
-
- /* Guard against overflow */
- if (gap_addr < address || gap_addr > TASK_SIZE)
- gap_addr = TASK_SIZE;
-
- next = find_vma_intersection(mm, vma->vm_end, gap_addr);
- if (next && vma_is_accessible(next)) {
- if (!(next->vm_flags & VM_GROWSUP))
- return -ENOMEM;
- /* Check that both stack segments have the same anon_vma? */
- }
-
- if (next)
- vma_iter_prev_range_limit(&vmi, address);
-
- vma_iter_config(&vmi, vma->vm_start, address);
- if (vma_iter_prealloc(&vmi, vma))
- return -ENOMEM;
-
- /* We must make sure the anon_vma is allocated. */
- if (unlikely(anon_vma_prepare(vma))) {
- vma_iter_free(&vmi);
- return -ENOMEM;
- }
-
- /* Lock the VMA before expanding to prevent concurrent page faults */
- vma_start_write(vma);
- /* We update the anon VMA tree. */
- anon_vma_lock_write(vma->anon_vma);
-
- /* Somebody else might have raced and expanded it already */
- if (address > vma->vm_end) {
- unsigned long size, grow;
-
- size = address - vma->vm_start;
- grow = (address - vma->vm_end) >> PAGE_SHIFT;
-
- error = -ENOMEM;
- if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
- error = acct_stack_growth(vma, size, grow);
- if (!error) {
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
- vm_stat_account(mm, vma->vm_flags, grow);
- anon_vma_interval_tree_pre_update_vma(vma);
- vma->vm_end = address;
- /* Overwrite old entry in mtree. */
- vma_iter_store(&vmi, vma);
- anon_vma_interval_tree_post_update_vma(vma);
-
- perf_event_mmap(vma);
- }
- }
- }
- anon_vma_unlock_write(vma->anon_vma);
- vma_iter_free(&vmi);
- validate_mm(mm);
- return error;
-}
-#endif /* CONFIG_STACK_GROWSUP */
-
-/*
- * vma is the first one with address < vma->vm_start. Have to extend vma.
- * mmap_lock held for writing.
- */
-int expand_downwards(struct vm_area_struct *vma, unsigned long address)
-{
- struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *prev;
- int error = 0;
- VMA_ITERATOR(vmi, mm, vma->vm_start);
-
- if (!(vma->vm_flags & VM_GROWSDOWN))
- return -EFAULT;
-
- mmap_assert_write_locked(mm);
-
- address &= PAGE_MASK;
- if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
- return -EPERM;
-
- /* Enforce stack_guard_gap */
- prev = vma_prev(&vmi);
- /* Check that both stack segments have the same anon_vma? */
- if (prev) {
- if (!(prev->vm_flags & VM_GROWSDOWN) &&
- vma_is_accessible(prev) &&
- (address - prev->vm_end < stack_guard_gap))
- return -ENOMEM;
- }
-
- if (prev)
- vma_iter_next_range_limit(&vmi, vma->vm_start);
-
- vma_iter_config(&vmi, address, vma->vm_end);
- if (vma_iter_prealloc(&vmi, vma))
- return -ENOMEM;
-
- /* We must make sure the anon_vma is allocated. */
- if (unlikely(anon_vma_prepare(vma))) {
- vma_iter_free(&vmi);
- return -ENOMEM;
- }
-
- /* Lock the VMA before expanding to prevent concurrent page faults */
- vma_start_write(vma);
- /* We update the anon VMA tree. */
- anon_vma_lock_write(vma->anon_vma);
-
- /* Somebody else might have raced and expanded it already */
- if (address < vma->vm_start) {
- unsigned long size, grow;
-
- size = vma->vm_end - address;
- grow = (vma->vm_start - address) >> PAGE_SHIFT;
-
- error = -ENOMEM;
- if (grow <= vma->vm_pgoff) {
- error = acct_stack_growth(vma, size, grow);
- if (!error) {
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
- vm_stat_account(mm, vma->vm_flags, grow);
- anon_vma_interval_tree_pre_update_vma(vma);
- vma->vm_start = address;
- vma->vm_pgoff -= grow;
- /* Overwrite old entry in mtree. */
- vma_iter_store(&vmi, vma);
- anon_vma_interval_tree_post_update_vma(vma);
-
- perf_event_mmap(vma);
- }
- }
- }
- anon_vma_unlock_write(vma->anon_vma);
- vma_iter_free(&vmi);
- validate_mm(mm);
- return error;
-}
-
/* enforced gap between the expanding stack and other mappings. */
unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
@@ -1320,58 +1072,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
return do_vmi_munmap(&vmi, mm, start, len, uf, false);
}
-unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
- struct list_head *uf)
-{
- unsigned long ret;
- bool writable_file_mapping = false;
-
- /* Check to see if MDWE is applicable. */
- if (map_deny_write_exec(vm_flags, vm_flags))
- return -EACCES;
-
- /* Allow architectures to sanity-check the vm_flags. */
- if (!arch_validate_flags(vm_flags))
- return -EINVAL;
-
- /* Map writable and ensure this isn't a sealed memfd. */
- if (file && is_shared_maywrite(vm_flags)) {
- int error = mapping_map_writable(file->f_mapping);
-
- if (error)
- return error;
- writable_file_mapping = true;
- }
-
- ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);
-
- /* Clear our write mapping regardless of error. */
- if (writable_file_mapping)
- mapping_unmap_writable(file->f_mapping);
-
- validate_mm(current->mm);
- return ret;
-}
-
-static int __vm_munmap(unsigned long start, size_t len, bool unlock)
-{
- int ret;
- struct mm_struct *mm = current->mm;
- LIST_HEAD(uf);
- VMA_ITERATOR(vmi, mm, start);
-
- if (mmap_write_lock_killable(mm))
- return -EINTR;
-
- ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock);
- if (ret || !unlock)
- mmap_write_unlock(mm);
-
- userfaultfd_unmap_complete(mm, &uf);
- return ret;
-}
-
int vm_munmap(unsigned long start, size_t len)
{
return __vm_munmap(start, len, false);
@@ -1507,88 +1207,6 @@ out:
return ret;
}
-/*
- * do_brk_flags() - Increase the brk vma if the flags match.
- * @vmi: The vma iterator
- * @addr: The start address
- * @len: The length of the increase
- * @vma: The vma,
- * @flags: The VMA Flags
- *
- * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
- * do not match then create a new anonymous VMA. Eventually we may be able to
- * do some brk-specific accounting here.
- */
-static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long addr, unsigned long len, unsigned long flags)
-{
- struct mm_struct *mm = current->mm;
-
- /*
- * Check against address space limits by the changed size
- * Note: This happens *after* clearing old mappings in some code paths.
- */
- flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
- if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
- return -ENOMEM;
-
- if (mm->map_count > sysctl_max_map_count)
- return -ENOMEM;
-
- if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
- return -ENOMEM;
-
- /*
- * Expand the existing vma if possible; Note that singular lists do not
- * occur after forking, so the expand will only happen on new VMAs.
- */
- if (vma && vma->vm_end == addr) {
- VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));
-
- vmg.prev = vma;
- /* vmi is positioned at prev, which this mode expects. */
- vmg.merge_flags = VMG_FLAG_JUST_EXPAND;
-
- if (vma_merge_new_range(&vmg))
- goto out;
- else if (vmg_nomem(&vmg))
- goto unacct_fail;
- }
-
- if (vma)
- vma_iter_next_range(vmi);
- /* create a vma struct for an anonymous mapping */
- vma = vm_area_alloc(mm);
- if (!vma)
- goto unacct_fail;
-
- vma_set_anonymous(vma);
- vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
- vm_flags_init(vma, flags);
- vma->vm_page_prot = vm_get_page_prot(flags);
- vma_start_write(vma);
- if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
- goto mas_store_fail;
-
- mm->map_count++;
- validate_mm(mm);
- ksm_add_vma(vma);
-out:
- perf_event_mmap(vma);
- mm->total_vm += len >> PAGE_SHIFT;
- mm->data_vm += len >> PAGE_SHIFT;
- if (flags & VM_LOCKED)
- mm->locked_vm += (len >> PAGE_SHIFT);
- vm_flags_set(vma, VM_SOFTDIRTY);
- return 0;
-
-mas_store_fail:
- vm_area_free(vma);
-unacct_fail:
- vm_unacct_memory(len >> PAGE_SHIFT);
- return -ENOMEM;
-}
-
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
struct mm_struct *mm = current->mm;
@@ -1659,7 +1277,6 @@ void exit_mmap(struct mm_struct *mm)
goto destroy;
}
- lru_add_drain();
flush_cache_mm(mm);
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
@@ -1688,7 +1305,8 @@ void exit_mmap(struct mm_struct *mm)
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
- remove_vma(vma, /* unreachable = */ true);
+ vma_mark_detached(vma);
+ remove_vma(vma);
count++;
cond_resched();
vma = vma_next(&vmi);
@@ -1703,48 +1321,6 @@ destroy:
vm_unacct_memory(nr_accounted);
}
-/* Insert vm structure into process list sorted by address
- * and into the inode's i_mmap tree. If vm_file is non-NULL
- * then i_mmap_rwsem is taken here.
- */
-int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- unsigned long charged = vma_pages(vma);
-
-
- if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
- return -ENOMEM;
-
- if ((vma->vm_flags & VM_ACCOUNT) &&
- security_vm_enough_memory_mm(mm, charged))
- return -ENOMEM;
-
- /*
- * The vm_pgoff of a purely anonymous vma should be irrelevant
- * until its first write fault, when page's anon_vma and index
- * are set. But now set the vm_pgoff it will almost certainly
- * end up with (unless mremap moves it elsewhere before that
- * first wfault), so /proc/pid/maps tells a consistent story.
- *
- * By setting it to reflect the virtual start address of the
- * vma, merges and splits can happen in a seamless way, just
- * using the existing file pgoff checks and manipulations.
- * Similarly in do_mmap and in do_brk_flags.
- */
- if (vma_is_anonymous(vma)) {
- BUG_ON(vma->anon_vma);
- vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
- }
-
- if (vma_link(mm, vma)) {
- if (vma->vm_flags & VM_ACCOUNT)
- vm_unacct_memory(charged);
- return -ENOMEM;
- }
-
- return 0;
-}
-
/*
* Return true if the calling process may expand its vm space by the passed
* number of pages
@@ -1926,8 +1502,59 @@ struct vm_area_struct *_install_special_mapping(
&special_mapping_vmops);
}
+#ifdef CONFIG_SYSCTL
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
+int sysctl_legacy_va_layout;
+#endif
+
+static const struct ctl_table mmap_table[] = {
+ {
+ .procname = "max_map_count",
+ .data = &sysctl_max_map_count,
+ .maxlen = sizeof(sysctl_max_map_count),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
+ {
+ .procname = "legacy_va_layout",
+ .data = &sysctl_legacy_va_layout,
+ .maxlen = sizeof(sysctl_legacy_va_layout),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+ {
+ .procname = "mmap_rnd_bits",
+ .data = &mmap_rnd_bits,
+ .maxlen = sizeof(mmap_rnd_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_bits_min,
+ .extra2 = (void *)&mmap_rnd_bits_max,
+ },
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ {
+ .procname = "mmap_rnd_compat_bits",
+ .data = &mmap_rnd_compat_bits,
+ .maxlen = sizeof(mmap_rnd_compat_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_compat_bits_min,
+ .extra2 = (void *)&mmap_rnd_compat_bits_max,
+ },
+#endif
+};
+#endif /* CONFIG_SYSCTL */
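The knobs registered here surface under /proc/sys/vm/ in the usual procfs layout. As a minimal, stand-alone user-space check of one of them (illustrative only; it simply assumes the standard path):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/max_map_count", "r");
	unsigned long val;

	if (!f) {
		perror("vm.max_map_count");
		return 1;
	}
	if (fscanf(f, "%lu", &val) == 1)
		printf("vm.max_map_count = %lu\n", val);
	fclose(f);
	return 0;
}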
+
/*
- * initialise the percpu counter for VM
+ * initialise the percpu counter for VM, initialise VMA state.
*/
void __init mmap_init(void)
{
@@ -1935,6 +1562,10 @@ void __init mmap_init(void)
ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
+#ifdef CONFIG_SYSCTL
+ register_sysctl_init("vm", mmap_table);
+#endif
+ vma_state_init();
}
/*
@@ -2046,84 +1677,217 @@ static int __meminit init_reserve_notifier(void)
subsys_initcall(init_reserve_notifier);
/*
- * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
- * this VMA and its relocated range, which will now reside at [vma->vm_start -
- * shift, vma->vm_end - shift).
+ * Obtain a read lock on mm->mmap_lock, if the specified address is below the
+ * start of the VMA, the intent is to perform a write, and it is a
+ * downward-growing stack, then attempt to expand the stack to contain it.
+ *
+ * This function is intended only for obtaining an argument page from an ELF
+ * image, and is almost certainly NOT what you want to use for any other
+ * purpose.
+ *
+ * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the
+ * VMA referenced must not be linked in any user-visible tree, i.e. it must be a
+ * new VMA being mapped.
+ *
+ * The function assumes that addr is either contained within the VMA or below
+ * it, and makes no attempt to validate this value beyond that.
+ *
+ * Returns true if the read lock was obtained and a stack was perhaps expanded,
+ * false if the stack expansion failed.
*
- * This function is almost certainly NOT what you want for anything other than
- * early executable temporary stack relocation.
+ * On stack expansion the function temporarily acquires an mmap write lock
+ * before downgrading it.
*/
-int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
+bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
+ struct vm_area_struct *new_vma,
+ unsigned long addr, bool write)
{
- /*
- * The process proceeds as follows:
- *
- * 1) Use shift to calculate the new vma endpoints.
- * 2) Extend vma to cover both the old and new ranges. This ensures the
- * arguments passed to subsequent functions are consistent.
- * 3) Move vma's page tables to the new range.
- * 4) Free up any cleared pgd range.
- * 5) Shrink the vma to cover only the new range.
- */
+ if (!write || addr >= new_vma->vm_start) {
+ mmap_read_lock(mm);
+ return true;
+ }
- struct mm_struct *mm = vma->vm_mm;
- unsigned long old_start = vma->vm_start;
- unsigned long old_end = vma->vm_end;
- unsigned long length = old_end - old_start;
- unsigned long new_start = old_start - shift;
- unsigned long new_end = old_end - shift;
- VMA_ITERATOR(vmi, mm, new_start);
- VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
- struct vm_area_struct *next;
- struct mmu_gather tlb;
+ if (!(new_vma->vm_flags & VM_GROWSDOWN))
+ return false;
- BUG_ON(new_start > new_end);
+ mmap_write_lock(mm);
+ if (expand_downwards(new_vma, addr)) {
+ mmap_write_unlock(mm);
+ return false;
+ }
- /*
- * ensure there are no vmas between where we want to go
- * and where we are
- */
- if (vma != vma_next(&vmi))
- return -EFAULT;
+ mmap_write_downgrade(mm);
+ return true;
+}
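A rough user-space analogue of the locking pattern above (read lock on the fast path, write lock only while expanding, then back to a reader) using a pthread rwlock in place of mmap_lock. pthreads has no true downgrade, so this sketch releases and re-acquires, which the kernel version avoids; all names below are invented.

#include <pthread.h>
#include <stdbool.h>

struct fake_mm  { pthread_rwlock_t lock; };
struct fake_vma { unsigned long vm_start; bool growsdown; };

/* Pretend stack expansion: pull vm_start down to cover addr. */
static bool fake_expand_downwards(struct fake_vma *vma, unsigned long addr)
{
	vma->vm_start = addr;
	return true;
}

static bool read_lock_maybe_expand(struct fake_mm *mm, struct fake_vma *vma,
				   unsigned long addr, bool write)
{
	if (!write || addr >= vma->vm_start) {
		pthread_rwlock_rdlock(&mm->lock);	/* fast path: reader only */
		return true;
	}
	if (!vma->growsdown)
		return false;

	pthread_rwlock_wrlock(&mm->lock);		/* expansion mutates the VMA */
	if (!fake_expand_downwards(vma, addr)) {
		pthread_rwlock_unlock(&mm->lock);
		return false;
	}
	/* No downgrade primitive in pthreads: drop and retake as a reader. */
	pthread_rwlock_unlock(&mm->lock);
	pthread_rwlock_rdlock(&mm->lock);
	return true;
}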
- vma_iter_prev_range(&vmi);
- /*
- * cover the whole range: [new_start, old_end)
- */
- vmg.vma = vma;
- if (vma_expand(&vmg))
- return -ENOMEM;
+__latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ struct vm_area_struct *mpnt, *tmp;
+ int retval;
+ unsigned long charge = 0;
+ LIST_HEAD(uf);
+ VMA_ITERATOR(vmi, mm, 0);
+ if (mmap_write_lock_killable(oldmm))
+ return -EINTR;
+ flush_cache_dup_mm(oldmm);
+ uprobe_dup_mmap(oldmm, mm);
/*
- * move the page tables downwards, on failure we rely on
- * process cleanup to remove whatever mess we made.
+ * Not linked in yet - no deadlock potential:
*/
- if (length != move_page_tables(vma, old_start,
- vma, new_start, length, false, true))
- return -ENOMEM;
+ mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
- lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
- next = vma_next(&vmi);
- if (new_end > old_start) {
+ /* No ordering required: file already has been exposed. */
+ dup_mm_exe_file(mm, oldmm);
+
+ mm->total_vm = oldmm->total_vm;
+ mm->data_vm = oldmm->data_vm;
+ mm->exec_vm = oldmm->exec_vm;
+ mm->stack_vm = oldmm->stack_vm;
+
+ /* Use __mt_dup() to efficiently build an identical maple tree. */
+ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
+ if (unlikely(retval))
+ goto out;
+
+ mt_clear_in_rcu(vmi.mas.tree);
+ for_each_vma(vmi, mpnt) {
+ struct file *file;
+
+ vma_start_write(mpnt);
+ if (mpnt->vm_flags & VM_DONTCOPY) {
+ retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
+ mpnt->vm_end, GFP_KERNEL);
+ if (retval)
+ goto loop_out;
+
+ vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
+ continue;
+ }
+ charge = 0;
/*
- * when the old and new regions overlap clear from new_end.
+ * Don't duplicate many vmas if we've been oom-killed (for
+ * example)
*/
- free_pgd_range(&tlb, new_end, old_end, new_end,
- next ? next->vm_start : USER_PGTABLES_CEILING);
+ if (fatal_signal_pending(current)) {
+ retval = -EINTR;
+ goto loop_out;
+ }
+ if (mpnt->vm_flags & VM_ACCOUNT) {
+ unsigned long len = vma_pages(mpnt);
+
+ if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
+ goto fail_nomem;
+ charge = len;
+ }
+
+ tmp = vm_area_dup(mpnt);
+ if (!tmp)
+ goto fail_nomem;
+ retval = vma_dup_policy(mpnt, tmp);
+ if (retval)
+ goto fail_nomem_policy;
+ tmp->vm_mm = mm;
+ retval = dup_userfaultfd(tmp, &uf);
+ if (retval)
+ goto fail_nomem_anon_vma_fork;
+ if (tmp->vm_flags & VM_WIPEONFORK) {
+ /*
+ * VM_WIPEONFORK gets a clean slate in the child.
+ * Don't prepare anon_vma until fault since we don't
+ * copy page for current vma.
+ */
+ tmp->anon_vma = NULL;
+ } else if (anon_vma_fork(tmp, mpnt))
+ goto fail_nomem_anon_vma_fork;
+ vm_flags_clear(tmp, VM_LOCKED_MASK);
+ /*
+ * Copy/update hugetlb private vma information.
+ */
+ if (is_vm_hugetlb_page(tmp))
+ hugetlb_dup_vma_private(tmp);
+
+ /*
+ * Link the vma into the MT. After using __mt_dup(), memory
+ * allocation is not necessary here, so it cannot fail.
+ */
+ vma_iter_bulk_store(&vmi, tmp);
+
+ mm->map_count++;
+
+ if (tmp->vm_ops && tmp->vm_ops->open)
+ tmp->vm_ops->open(tmp);
+
+ file = tmp->vm_file;
+ if (file) {
+ struct address_space *mapping = file->f_mapping;
+
+ get_file(file);
+ i_mmap_lock_write(mapping);
+ if (vma_is_shared_maywrite(tmp))
+ mapping_allow_writable(mapping);
+ flush_dcache_mmap_lock(mapping);
+ /* insert tmp into the share list, just after mpnt */
+ vma_interval_tree_insert_after(tmp, mpnt,
+ &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
+ i_mmap_unlock_write(mapping);
+ }
+
+ if (!(tmp->vm_flags & VM_WIPEONFORK))
+ retval = copy_page_range(tmp, mpnt);
+
+ if (retval) {
+ mpnt = vma_next(&vmi);
+ goto loop_out;
+ }
+ }
+ /* a new mm has just been created */
+ retval = arch_dup_mmap(oldmm, mm);
+loop_out:
+ vma_iter_free(&vmi);
+ if (!retval) {
+ mt_set_in_rcu(vmi.mas.tree);
+ ksm_fork(mm, oldmm);
+ khugepaged_fork(mm, oldmm);
} else {
+
/*
- * otherwise, clean from old_start; this is done to not touch
- * the address space in [new_end, old_start) some architectures
- * have constraints on va-space that make this illegal (IA64) -
- * for the others its just a little faster.
+ * The entire maple tree has already been duplicated. If the
+ * mmap duplication fails, mark the failure point with
+ * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
+ * stop releasing VMAs that have not been duplicated after this
+ * point.
*/
- free_pgd_range(&tlb, old_start, old_end, new_end,
- next ? next->vm_start : USER_PGTABLES_CEILING);
+ if (mpnt) {
+ mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
+ mas_store(&vmi.mas, XA_ZERO_ENTRY);
+ /* Avoid OOM iterating a broken tree */
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ }
+ /*
+ * The mm_struct is going to exit, but the locks will be dropped
+ * first. Marking the mm_struct as unstable is advisable, as it is
+ * not fully initialised.
+ */
+ set_bit(MMF_UNSTABLE, &mm->flags);
}
- tlb_finish_mmu(&tlb);
+out:
+ mmap_write_unlock(mm);
+ flush_tlb_mm(oldmm);
+ mmap_write_unlock(oldmm);
+ if (!retval)
+ dup_userfaultfd_complete(&uf);
+ else
+ dup_userfaultfd_fail(&uf);
+ return retval;
- vma_prev(&vmi);
- /* Shrink the vma to just the new range */
- return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
+fail_nomem_anon_vma_fork:
+ mpol_put(vma_policy(tmp));
+fail_nomem_policy:
+ vm_area_free(tmp);
+fail_nomem:
+ retval = -ENOMEM;
+ vm_unacct_memory(charge);
+ goto loop_out;
}
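The failure-marking idea used above (store XA_ZERO_ENTRY at the point where duplication stopped, so teardown does not walk entries that were never populated) can be illustrated with a stand-alone sketch; everything here is made up for illustration and uses a plain array rather than a maple tree.

#include <stdlib.h>
#include <string.h>

struct item { char *data; };

static int dup_items(struct item *dst, const struct item *src, size_t n,
		     size_t *n_done)
{
	size_t i;

	for (i = 0; i < n; i++) {
		dst[i].data = strdup(src[i].data);
		if (!dst[i].data) {
			*n_done = i;	/* analogous to the XA_ZERO_ENTRY marker */
			return -1;
		}
	}
	*n_done = n;
	return 0;
}

static void free_items(struct item *dst, size_t n_done)
{
	while (n_done--)
		free(dst[n_done].data);	/* never touches entries past the marker */
}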
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index f186d57df2c6..5f725cc67334 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -17,51 +17,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
-#ifdef CONFIG_MEMCG
-
-/*
- * Size of the buffer for memcg path names. Ignoring stack trace support,
- * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it.
- */
-#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
-
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
- do { \
- if (trace_mmap_lock_##type##_enabled()) { \
- char buf[MEMCG_PATH_BUF_SIZE]; \
- get_mm_memcg_path(mm, buf, sizeof(buf)); \
- trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \
- } \
- } while (0)
-
-#else /* !CONFIG_MEMCG */
-
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
- trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
-
-#endif /* CONFIG_MEMCG */
-
#ifdef CONFIG_TRACING
-#ifdef CONFIG_MEMCG
-/*
- * Write the given mm_struct's memcg path to a buffer. If the path cannot be
- * determined, empty string is written.
- */
-static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
-{
- struct mem_cgroup *memcg;
-
- buf[0] = '\0';
- memcg = get_mem_cgroup_from_mm(mm);
- if (memcg == NULL)
- return;
- if (memcg->css.cgroup)
- cgroup_path(memcg->css.cgroup, buf, buflen);
- css_put(&memcg->css);
-}
-
-#endif /* CONFIG_MEMCG */
-
/*
* Trace calls must be in a separate file, as otherwise there's a circular
* dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
@@ -69,20 +25,293 @@ static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
- TRACE_MMAP_LOCK_EVENT(start_locking, mm, write);
+ trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
bool success)
{
- TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success);
+ trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
- TRACE_MMAP_LOCK_EVENT(released, mm, write);
+ trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */
+
+#ifdef CONFIG_MMU
+#ifdef CONFIG_PER_VMA_LOCK
+static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
+{
+ unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
+
+ /* Additional refcnt if the vma is attached. */
+ if (!detaching)
+ tgt_refcnt++;
+
+ /*
+ * If vma is detached then only vma_mark_attached() can raise the
+ * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
+ */
+ if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
+ return false;
+
+ rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
+ rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
+ refcount_read(&vma->vm_refcnt) == tgt_refcnt,
+ TASK_UNINTERRUPTIBLE);
+ lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
+
+ return true;
+}
+
+static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
+{
+ *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
+ rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+}
+
+void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
+{
+ bool locked;
+
+ /*
+ * __vma_enter_locked() returns false immediately if the vma is not
+ * attached; otherwise it waits until the refcnt indicates that the vma
+ * is attached with no readers.
+ */
+ locked = __vma_enter_locked(vma, false);
+
+ /*
+ * We should use WRITE_ONCE() here because we can have concurrent reads
+ * from the early lockless pessimistic check in vma_start_read().
+ * We don't really care about the correctness of that early check, but
+ * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
+ */
+ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
+
+ if (locked) {
+ bool detached;
+
+ __vma_exit_locked(vma, &detached);
+ WARN_ON_ONCE(detached); /* vma should remain attached */
+ }
+}
+EXPORT_SYMBOL_GPL(__vma_start_write);
+
+void vma_mark_detached(struct vm_area_struct *vma)
+{
+ vma_assert_write_locked(vma);
+ vma_assert_attached(vma);
+
+ /*
+ * We are the only writer, so no need to use vma_refcount_put().
+ * The condition below is unlikely because the vma has been already
+ * write-locked and readers can increment vm_refcnt only temporarily
+ * before they check vm_lock_seq, realize the vma is locked and drop
+ * back the vm_refcnt. That is a narrow window for observing a raised
+ * vm_refcnt.
+ */
+ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
+ /* Wait until vma is detached with no readers. */
+ if (__vma_enter_locked(vma, true)) {
+ bool detached;
+
+ __vma_exit_locked(vma, &detached);
+ WARN_ON_ONCE(!detached);
+ }
+ }
+}
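A compressed, stand-alone illustration of the vm_refcnt convention relied on above: readers take +1 references, while a writer parks a large offset on the count so that waiters can tell "attached with no readers" apart from an arbitrary reader count. The constant and names below are invented for the sketch and the wakeup/wait machinery is omitted.

#include <stdatomic.h>
#include <stdbool.h>

#define FAKE_LOCK_OFFSET 0x40000000u	/* stand-in for VMA_LOCK_OFFSET */

/* 0 = detached, [1, OFFSET) = attached plus readers, >= OFFSET = writer active. */
static bool fake_reader_tryget(atomic_uint *refcnt)
{
	unsigned int old = atomic_load(refcnt);

	while (old != 0 && old < FAKE_LOCK_OFFSET) {
		if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
			return true;	/* read lock taken */
	}
	return false;	/* detached, or a writer owns the VMA */
}

static void fake_reader_put(atomic_uint *refcnt)
{
	atomic_fetch_sub(refcnt, 1);	/* the real code may also need to wake a writer */
}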
+
+/*
+ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
+ * stable and not isolated. If the VMA is not found or is being modified the
+ * function returns NULL.
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address)
+{
+ MA_STATE(mas, &mm->mm_mt, address, address);
+ struct vm_area_struct *vma;
+
+ rcu_read_lock();
+retry:
+ vma = mas_walk(&mas);
+ if (!vma)
+ goto inval;
+
+ vma = vma_start_read(mm, vma);
+ if (IS_ERR_OR_NULL(vma)) {
+ /* Check if the VMA got isolated after we found it */
+ if (PTR_ERR(vma) == -EAGAIN) {
+ count_vm_vma_lock_event(VMA_LOCK_MISS);
+ /* The area was replaced with another one */
+ goto retry;
+ }
+
+ /* Failed to lock the VMA */
+ goto inval;
+ }
+ /*
+ * At this point, we have a stable reference to a VMA: The VMA is
+ * locked and we know it hasn't already been isolated.
+ * From here on, we can access the VMA without worrying about which
+ * fields are accessible for RCU readers.
+ */
+
+ /* Check if the vma we locked is the right one. */
+ if (unlikely(vma->vm_mm != mm ||
+ address < vma->vm_start || address >= vma->vm_end))
+ goto inval_end_read;
+
+ rcu_read_unlock();
+ return vma;
+
+inval_end_read:
+ vma_end_read(vma);
+inval:
+ rcu_read_unlock();
+ count_vm_vma_lock_event(VMA_LOCK_ABORT);
+ return NULL;
+}
+#endif /* CONFIG_PER_VMA_LOCK */
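The expected consumer of lock_vma_under_rcu() is the architecture page-fault fast path; the shape is roughly as follows (a simplified sketch only, retry and flag handling vary by architecture):

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		goto lock_mmap;		/* fall back to the mmap_lock path */

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);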
+
+#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
+#include <linux/extable.h>
+
+static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+ if (likely(mmap_read_trylock(mm)))
+ return true;
+
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = exception_ip(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+
+ return !mmap_read_lock_killable(mm);
+}
+
+static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+{
+ /*
+ * We don't have this operation yet.
+ *
+ * It should be easy enough to do: it's basically a
+ * atomic_long_try_cmpxchg_acquire()
+ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
+ * it also needs the proper lockdep magic etc.
+ */
+ return false;
+}
+
+static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+ mmap_read_unlock(mm);
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = exception_ip(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+ return !mmap_write_lock_killable(mm);
+}
+
+/*
+ * Helper for page fault handling.
+ *
+ * This is kind of equivalent to "mmap_read_lock()" followed
+ * by "find_extend_vma()", except it's a lot more careful about
+ * the locking (and will drop the lock on failure).
+ *
+ * For example, if we have a kernel bug that causes a page
+ * fault, we don't want to just use mmap_read_lock() to get
+ * the mm lock, because that would deadlock if the bug were
+ * to happen while we're holding the mm lock for writing.
+ *
+ * So this checks the exception tables on kernel faults in
+ * order to only do this all for instructions that are actually
+ * expected to fault.
+ *
+ * We can also actually take the mm lock for writing if we
+ * need to extend the vma, which helps the VM layer a lot.
+ */
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, struct pt_regs *regs)
+{
+ struct vm_area_struct *vma;
+
+ if (!get_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (likely(vma && (vma->vm_start <= addr)))
+ return vma;
+
+ /*
+ * Well, dang. We might still be successful, but only
+ * if we can extend a vma to do so.
+ */
+ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+
+ /*
+ * We can try to upgrade the mmap lock atomically,
+ * in which case we can continue to use the vma
+ * we already looked up.
+ *
+ * Otherwise we'll have to drop the mmap lock and
+ * re-take it, and also look up the vma again,
+ * re-checking it.
+ */
+ if (!mmap_upgrade_trylock(mm)) {
+ if (!upgrade_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (!vma)
+ goto fail;
+ if (vma->vm_start <= addr)
+ goto success;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto fail;
+ }
+
+ if (expand_stack_locked(vma, addr))
+ goto fail;
+
+success:
+ mmap_write_downgrade(mm);
+ return vma;
+
+fail:
+ mmap_write_unlock(mm);
+ return NULL;
+}
+#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */
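For context, the callers of this helper are architecture fault handlers, and the main thing to remember is the calling convention: on failure the lock has already been dropped for you. A simplified sketch of that calling pattern:

	vma = lock_mm_and_find_vma(mm, address, regs);
	if (unlikely(!vma))
		return;		/* no usable VMA; the mmap lock is already dropped */

	/*
	 * mm is now read-locked and vma covers address; the usual
	 * handle_mm_fault() sequence follows, ending in mmap_read_unlock(mm).
	 */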
+
+#else /* CONFIG_MMU */
+
+/*
+ * At least xtensa ends up having protection faults even with no
+ * MMU. No stack expansion, at least.
+ */
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, struct pt_regs *regs)
+{
+ struct vm_area_struct *vma;
+
+ mmap_read_lock(mm);
+ vma = vma_lookup(mm, addr);
+ if (!vma)
+ mmap_read_unlock(mm);
+ return vma;
+}
+
+#endif /* CONFIG_MMU */
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 99b3e9408aa0..b49cc6385f1f 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -246,8 +246,16 @@ static void __tlb_remove_table_free(struct mmu_table_batch *batch)
* IRQs delays the completion of the TLB flush we can never observe an already
* freed page.
*
- * Architectures that do not have this (PPC) need to delay the freeing by some
- * other means, this is that means.
+ * Not all systems IPI every CPU for this purpose:
+ *
+ * - Some architectures have HW support for cross-CPU synchronisation of TLB
+ * flushes, so there's no IPI at all.
+ *
+ * - Paravirt guests can do this TLB flushing in the hypervisor, or coordinate
+ * with the hypervisor to defer flushing on preempted vCPUs.
+ *
+ * Such systems need to delay the freeing by some other means, this is that
+ * means.
*
* What we do is batch the freed directory pages (tables) and RCU free them.
* We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
@@ -311,11 +319,34 @@ static inline void tlb_table_invalidate(struct mmu_gather *tlb)
}
}
-static void tlb_remove_table_one(void *table)
+#ifdef CONFIG_PT_RECLAIM
+static inline void __tlb_remove_table_one_rcu(struct rcu_head *head)
+{
+ struct ptdesc *ptdesc;
+
+ ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
+ __tlb_remove_table(ptdesc);
+}
+
+static inline void __tlb_remove_table_one(void *table)
+{
+ struct ptdesc *ptdesc;
+
+ ptdesc = table;
+ call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu);
+}
+#else
+static inline void __tlb_remove_table_one(void *table)
{
tlb_remove_table_sync_one();
__tlb_remove_table(table);
}
+#endif /* CONFIG_PT_RECLAIM */
+
+static void tlb_remove_table_one(void *table)
+{
+ __tlb_remove_table_one(table);
+}
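The CONFIG_PT_RECLAIM branch above is an instance of the standard call_rcu() deferral idiom; in generic form it looks like the following ('foo' is obviously made up):

struct foo {
	struct rcu_head rcu;
	/* payload ... */
};

static void foo_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct foo, rcu));
}

static void foo_free_deferred(struct foo *f)
{
	/* Real freeing happens only after all pre-existing RCU readers finish. */
	call_rcu(&f->rcu, foo_free_rcu);
}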
static void tlb_table_flush(struct mmu_gather *tlb)
{
@@ -393,6 +424,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
tlb->page_size = 0;
#endif
+ tlb->vma_pfn = 0;
__tlb_reset_range(tlb);
inc_tlb_flush_pending(tlb->mm);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index fc18fe274505..8e0125dc0522 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -4,7 +4,7 @@
*
* Copyright (C) 2008 Qumranet, Inc.
* Copyright (C) 2008 SGI
- * Christoph Lameter <cl@linux.com>
+ * Christoph Lameter <cl@gentwo.org>
*/
#include <linux/rculist.h>
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 516b1d847e2c..88608d0dc2c2 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -133,7 +133,7 @@ static long change_pte_range(struct mmu_gather *tlb,
/* Also skip shared copy-on-write pages */
if (is_cow_mapping(vma->vm_flags) &&
(folio_maybe_dma_pinned(folio) ||
- folio_likely_mapped_shared(folio)))
+ folio_maybe_mapped_shared(folio)))
continue;
/*
@@ -225,14 +225,6 @@ static long change_pte_range(struct mmu_gather *tlb,
newpte = swp_entry_to_pte(entry);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
- } else if (is_writable_device_exclusive_entry(entry)) {
- entry = make_readable_device_exclusive_entry(
- swp_offset(entry));
- newpte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(oldpte))
- newpte = pte_swp_mksoft_dirty(newpte);
- if (pte_swp_uffd_wp(oldpte))
- newpte = pte_swp_mkuffd_wp(newpte);
} else if (is_pte_marker_entry(entry)) {
/*
* Ignore error swap entries unconditionally,
@@ -387,7 +379,7 @@ again:
if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) {
if ((next - addr != HPAGE_PMD_SIZE) ||
pgtable_split_needed(vma, cp_flags)) {
- __split_huge_pmd(vma, pmd, addr, false, NULL);
+ __split_huge_pmd(vma, pmd, addr, false);
/*
* For file-backed, the pmd could have been
* cleared; make sure pmd populated if
@@ -607,7 +599,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
unsigned long start, unsigned long end, unsigned long newflags)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long oldflags = vma->vm_flags;
+ unsigned long oldflags = READ_ONCE(vma->vm_flags);
long nrpages = (end - start) >> PAGE_SHIFT;
unsigned int mm_cp_flags = 0;
unsigned long charged = 0;
@@ -627,7 +619,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
* uncommon case, so doesn't need to be very optimized.
*/
if (arch_has_pfn_modify_check() &&
- (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
+ (oldflags & (VM_PFNMAP|VM_MIXEDMAP)) &&
(newflags & VM_ACCESS_FLAGS) == 0) {
pgprot_t new_pgprot = vm_get_page_prot(newflags);
@@ -676,7 +668,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
* held in write mode.
*/
vma_start_write(vma);
- vm_flags_reset(vma, newflags);
+ vm_flags_reset_once(vma, newflags);
if (vma_wants_manual_pte_write_upgrade(vma))
mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
vma_set_page_prot(vma);
diff --git a/mm/mremap.c b/mm/mremap.c
index 60473413836b..60f6b8d0d5f0 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -32,6 +32,45 @@
#include "internal.h"
+/* Classify the kind of remap operation being performed. */
+enum mremap_type {
+ MREMAP_INVALID, /* Initial state. */
+ MREMAP_NO_RESIZE, /* old_len == new_len, if not moved, do nothing. */
+ MREMAP_SHRINK, /* old_len > new_len. */
+ MREMAP_EXPAND, /* old_len < new_len. */
+};
+
+/*
+ * Describes a VMA mremap() operation and is threaded throughout it.
+ *
+ * Any of the fields may be mutated by the operation; however, these values will
+ * always accurately reflect the remap (for instance, we may adjust lengths and
+ * delta to account for hugetlb alignment).
+ */
+struct vma_remap_struct {
+ /* User-provided state. */
+ unsigned long addr; /* User-specified address from which we remap. */
+ unsigned long old_len; /* Length of range being remapped. */
+ unsigned long new_len; /* Desired new length of mapping. */
+ unsigned long flags; /* user-specified MREMAP_* flags. */
+ unsigned long new_addr; /* Optionally, desired new address. */
+
+ /* uffd state. */
+ struct vm_userfaultfd_ctx *uf;
+ struct list_head *uf_unmap_early;
+ struct list_head *uf_unmap;
+
+ /* VMA state, determined in do_mremap(). */
+ struct vm_area_struct *vma;
+
+ /* Internal state, determined in do_mremap(). */
+ unsigned long delta; /* Absolute delta of old_len,new_len. */
+ bool mlocked; /* Was the VMA mlock()'d? */
+ enum mremap_type remap_type; /* expand, shrink, etc. */
+ bool mmap_locked; /* Is mm currently write-locked? */
+ unsigned long charged; /* If VM_ACCOUNT, # pages to account. */
+};
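The user-provided state captured here comes straight from mremap(2). A minimal, self-contained example of a growing, movable remap (flag combinations such as MREMAP_FIXED or MREMAP_DONTUNMAP would additionally supply new_addr and change the unmap behaviour):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	size_t old_len = 4096 * 4, new_len = 4096 * 16;
	void *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *q;

	if (p == MAP_FAILED)
		return 1;
	/* addr/old_len/new_len/flags here populate the kernel-side request. */
	q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
	if (q == MAP_FAILED)
		return 1;
	printf("moved %p -> %p\n", p, q);
	munmap(q, new_len);
	return 0;
}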
+
static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
@@ -69,8 +108,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
return pmd;
}
-static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr)
+static pud_t *alloc_new_pud(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -83,13 +121,12 @@ static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
return pud_alloc(mm, p4d, addr);
}
-static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
{
pud_t *pud;
pmd_t *pmd;
- pud = alloc_new_pud(mm, vma, addr);
+ pud = alloc_new_pud(mm, addr);
if (!pud)
return NULL;
@@ -133,16 +170,19 @@ static pte_t move_soft_dirty_pte(pte_t pte)
return pte;
}
-static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
- unsigned long old_addr, unsigned long old_end,
- struct vm_area_struct *new_vma, pmd_t *new_pmd,
- unsigned long new_addr, bool need_rmap_locks)
+static int move_ptes(struct pagetable_move_control *pmc,
+ unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd)
{
+ struct vm_area_struct *vma = pmc->old;
+ bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
pmd_t dummy_pmdval;
spinlock_t *old_ptl, *new_ptl;
bool force_flush = false;
+ unsigned long old_addr = pmc->old_addr;
+ unsigned long new_addr = pmc->new_addr;
+ unsigned long old_end = old_addr + extent;
unsigned long len = old_end - old_addr;
int err = 0;
@@ -164,7 +204,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* serialize access to individual ptes, but only rmap traversal
* order guarantees that we won't miss both the old and new ptes).
*/
- if (need_rmap_locks)
+ if (pmc->need_rmap_locks)
take_rmap_locks(vma);
/*
@@ -197,6 +237,8 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
new_pte++, new_addr += PAGE_SIZE) {
+ VM_WARN_ON_ONCE(!pte_none(*new_pte));
+
if (pte_none(ptep_get(old_pte)))
continue;
@@ -216,7 +258,18 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
force_flush = true;
pte = move_pte(pte, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
- set_pte_at(mm, new_addr, new_pte, pte);
+
+ if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ pte_clear(mm, new_addr, new_pte);
+ else {
+ if (need_clear_uffd_wp) {
+ if (pte_present(pte))
+ pte = pte_clear_uffd_wp(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
+ }
+ set_pte_at(mm, new_addr, new_pte, pte);
+ }
}
arch_leave_lazy_mmu_mode();
@@ -227,7 +280,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
out:
- if (need_rmap_locks)
+ if (pmc->need_rmap_locks)
drop_rmap_locks(vma);
return err;
}
@@ -242,10 +295,11 @@ static inline bool arch_supports_page_table_move(void)
#endif
#ifdef CONFIG_HAVE_MOVE_PMD
-static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
+static bool move_normal_pmd(struct pagetable_move_control *pmc,
+ pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
+ struct vm_area_struct *vma = pmc->old;
struct mm_struct *mm = vma->vm_mm;
bool res = false;
pmd_t pmd;
@@ -278,11 +332,20 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
return false;
+ /* If this pmd belongs to a uffd vma with remap events disabled, we need
+ * to ensure that the uffd-wp state is cleared from all pgtables. This
+ * means recursing into lower page tables in move_page_tables(), and we
+ * can reuse the existing code if we simply treat the entry as "not
+ * moved".
+ */
+ if (vma_has_uffd_without_event_remap(vma))
+ return false;
+
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
*/
- old_ptl = pmd_lock(vma->vm_mm, old_pmd);
+ old_ptl = pmd_lock(mm, old_pmd);
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -299,7 +362,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
VM_BUG_ON(!pmd_none(*new_pmd));
pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
- flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+ flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PMD_SIZE);
out_unlock:
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
@@ -308,19 +371,19 @@ out_unlock:
return res;
}
#else
-static inline bool move_normal_pmd(struct vm_area_struct *vma,
- unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
- pmd_t *new_pmd)
+static inline bool move_normal_pmd(struct pagetable_move_control *pmc,
+ pmd_t *old_pmd, pmd_t *new_pmd)
{
return false;
}
#endif
#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
-static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+static bool move_normal_pud(struct pagetable_move_control *pmc,
+ pud_t *old_pud, pud_t *new_pud)
{
spinlock_t *old_ptl, *new_ptl;
+ struct vm_area_struct *vma = pmc->old;
struct mm_struct *mm = vma->vm_mm;
pud_t pud;
@@ -333,11 +396,20 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pud_none(*new_pud)))
return false;
+ /* If this pud belongs to a uffd vma with remap events disabled, we need
+ * to ensure that the uffd-wp state is cleared from all pgtables. This
+ * means recursing into lower page tables in move_page_tables(), and we
+ * can reuse the existing code if we simply treat the entry as "not
+ * moved".
+ */
+ if (vma_has_uffd_without_event_remap(vma))
+ return false;
+
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
*/
- old_ptl = pud_lock(vma->vm_mm, old_pud);
+ old_ptl = pud_lock(mm, old_pud);
new_ptl = pud_lockptr(mm, new_pud);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -349,7 +421,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
VM_BUG_ON(!pud_none(*new_pud));
pud_populate(mm, new_pud, pud_pgtable(pud));
- flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
+ flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PUD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
@@ -357,19 +429,19 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
return true;
}
#else
-static inline bool move_normal_pud(struct vm_area_struct *vma,
- unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
- pud_t *new_pud)
+static inline bool move_normal_pud(struct pagetable_move_control *pmc,
+ pud_t *old_pud, pud_t *new_pud)
{
return false;
}
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
-static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+static bool move_huge_pud(struct pagetable_move_control *pmc,
+ pud_t *old_pud, pud_t *new_pud)
{
spinlock_t *old_ptl, *new_ptl;
+ struct vm_area_struct *vma = pmc->old;
struct mm_struct *mm = vma->vm_mm;
pud_t pud;
@@ -384,7 +456,7 @@ static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
*/
- old_ptl = pud_lock(vma->vm_mm, old_pud);
+ old_ptl = pud_lock(mm, old_pud);
new_ptl = pud_lockptr(mm, new_pud);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -397,8 +469,8 @@ static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
/* Set the new pud */
/* mark soft_dirty when we add pud level soft dirty support */
- set_pud_at(mm, new_addr, new_pud, pud);
- flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
+ set_pud_at(mm, pmc->new_addr, new_pud, pud);
+ flush_pud_tlb_range(vma, pmc->old_addr, pmc->old_addr + HPAGE_PUD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
@@ -406,8 +478,9 @@ static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
return true;
}
#else
-static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+static bool move_huge_pud(struct pagetable_move_control *pmc,
+ pud_t *old_pud, pud_t *new_pud)
{
WARN_ON_ONCE(1);
return false;
@@ -428,10 +501,12 @@ enum pgt_entry {
* destination pgt_entry.
*/
static __always_inline unsigned long get_extent(enum pgt_entry entry,
- unsigned long old_addr, unsigned long old_end,
- unsigned long new_addr)
+ struct pagetable_move_control *pmc)
{
unsigned long next, extent, mask, size;
+ unsigned long old_addr = pmc->old_addr;
+ unsigned long old_end = pmc->old_end;
+ unsigned long new_addr = pmc->new_addr;
switch (entry) {
case HPAGE_PMD:
@@ -461,37 +536,50 @@ static __always_inline unsigned long get_extent(enum pgt_entry entry,
}
/*
+ * Should move_pgt_entry() acquire the rmap locks? This is either expressed in
+ * the PMC, or overridden in the case of normal, larger page tables.
+ */
+static bool should_take_rmap_locks(struct pagetable_move_control *pmc,
+ enum pgt_entry entry)
+{
+ switch (entry) {
+ case NORMAL_PMD:
+ case NORMAL_PUD:
+ return true;
+ default:
+ return pmc->need_rmap_locks;
+ }
+}
+
+/*
* Attempts to speedup the move by moving entry at the level corresponding to
* pgt_entry. Returns true if the move was successful, else false.
*/
-static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
- unsigned long old_addr, unsigned long new_addr,
- void *old_entry, void *new_entry, bool need_rmap_locks)
+static bool move_pgt_entry(struct pagetable_move_control *pmc,
+ enum pgt_entry entry, void *old_entry, void *new_entry)
{
bool moved = false;
+ bool need_rmap_locks = should_take_rmap_locks(pmc, entry);
/* See comment in move_ptes() */
if (need_rmap_locks)
- take_rmap_locks(vma);
+ take_rmap_locks(pmc->old);
switch (entry) {
case NORMAL_PMD:
- moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
- new_entry);
+ moved = move_normal_pmd(pmc, old_entry, new_entry);
break;
case NORMAL_PUD:
- moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
- new_entry);
+ moved = move_normal_pud(pmc, old_entry, new_entry);
break;
case HPAGE_PMD:
moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- move_huge_pmd(vma, old_addr, new_addr, old_entry,
+ move_huge_pmd(pmc->old, pmc->old_addr, pmc->new_addr, old_entry,
new_entry);
break;
case HPAGE_PUD:
moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- move_huge_pud(vma, old_addr, new_addr, old_entry,
- new_entry);
+ move_huge_pud(pmc, old_entry, new_entry);
break;
default:
@@ -500,7 +588,7 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
}
if (need_rmap_locks)
- drop_rmap_locks(vma);
+ drop_rmap_locks(pmc->old);
return moved;
}
@@ -511,8 +599,9 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
* the VMA that is created to span the source and destination of the move,
* so we make an exception for it.
*/
-static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
- unsigned long mask, bool for_stack)
+static bool can_align_down(struct pagetable_move_control *pmc,
+ struct vm_area_struct *vma, unsigned long addr_to_align,
+ unsigned long mask)
{
unsigned long addr_masked = addr_to_align & mask;
@@ -521,11 +610,11 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali
* of the corresponding VMA, we can't align down or we will destroy part
* of the current mapping.
*/
- if (!for_stack && vma->vm_start != addr_to_align)
+ if (!pmc->for_stack && vma->vm_start != addr_to_align)
return false;
/* In the stack case we explicitly permit in-VMA alignment. */
- if (for_stack && addr_masked >= vma->vm_start)
+ if (pmc->for_stack && addr_masked >= vma->vm_start)
return true;
/*
@@ -535,163 +624,390 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali
return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL;
}
-/* Opportunistically realign to specified boundary for faster copy. */
-static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma,
- unsigned long *new_addr, struct vm_area_struct *new_vma,
- unsigned long mask, bool for_stack)
+/*
+ * Determine if we are in fact able to realign for efficiency to a higher page
+ * table boundary.
+ */
+static bool can_realign_addr(struct pagetable_move_control *pmc,
+ unsigned long pagetable_mask)
{
+ unsigned long align_mask = ~pagetable_mask;
+ unsigned long old_align = pmc->old_addr & align_mask;
+ unsigned long new_align = pmc->new_addr & align_mask;
+ unsigned long pagetable_size = align_mask + 1;
+ unsigned long old_align_next = pagetable_size - old_align;
+
+ /*
+ * We don't want to have to go hunting for VMAs from the end of the old
+ * VMA to the next page table boundary; we also want to make sure the
+ * operation is worthwhile.
+ *
+ * So ensure that we only perform this realignment if the end of the
+ * range being copied reaches or crosses the page table boundary.
+ *
+ * boundary boundary
+ * .<- old_align -> .
+ * . |----------------.-----------|
+ * . | vma . |
+ * . |----------------.-----------|
+ * . <----------------.----------->
+ * . len_in
+ * <------------------------------->
+ * . pagetable_size .
+ * . <---------------->
+ * . old_align_next .
+ */
+ if (pmc->len_in < old_align_next)
+ return false;
+
/* Skip if the addresses are already aligned. */
- if ((*old_addr & ~mask) == 0)
- return;
+ if (old_align == 0)
+ return false;
/* Only realign if the new and old addresses are mutually aligned. */
- if ((*old_addr & ~mask) != (*new_addr & ~mask))
- return;
+ if (old_align != new_align)
+ return false;
/* Ensure realignment doesn't cause overlap with existing mappings. */
- if (!can_align_down(old_vma, *old_addr, mask, for_stack) ||
- !can_align_down(new_vma, *new_addr, mask, for_stack))
+ if (!can_align_down(pmc, pmc->old, pmc->old_addr, pagetable_mask) ||
+ !can_align_down(pmc, pmc->new, pmc->new_addr, pagetable_mask))
+ return false;
+
+ return true;
+}
+
+/*
+ * Opportunistically realign to specified boundary for faster copy.
+ *
+ * Consider an mremap() of a VMA with page table boundaries as below, and no
+ * preceding VMAs from the lower page table boundary to the start of the VMA,
+ * with the end of the range reaching or crossing the page table boundary.
+ *
+ * boundary boundary
+ * . |----------------.-----------|
+ * . | vma . |
+ * . |----------------.-----------|
+ * . pmc->old_addr . pmc->old_end
+ * . <---------------------------->
+ * . move these page tables
+ *
+ * If we proceed with moving page tables in this scenario, we will have a lot of
+ * work to do traversing old page tables and establishing new ones in the
+ * destination across multiple lower level page tables.
+ *
+ * The idea here is simply to align pmc->old_addr, pmc->new_addr down to the
+ * page table boundary, so we can copy a single page table entry for the
+ * aligned portion of the VMA instead:
+ *
+ * boundary boundary
+ * . |----------------.-----------|
+ * . | vma . |
+ * . |----------------.-----------|
+ * pmc->old_addr . pmc->old_end
+ * <------------------------------------------->
+ * . move these page tables
+ */
+static void try_realign_addr(struct pagetable_move_control *pmc,
+ unsigned long pagetable_mask)
+{
+
+ if (!can_realign_addr(pmc, pagetable_mask))
return;
- *old_addr = *old_addr & mask;
- *new_addr = *new_addr & mask;
+ /*
+ * Simply align to page table boundaries. Note that we do NOT update the
+ * pmc->old_end value, and since the move_page_tables() operation spans
+ * from [old_addr, old_end) (offsetting new_addr as it is performed),
+ * this simply changes the start of the copy, not the end.
+ */
+ pmc->old_addr &= pagetable_mask;
+ pmc->new_addr &= pagetable_mask;
}
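The condition can_realign_addr() enforces reduces to a little alignment arithmetic. A stand-alone version with PMD_SIZE pinned to the common 2 MiB value; note the in-tree code additionally checks can_align_down() on both VMAs, which is omitted here:

#include <stdbool.h>

#define FAKE_PMD_SIZE	(2UL << 20)
#define FAKE_PMD_MASK	(~(FAKE_PMD_SIZE - 1))

static bool may_realign(unsigned long old_addr, unsigned long new_addr,
			unsigned long len)
{
	unsigned long old_align = old_addr & ~FAKE_PMD_MASK;
	unsigned long new_align = new_addr & ~FAKE_PMD_MASK;

	/* The copy must reach or cross the next PMD boundary... */
	if (len < FAKE_PMD_SIZE - old_align)
		return false;
	/* ...the start must actually be misaligned... */
	if (old_align == 0)
		return false;
	/* ...and both addresses must share the same misalignment. */
	return old_align == new_align;
}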
-unsigned long move_page_tables(struct vm_area_struct *vma,
- unsigned long old_addr, struct vm_area_struct *new_vma,
- unsigned long new_addr, unsigned long len,
- bool need_rmap_locks, bool for_stack)
+/* Is the page table move operation done? */
+static bool pmc_done(struct pagetable_move_control *pmc)
{
- unsigned long extent, old_end;
+ return pmc->old_addr >= pmc->old_end;
+}
+
+/* Advance to the next page table, offset by extent bytes. */
+static void pmc_next(struct pagetable_move_control *pmc, unsigned long extent)
+{
+ pmc->old_addr += extent;
+ pmc->new_addr += extent;
+}
+
+/*
+ * Determine how many bytes in the specified input range have had their page
+ * tables moved so far.
+ */
+static unsigned long pmc_progress(struct pagetable_move_control *pmc)
+{
+ unsigned long orig_old_addr = pmc->old_end - pmc->len_in;
+ unsigned long old_addr = pmc->old_addr;
+
+ /*
+ * Prevent negative return values when {old,new}_addr was realigned but
+ * we broke out of the loop in move_page_tables() for the first PMD
+ * itself.
+ */
+ return old_addr < orig_old_addr ? 0 : old_addr - orig_old_addr;
+}
+
+unsigned long move_page_tables(struct pagetable_move_control *pmc)
+{
+ unsigned long extent;
struct mmu_notifier_range range;
pmd_t *old_pmd, *new_pmd;
pud_t *old_pud, *new_pud;
+ struct mm_struct *mm = pmc->old->vm_mm;
- if (!len)
+ if (!pmc->len_in)
return 0;
- old_end = old_addr + len;
-
- if (is_vm_hugetlb_page(vma))
- return move_hugetlb_page_tables(vma, new_vma, old_addr,
- new_addr, len);
+ if (is_vm_hugetlb_page(pmc->old))
+ return move_hugetlb_page_tables(pmc->old, pmc->new, pmc->old_addr,
+ pmc->new_addr, pmc->len_in);
/*
* If possible, realign addresses to PMD boundary for faster copy.
* Only realign if the mremap copying hits a PMD boundary.
*/
- if (len >= PMD_SIZE - (old_addr & ~PMD_MASK))
- try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK,
- for_stack);
+ try_realign_addr(pmc, PMD_MASK);
- flush_cache_range(vma, old_addr, old_end);
- mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
- old_addr, old_end);
+ flush_cache_range(pmc->old, pmc->old_addr, pmc->old_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, mm,
+ pmc->old_addr, pmc->old_end);
mmu_notifier_invalidate_range_start(&range);
- for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
+ for (; !pmc_done(pmc); pmc_next(pmc, extent)) {
cond_resched();
/*
* If extent is PUD-sized try to speed up the move by moving at the
* PUD level if possible.
*/
- extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
+ extent = get_extent(NORMAL_PUD, pmc);
- old_pud = get_old_pud(vma->vm_mm, old_addr);
+ old_pud = get_old_pud(mm, pmc->old_addr);
if (!old_pud)
continue;
- new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
+ new_pud = alloc_new_pud(mm, pmc->new_addr);
if (!new_pud)
break;
if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
if (extent == HPAGE_PUD_SIZE) {
- move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
- old_pud, new_pud, need_rmap_locks);
+ move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud);
/* We ignore and continue on error? */
continue;
}
} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
-
- if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
- old_pud, new_pud, true))
+ if (move_pgt_entry(pmc, NORMAL_PUD, old_pud, new_pud))
continue;
}
- extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
- old_pmd = get_old_pmd(vma->vm_mm, old_addr);
+ extent = get_extent(NORMAL_PMD, pmc);
+ old_pmd = get_old_pmd(mm, pmc->old_addr);
if (!old_pmd)
continue;
- new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
+ new_pmd = alloc_new_pmd(mm, pmc->new_addr);
if (!new_pmd)
break;
again:
if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
pmd_devmap(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE &&
- move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
- old_pmd, new_pmd, need_rmap_locks))
+ move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd))
continue;
- split_huge_pmd(vma, old_pmd, old_addr);
+ split_huge_pmd(pmc->old, old_pmd, pmc->old_addr);
} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
extent == PMD_SIZE) {
/*
* If the extent is PMD-sized, try to speed the move by
* moving at the PMD level if possible.
*/
- if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
- old_pmd, new_pmd, true))
+ if (move_pgt_entry(pmc, NORMAL_PMD, old_pmd, new_pmd))
continue;
}
if (pmd_none(*old_pmd))
continue;
- if (pte_alloc(new_vma->vm_mm, new_pmd))
+ if (pte_alloc(pmc->new->vm_mm, new_pmd))
break;
- if (move_ptes(vma, old_pmd, old_addr, old_addr + extent,
- new_vma, new_pmd, new_addr, need_rmap_locks) < 0)
+ if (move_ptes(pmc, extent, old_pmd, new_pmd) < 0)
goto again;
}
mmu_notifier_invalidate_range_end(&range);
+ return pmc_progress(pmc);
+}
+
+/* Set vrm->delta to the difference in VMA size specified by user. */
+static void vrm_set_delta(struct vma_remap_struct *vrm)
+{
+ vrm->delta = abs_diff(vrm->old_len, vrm->new_len);
+}
+
+/* Determine what kind of remap this is - shrink, expand or no resize at all. */
+static enum mremap_type vrm_remap_type(struct vma_remap_struct *vrm)
+{
+ if (vrm->delta == 0)
+ return MREMAP_NO_RESIZE;
+
+ if (vrm->old_len > vrm->new_len)
+ return MREMAP_SHRINK;
+
+ return MREMAP_EXPAND;
+}
+
+/*
+ * When moving a VMA to vrm->new_addr, does this result in the new and old VMAs
+ * overlapping?
+ */
+static bool vrm_overlaps(struct vma_remap_struct *vrm)
+{
+ unsigned long start_old = vrm->addr;
+ unsigned long start_new = vrm->new_addr;
+ unsigned long end_old = vrm->addr + vrm->old_len;
+ unsigned long end_new = vrm->new_addr + vrm->new_len;
+
/*
- * Prevent negative return values when {old,new}_addr was realigned
- * but we broke out of the above loop for the first PMD itself.
+ * start_old end_old
+ * |-----------|
+ * | |
+ * |-----------|
+ * |-------------|
+ * | |
+ * |-------------|
+ * start_new end_new
*/
- if (old_addr < old_end - len)
- return 0;
+ if (end_old > start_new && end_new > start_old)
+ return true;
- return len + old_addr - old_end; /* how much done */
+ return false;
}
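The test above is the usual half-open interval overlap check; as a stand-alone illustration with a couple of sanity asserts:

#include <assert.h>
#include <stdbool.h>

static bool ranges_overlap(unsigned long start_a, unsigned long end_a,
			   unsigned long start_b, unsigned long end_b)
{
	return end_a > start_b && end_b > start_a;
}

int main(void)
{
	assert(ranges_overlap(0x1000, 0x3000, 0x2000, 0x4000));	/* partial overlap */
	assert(!ranges_overlap(0x1000, 0x2000, 0x2000, 0x3000));	/* merely adjacent */
	return 0;
}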
-static unsigned long move_vma(struct vm_area_struct *vma,
- unsigned long old_addr, unsigned long old_len,
- unsigned long new_len, unsigned long new_addr,
- bool *locked, unsigned long flags,
- struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
+/* Do the mremap() flags require that the new_addr parameter be specified? */
+static bool vrm_implies_new_addr(struct vma_remap_struct *vrm)
{
- long to_account = new_len - old_len;
- struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *new_vma;
- unsigned long vm_flags = vma->vm_flags;
- unsigned long new_pgoff;
- unsigned long moved_len;
- unsigned long account_start = 0;
- unsigned long account_end = 0;
- unsigned long hiwater_vm;
- int err = 0;
- bool need_rmap_locks;
- struct vma_iterator vmi;
+ return vrm->flags & (MREMAP_FIXED | MREMAP_DONTUNMAP);
+}
+
+/*
+ * Find an unmapped area for the requested vrm->new_addr.
+ *
+ * If MREMAP_FIXED then this is equivalent to a MAP_FIXED mmap() call. If only
+ * MREMAP_DONTUNMAP is set, then this is equivalent to providing a hint to
+ * mmap(), otherwise this is equivalent to mmap() specifying a NULL address.
+ *
+ * Returns 0 on success (with vrm->new_addr updated), or an error code upon
+ * failure.
+ */
+static unsigned long vrm_set_new_addr(struct vma_remap_struct *vrm)
+{
+ struct vm_area_struct *vma = vrm->vma;
+ unsigned long map_flags = 0;
+ /* Page Offset _into_ the VMA. */
+ pgoff_t internal_pgoff = (vrm->addr - vma->vm_start) >> PAGE_SHIFT;
+ pgoff_t pgoff = vma->vm_pgoff + internal_pgoff;
+ unsigned long new_addr = vrm_implies_new_addr(vrm) ? vrm->new_addr : 0;
+ unsigned long res;
+
+ if (vrm->flags & MREMAP_FIXED)
+ map_flags |= MAP_FIXED;
+ if (vma->vm_flags & VM_MAYSHARE)
+ map_flags |= MAP_SHARED;
+
+ res = get_unmapped_area(vma->vm_file, new_addr, vrm->new_len, pgoff,
+ map_flags);
+ if (IS_ERR_VALUE(res))
+ return res;
+
+ vrm->new_addr = res;
+ return 0;
+}
+
+/*
+ * Keep track of pages which have been added to the memory mapping. If the VMA
+ * is accounted, also check to see if there is sufficient memory.
+ *
+ * Returns true on success, false if insufficient memory to charge.
+ */
+static bool vrm_charge(struct vma_remap_struct *vrm)
+{
+ unsigned long charged;
+
+ if (!(vrm->vma->vm_flags & VM_ACCOUNT))
+ return true;
+
+ /*
+ * If we don't unmap the old mapping, then we account the entirety of
+ * the length of the new one. Otherwise it's just the delta in size.
+ */
+ if (vrm->flags & MREMAP_DONTUNMAP)
+ charged = vrm->new_len >> PAGE_SHIFT;
+ else
+ charged = vrm->delta >> PAGE_SHIFT;
+
+ /* This accounts 'charged' pages of memory. */
+ if (security_vm_enough_memory_mm(current->mm, charged))
+ return false;
+
+ vrm->charged = charged;
+ return true;
+}
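A worked example of the two cases, assuming 4 KiB pages: growing an accounted 2 MiB mapping to 4 MiB charges only the 2 MiB delta, whereas MREMAP_DONTUNMAP keeps the old mapping and charges the full new length.

#include <stdbool.h>

/* 512 pages for a plain move/expand, 1024 pages with MREMAP_DONTUNMAP. */
static unsigned long example_charged_pages(bool dontunmap)
{
	unsigned long old_len = 2UL << 20, new_len = 4UL << 20;
	unsigned long bytes = dontunmap ? new_len : new_len - old_len;

	return bytes >> 12;
}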
+
+/*
+ * An error has occurred, so we will not be using vrm->charged memory. Unaccount
+ * this memory if the VMA is accounted.
+ */
+static void vrm_uncharge(struct vma_remap_struct *vrm)
+{
+ if (!(vrm->vma->vm_flags & VM_ACCOUNT))
+ return;
+
+ vm_unacct_memory(vrm->charged);
+ vrm->charged = 0;
+}
+
+/*
+ * Update mm exec_vm, stack_vm, data_vm, and locked_vm fields as needed to
+ * account for 'bytes' of memory used, and if locked, indicate this in the VRM so
+ * we can handle this correctly later.
+ */
+static void vrm_stat_account(struct vma_remap_struct *vrm,
+ unsigned long bytes)
+{
+ unsigned long pages = bytes >> PAGE_SHIFT;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = vrm->vma;
+
+ vm_stat_account(mm, vma->vm_flags, pages);
+ if (vma->vm_flags & VM_LOCKED) {
+ mm->locked_vm += pages;
+ vrm->mlocked = true;
+ }
+}
+
+/*
+ * Perform checks before attempting to write a VMA prior to it being
+ * moved.
+ */
+static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
+{
+ unsigned long err = 0;
+ struct vm_area_struct *vma = vrm->vma;
+ unsigned long old_addr = vrm->addr;
+ unsigned long old_len = vrm->old_len;
+ unsigned long dummy = vma->vm_flags;
/*
* We'd prefer to avoid failure later on in do_munmap:
* which may split one vma into three before unmapping.
*/
- if (mm->map_count >= sysctl_max_map_count - 3)
+ if (current->mm->map_count >= sysctl_max_map_count - 3)
return -ENOMEM;
- if (unlikely(flags & MREMAP_DONTUNMAP))
- to_account = new_len;
-
if (vma->vm_ops && vma->vm_ops->may_split) {
if (vma->vm_start != old_addr)
err = vma->vm_ops->may_split(vma, old_addr);
@@ -709,61 +1025,234 @@ static unsigned long move_vma(struct vm_area_struct *vma,
* so KSM can come around to merge on vma and new_vma afterwards.
*/
err = ksm_madvise(vma, old_addr, old_addr + old_len,
- MADV_UNMERGEABLE, &vm_flags);
+ MADV_UNMERGEABLE, &dummy);
if (err)
return err;
- if (vm_flags & VM_ACCOUNT) {
- if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
- return -ENOMEM;
+ return 0;
+}
+
+/*
+ * Unmap source VMA for VMA move, turning it from a copy to a move, being
+ * careful to ensure we do not underflow memory account while doing so if an
+ * accountable move.
+ *
+ * This is best effort, if we fail to unmap then we simply try to correct
+ * accounting and exit.
+ */
+static void unmap_source_vma(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr = vrm->addr;
+ unsigned long len = vrm->old_len;
+ struct vm_area_struct *vma = vrm->vma;
+ VMA_ITERATOR(vmi, mm, addr);
+ int err;
+ unsigned long vm_start;
+ unsigned long vm_end;
+ /*
+ * It might seem odd that we check for MREMAP_DONTUNMAP here, given this
+ * function implies that we unmap the original VMA, which seems
+ * contradictory.
+ *
+ * However, this occurs when this operation was attempted and an error
+ * arose, in which case we _do_ wish to unmap the _new_ VMA, which means
+ * we actually _do_ want it to be unaccounted.
+ */
+ bool accountable_move = (vma->vm_flags & VM_ACCOUNT) &&
+ !(vrm->flags & MREMAP_DONTUNMAP);
+
+ /*
+ * So we perform a trick here to prevent incorrect accounting. Any merge
+ * or new VMA allocation performed in copy_vma() does not adjust
+ * accounting; it is expected that callers handle this.
+ *
+ * And indeed we already have, having accounted appropriately for both
+ * cases in vrm_charge().
+ *
+ * However, when we unmap the existing VMA (to effect the move), this
+ * code will, if the VMA has VM_ACCOUNT set, attempt to unaccount
+ * removed pages.
+ *
+ * To avoid this we temporarily clear this flag, reinstating on any
+ * portions of the original VMA that remain.
+ */
+ if (accountable_move) {
+ vm_flags_clear(vma, VM_ACCOUNT);
+ /* We are about to split vma, so store the start/end. */
+ vm_start = vma->vm_start;
+ vm_end = vma->vm_end;
+ }
+
+ err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false);
+ vrm->vma = NULL; /* Invalidated. */
+ if (err) {
+ /* OOM: unable to split vma, just get accounts right */
+ vm_acct_memory(len >> PAGE_SHIFT);
+ return;
}
- vma_start_write(vma);
- new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
- new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
- &need_rmap_locks);
+ /*
+ * If we mremap() from a VMA like this:
+ *
+ * addr end
+ * | |
+ * v v
+ * |-------------|
+ * | |
+ * |-------------|
+ *
+ * Having cleared VM_ACCOUNT from the whole VMA, after we unmap above
+ * we'll end up with:
+ *
+ * addr end
+ * | |
+ * v v
+ * |---| |---|
+ * | A | | B |
+ * |---| |---|
+ *
+ * The VMI is still pointing at addr, so vma_prev() will give us A, and
+ * a subsequent or lone vma_next() will give us B.
+ *
+ * do_vmi_munmap() will have restored the VMI back to addr.
+ */
+ if (accountable_move) {
+ unsigned long end = addr + len;
+
+ if (vm_start < addr) {
+ struct vm_area_struct *prev = vma_prev(&vmi);
+
+ vm_flags_set(prev, VM_ACCOUNT); /* Acquires VMA lock. */
+ }
+
+ if (vm_end > end) {
+ struct vm_area_struct *next = vma_next(&vmi);
+
+ vm_flags_set(next, VM_ACCOUNT); /* Acquires VMA lock. */
+ }
+ }
+}
+
+/*
+ * Copy vrm->vma over to vrm->new_addr possibly adjusting size as part of the
+ * process. Additionally handle an error occurring on moving of page tables,
+ * where we reset vrm state to cause unmapping of the new VMA.
+ *
+ * Outputs the newly installed VMA to new_vma_ptr. Returns 0 on success or an
+ * error code.
+ */
+static int copy_vma_and_data(struct vma_remap_struct *vrm,
+ struct vm_area_struct **new_vma_ptr)
+{
+ unsigned long internal_offset = vrm->addr - vrm->vma->vm_start;
+ unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT;
+ unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff;
+ unsigned long moved_len;
+ struct vm_area_struct *vma = vrm->vma;
+ struct vm_area_struct *new_vma;
+ int err = 0;
+ PAGETABLE_MOVE(pmc, NULL, NULL, vrm->addr, vrm->new_addr, vrm->old_len);
+
+ new_vma = copy_vma(&vma, vrm->new_addr, vrm->new_len, new_pgoff,
+ &pmc.need_rmap_locks);
if (!new_vma) {
- if (vm_flags & VM_ACCOUNT)
- vm_unacct_memory(to_account >> PAGE_SHIFT);
+ vrm_uncharge(vrm);
+ *new_vma_ptr = NULL;
return -ENOMEM;
}
+ vrm->vma = vma;
+ pmc.old = vma;
+ pmc.new = new_vma;
- moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
- need_rmap_locks, false);
- if (moved_len < old_len) {
+ moved_len = move_page_tables(&pmc);
+ if (moved_len < vrm->old_len)
err = -ENOMEM;
- } else if (vma->vm_ops && vma->vm_ops->mremap) {
+ else if (vma->vm_ops && vma->vm_ops->mremap)
err = vma->vm_ops->mremap(new_vma);
- }
if (unlikely(err)) {
+ PAGETABLE_MOVE(pmc_revert, new_vma, vma, vrm->new_addr,
+ vrm->addr, moved_len);
+
/*
* On error, move entries back from new area to old,
* which will succeed since page tables still there,
* and then proceed to unmap new area instead of old.
*/
- move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
- true, false);
- vma = new_vma;
- old_len = new_len;
- old_addr = new_addr;
- new_addr = err;
+ pmc_revert.need_rmap_locks = true;
+ move_page_tables(&pmc_revert);
+
+ vrm->vma = new_vma;
+ vrm->old_len = vrm->new_len;
+ vrm->addr = vrm->new_addr;
} else {
- mremap_userfaultfd_prep(new_vma, uf);
+ mremap_userfaultfd_prep(new_vma, vrm->uf);
}
- if (is_vm_hugetlb_page(vma)) {
- clear_vma_resv_huge_pages(vma);
- }
+ fixup_hugetlb_reservations(vma);
- /* Conceal VM_ACCOUNT so old reservation is not undone */
- if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
- vm_flags_clear(vma, VM_ACCOUNT);
- if (vma->vm_start < old_addr)
- account_start = vma->vm_start;
- if (vma->vm_end > old_addr + old_len)
- account_end = vma->vm_end;
- }
+ *new_vma_ptr = new_vma;
+ return err;
+}
+
+/*
+ * Perform final tasks for the MREMAP_DONTUNMAP operation, clearing mlock() and
+ * account flags on the remaining VMA by convention (it cannot be mlock()'d any
+ * longer, as pages in range are no longer mapped), and removing anon_vma_chain
+ * links from it (if the entire VMA was copied over).
+ */
+static void dontunmap_complete(struct vma_remap_struct *vrm,
+ struct vm_area_struct *new_vma)
+{
+ unsigned long start = vrm->addr;
+ unsigned long end = vrm->addr + vrm->old_len;
+ unsigned long old_start = vrm->vma->vm_start;
+ unsigned long old_end = vrm->vma->vm_end;
+
+ /*
+ * We always clear VM_LOCKED[ONFAULT] | VM_ACCOUNT on the old
+ * vma.
+ */
+ vm_flags_clear(vrm->vma, VM_LOCKED_MASK | VM_ACCOUNT);
+
+ /*
+ * anon_vma links of the old vma are no longer needed after its page
+ * table has been moved.
+ */
+ if (new_vma != vrm->vma && start == old_start && end == old_end)
+ unlink_anon_vmas(vrm->vma);
+
+ /* Because we won't unmap we don't need to touch locked_vm. */
+}
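A minimal userspace sketch (not part of this patch) of the MREMAP_DONTUNMAP behaviour handled above: the pages migrate to the new range while the old range stays mapped, so the old range reads back as zeroes afterwards. It assumes a kernel of 5.7 or later and a libc that passes the fifth mremap() argument through for this flag; the fallback #define mirrors the uapi value:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MREMAP_DONTUNMAP
#define MREMAP_DONTUNMAP 4
#endif

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *old, *new;

	old = mmap(NULL, page, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (old == MAP_FAILED)
		return 1;
	old[0] = 'x';

	new = mremap(old, page, page, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, NULL);
	if (new == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	/* Expect: new[0]='x', old[0]=0 (old range refaults zero pages). */
	printf("new[0]=%c old[0]=%d\n", new[0], old[0]);
	return 0;
}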
+
+static unsigned long move_vma(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *new_vma;
+ unsigned long hiwater_vm;
+ int err;
+
+ err = prep_move_vma(vrm);
+ if (err)
+ return err;
+
+ /* If accounted, charge the number of bytes the operation will use. */
+ if (!vrm_charge(vrm))
+ return -ENOMEM;
+
+ /* We don't want racing faults. */
+ vma_start_write(vrm->vma);
+
+ /* Perform copy step. */
+ err = copy_vma_and_data(vrm, &new_vma);
+ /*
+ * If we established the copied-to VMA, we attempt to recover from the
+ * error by setting the destination VMA to the source VMA and unmapping
+ * it below.
+ */
+ if (err && !new_vma)
+ return err;
/*
* If we failed to move page tables we still do total_vm increment
@@ -775,73 +1264,31 @@ static unsigned long move_vma(struct vm_area_struct *vma,
* If this were a serious issue, we'd add a flag to do_munmap().
*/
hiwater_vm = mm->hiwater_vm;
- vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
-
- /* Tell pfnmap has moved from this vma */
- if (unlikely(vma->vm_flags & VM_PFNMAP))
- untrack_pfn_clear(vma);
-
- if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
- /* We always clear VM_LOCKED[ONFAULT] on the old vma */
- vm_flags_clear(vma, VM_LOCKED_MASK);
-
- /*
- * anon_vma links of the old vma is no longer needed after its page
- * table has been moved.
- */
- if (new_vma != vma && vma->vm_start == old_addr &&
- vma->vm_end == (old_addr + old_len))
- unlink_anon_vmas(vma);
-
- /* Because we won't unmap we don't need to touch locked_vm */
- return new_addr;
- }
-
- vma_iter_init(&vmi, mm, old_addr);
- if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
- /* OOM: unable to split vma, just get accounts right */
- if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
- vm_acct_memory(old_len >> PAGE_SHIFT);
- account_start = account_end = 0;
- }
- if (vm_flags & VM_LOCKED) {
- mm->locked_vm += new_len >> PAGE_SHIFT;
- *locked = true;
- }
+ vrm_stat_account(vrm, vrm->new_len);
+ if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP)))
+ dontunmap_complete(vrm, new_vma);
+ else
+ unmap_source_vma(vrm);
mm->hiwater_vm = hiwater_vm;
- /* Restore VM_ACCOUNT if one or two pieces of vma left */
- if (account_start) {
- vma = vma_prev(&vmi);
- vm_flags_set(vma, VM_ACCOUNT);
- }
-
- if (account_end) {
- vma = vma_next(&vmi);
- vm_flags_set(vma, VM_ACCOUNT);
- }
-
- return new_addr;
+ return err ? (unsigned long)err : vrm->new_addr;
}
/*
 * resize_is_valid() - Ensure the vma can be resized to the new length at the given
* address.
*
- * @vma: The vma to resize
- * @addr: The old address
- * @old_len: The current size
- * @new_len: The desired size
- * @flags: The vma flags
- *
* Return 0 on success, error otherwise.
*/
-static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr,
- unsigned long old_len, unsigned long new_len, unsigned long flags)
+static int resize_is_valid(struct vma_remap_struct *vrm)
{
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = vrm->vma;
+ unsigned long addr = vrm->addr;
+ unsigned long old_len = vrm->old_len;
+ unsigned long new_len = vrm->new_len;
unsigned long pgoff;
/*
@@ -853,11 +1300,12 @@ static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr,
* behavior. As a result, fail such attempts.
*/
if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
- pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
+ pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n",
+ current->comm, current->pid);
return -EINVAL;
}
- if ((flags & MREMAP_DONTUNMAP) &&
+ if ((vrm->flags & MREMAP_DONTUNMAP) &&
(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
return -EINVAL;
@@ -877,118 +1325,120 @@ static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr,
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
return -EFAULT;
- if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len))
+ if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta))
return -EAGAIN;
- if (!may_expand_vm(mm, vma->vm_flags,
- (new_len - old_len) >> PAGE_SHIFT))
+ if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT))
return -ENOMEM;
return 0;
}
/*
- * mremap_to() - remap a vma to a new location
- * @addr: The old address
- * @old_len: The old size
- * @new_addr: The target address
- * @new_len: The new size
- * @locked: If the returned vma is locked (VM_LOCKED)
- * @flags: the mremap flags
- * @uf: The mremap userfaultfd context
- * @uf_unmap_early: The userfaultfd unmap early context
- * @uf_unmap: The userfaultfd unmap context
+ * The user has requested that the VMA be shrunk (i.e., old_len > new_len), so
+ * execute this, optionally dropping the mmap lock when we do so.
*
- * Returns: The new address of the vma or an error.
+ * In both cases this invalidates the VMA; if we don't drop the lock, we then
+ * load the correct VMA into vrm->vma afterwards.
*/
-static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
- unsigned long new_addr, unsigned long new_len, bool *locked,
- unsigned long flags, struct vm_userfaultfd_ctx *uf,
- struct list_head *uf_unmap_early,
- struct list_head *uf_unmap)
+static unsigned long shrink_vma(struct vma_remap_struct *vrm,
+ bool drop_lock)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long ret;
- unsigned long map_flags = 0;
+ unsigned long unmap_start = vrm->addr + vrm->new_len;
+ unsigned long unmap_bytes = vrm->delta;
+ unsigned long res;
+ VMA_ITERATOR(vmi, mm, unmap_start);
- if (offset_in_page(new_addr))
- return -EINVAL;
+ VM_BUG_ON(vrm->remap_type != MREMAP_SHRINK);
- if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
- return -EINVAL;
-
- /* Ensure the old/new locations do not overlap */
- if (addr + old_len > new_addr && new_addr + new_len > addr)
- return -EINVAL;
+ res = do_vmi_munmap(&vmi, mm, unmap_start, unmap_bytes,
+ vrm->uf_unmap, drop_lock);
+ vrm->vma = NULL; /* Invalidated. */
+ if (res)
+ return res;
/*
- * move_vma() need us to stay 4 maps below the threshold, otherwise
- * it will bail out at the very beginning.
- * That is a problem if we have already unmaped the regions here
- * (new_addr, and old_addr), because userspace will not know the
- * state of the vma's after it gets -ENOMEM.
- * So, to avoid such scenario we can pre-compute if the whole
- * operation has high chances to success map-wise.
- * Worst-scenario case is when both vma's (new_addr and old_addr) get
- * split in 3 before unmapping it.
- * That means 2 more maps (1 for each) to the ones we already hold.
- * Check whether current map count plus 2 still leads us to 4 maps below
- * the threshold, otherwise return -ENOMEM here to be more safe.
+ * If we've not dropped the lock, then we should reload the VMA to
+ * replace the invalidated VMA with the one that may have now been
+ * split.
*/
- if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
- return -ENOMEM;
+ if (drop_lock) {
+ vrm->mmap_locked = false;
+ } else {
+ vrm->vma = vma_lookup(mm, vrm->addr);
+ if (!vrm->vma)
+ return -EFAULT;
+ }
+
+ return 0;
+}
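For reference, the shrink path as seen from userspace, in a short sketch (not part of this patch): with old_len > new_len and no flags, the tail is simply unmapped and the original address is returned unchanged:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p, *q;

	p = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	q = mremap(p, 4 * page, page, 0);	/* exercises the shrink path */
	printf("%p -> %p (%s)\n", (void *)p, (void *)q,
	       p == q ? "shrunk in place" : "unexpected");
	return 0;
}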
+
+/*
+ * mremap_to() - remap a vma to a new location.
+ * Returns: The new address of the vma or an error.
+ */
+static unsigned long mremap_to(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long err;
+
+ /* Is the new length or address silly? */
+ if (vrm->new_len > TASK_SIZE ||
+ vrm->new_addr > TASK_SIZE - vrm->new_len)
+ return -EINVAL;
+
+ if (vrm_overlaps(vrm))
+ return -EINVAL;
- if (flags & MREMAP_FIXED) {
+ if (vrm->flags & MREMAP_FIXED) {
/*
* In mremap_to().
* VMA is moved to dst address, and munmap dst first.
* do_munmap will check if dst is sealed.
*/
- ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
- if (ret)
- return ret;
- }
+ err = do_munmap(mm, vrm->new_addr, vrm->new_len,
+ vrm->uf_unmap_early);
+ vrm->vma = NULL; /* Invalidated. */
+ if (err)
+ return err;
- if (old_len > new_len) {
- ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
- if (ret)
- return ret;
- old_len = new_len;
+ /*
+ * If we remap a portion of a VMA elsewhere in the same VMA,
+ * this can invalidate the old VMA. Reset.
+ */
+ vrm->vma = vma_lookup(mm, vrm->addr);
+ if (!vrm->vma)
+ return -EFAULT;
}
- vma = vma_lookup(mm, addr);
- if (!vma)
- return -EFAULT;
-
- ret = resize_is_valid(vma, addr, old_len, new_len, flags);
- if (ret)
- return ret;
+ if (vrm->remap_type == MREMAP_SHRINK) {
+ err = shrink_vma(vrm, /* drop_lock= */false);
+ if (err)
+ return err;
- /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
- if (flags & MREMAP_DONTUNMAP &&
- !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
- return -ENOMEM;
+ /* Set up for the move now shrink has been executed. */
+ vrm->old_len = vrm->new_len;
}
- if (flags & MREMAP_FIXED)
- map_flags |= MAP_FIXED;
+ err = resize_is_valid(vrm);
+ if (err)
+ return err;
- if (vma->vm_flags & VM_MAYSHARE)
- map_flags |= MAP_SHARED;
+ /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
+ if (vrm->flags & MREMAP_DONTUNMAP) {
+ vm_flags_t vm_flags = vrm->vma->vm_flags;
+ unsigned long pages = vrm->old_len >> PAGE_SHIFT;
- ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
- ((addr - vma->vm_start) >> PAGE_SHIFT),
- map_flags);
- if (IS_ERR_VALUE(ret))
- return ret;
+ if (!may_expand_vm(mm, vm_flags, pages))
+ return -ENOMEM;
+ }
- /* We got a new mapping */
- if (!(flags & MREMAP_FIXED))
- new_addr = ret;
+ err = vrm_set_new_addr(vrm);
+ if (err)
+ return err;
- return move_vma(vma, addr, old_len, new_len, new_addr, locked, flags,
- uf, uf_unmap);
+ return move_vma(vrm);
}
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
@@ -1005,215 +1455,329 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
return 1;
}
+/* Determine whether we are actually able to execute an in-place expansion. */
+static bool vrm_can_expand_in_place(struct vma_remap_struct *vrm)
+{
+ /* Number of bytes from vrm->addr to end of VMA. */
+ unsigned long suffix_bytes = vrm->vma->vm_end - vrm->addr;
+
+ /* If end of range aligns to end of VMA, we can just expand in-place. */
+ if (suffix_bytes != vrm->old_len)
+ return false;
+
+ /* Check whether this is feasible. */
+ if (!vma_expandable(vrm->vma, vrm->delta))
+ return false;
+
+ return true;
+}
+
/*
- * Expand (or shrink) an existing mapping, potentially moving it at the
- * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
- *
- * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
- * This option implies MREMAP_MAYMOVE.
+ * Are the parameters passed to mremap() valid? If so, return 0; otherwise
+ * return an error.
*/
-SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
- unsigned long, new_len, unsigned long, flags,
- unsigned long, new_addr)
+static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
+
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long ret = -EINVAL;
- bool locked = false;
- struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
- LIST_HEAD(uf_unmap_early);
- LIST_HEAD(uf_unmap);
+ unsigned long addr = vrm->addr;
+ unsigned long flags = vrm->flags;
+
+ /* Ensure no unexpected flag values. */
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
+ return -EINVAL;
+
+ /* Start address must be page-aligned. */
+ if (offset_in_page(addr))
+ return -EINVAL;
/*
- * There is a deliberate asymmetry here: we strip the pointer tag
- * from the old address but leave the new address alone. This is
- * for consistency with mmap(), where we prevent the creation of
- * aliasing mappings in userspace by leaving the tag bits of the
- * mapping address intact. A non-zero tag will cause the subsequent
- * range checks to reject the address as invalid.
- *
- * See Documentation/arch/arm64/tagged-address-abi.rst for more
- * information.
+ * We allow a zero old-len as a special case
+ * for DOS-emu "duplicate shm area" thing. But
+ * a zero new-len is nonsensical.
*/
- addr = untagged_addr(addr);
+ if (!PAGE_ALIGN(vrm->new_len))
+ return -EINVAL;
- if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
- return ret;
+ /* Remainder of checks are for cases with specific new_addr. */
+ if (!vrm_implies_new_addr(vrm))
+ return 0;
- if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
- return ret;
+ /* The new address must be page-aligned. */
+ if (offset_in_page(vrm->new_addr))
+ return -EINVAL;
+
+ /* A fixed address implies a move. */
+ if (!(flags & MREMAP_MAYMOVE))
+ return -EINVAL;
+
+ /* MREMAP_DONTUNMAP does not allow resizing in the process. */
+ if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len)
+ return -EINVAL;
/*
- * MREMAP_DONTUNMAP is always a move and it does not allow resizing
- * in the process.
+ * move_vma() needs us to stay 4 maps below the threshold, otherwise
+ * it will bail out at the very beginning.
+ * That is a problem if we have already unmapped the regions here
+ * (new_addr, and old_addr), because userspace will not know the
+ * state of the VMAs after it gets -ENOMEM.
+ * So, to avoid such a scenario we can pre-compute whether the whole
+ * operation has a high chance of succeeding map-wise.
+ * The worst-case scenario is when both VMAs (new_addr and old_addr)
+ * get split in 3 before unmapping them.
+ * That means 2 more maps (1 for each) to the ones we already hold.
+ * Check whether current map count plus 2 still leads us to 4 maps below
+ * the threshold, otherwise return -ENOMEM here to be more safe.
*/
- if (flags & MREMAP_DONTUNMAP &&
- (!(flags & MREMAP_MAYMOVE) || old_len != new_len))
- return ret;
+ if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3)
+ return -ENOMEM;
+ return 0;
+}
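Two of the checks above, observed from userspace in a small sketch (not part of this patch): MREMAP_FIXED without MREMAP_MAYMOVE and a zero new length are both rejected with EINVAL before any VMA is looked up. The destination offset is arbitrary:

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	errno = 0;
	mremap(p, page, page, MREMAP_FIXED, p + 16 * page);
	printf("MREMAP_FIXED without MAYMOVE: errno=%d (EINVAL=%d)\n",
	       errno, EINVAL);

	errno = 0;
	mremap(p, page, 0, MREMAP_MAYMOVE);
	printf("zero new_len: errno=%d (EINVAL=%d)\n", errno, EINVAL);
	return 0;
}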
- if (offset_in_page(addr))
- return ret;
+/*
+ * We know we can expand the VMA in-place by delta pages, so do so.
+ *
+ * If we discover the VMA is locked, update mm_struct statistics accordingly and
+ * indicate so to the caller.
+ */
+static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = vrm->vma;
+ VMA_ITERATOR(vmi, mm, vma->vm_end);
- old_len = PAGE_ALIGN(old_len);
- new_len = PAGE_ALIGN(new_len);
+ if (!vrm_charge(vrm))
+ return -ENOMEM;
/*
- * We allow a zero old-len as a special case
- * for DOS-emu "duplicate shm area" thing. But
- * a zero new-len is nonsensical.
+ * Function vma_merge_extend() is called on the
+ * extension we are adding to the already existing vma;
+ * it will merge this extension with the
+ * already existing vma (the expand operation itself) and
+ * possibly also with the next vma if it becomes
+ * adjacent to the expanded vma and otherwise
+ * compatible.
*/
- if (!new_len)
- return ret;
-
- if (mmap_write_lock_killable(current->mm))
- return -EINTR;
- vma = vma_lookup(mm, addr);
+ vma = vma_merge_extend(&vmi, vma, vrm->delta);
if (!vma) {
- ret = -EFAULT;
- goto out;
+ vrm_uncharge(vrm);
+ return -ENOMEM;
}
+ vrm->vma = vma;
- /* Don't allow remapping vmas when they have already been sealed */
- if (!can_modify_vma(vma)) {
- ret = -EPERM;
- goto out;
- }
+ vrm_stat_account(vrm, vrm->delta);
- if (is_vm_hugetlb_page(vma)) {
- struct hstate *h __maybe_unused = hstate_vma(vma);
+ return 0;
+}
+
+static bool align_hugetlb(struct vma_remap_struct *vrm)
+{
+ struct hstate *h __maybe_unused = hstate_vma(vrm->vma);
+
+ vrm->old_len = ALIGN(vrm->old_len, huge_page_size(h));
+ vrm->new_len = ALIGN(vrm->new_len, huge_page_size(h));
+
+ /* addrs must be huge page aligned */
+ if (vrm->addr & ~huge_page_mask(h))
+ return false;
+ if (vrm->new_addr & ~huge_page_mask(h))
+ return false;
+
+ /*
+ * Don't allow remap expansion, because the underlying hugetlb
+ * reservation is not yet capable of handling split reservations.
+ */
+ if (vrm->new_len > vrm->old_len)
+ return false;
- old_len = ALIGN(old_len, huge_page_size(h));
- new_len = ALIGN(new_len, huge_page_size(h));
+ vrm_set_delta(vrm);
- /* addrs must be huge page aligned */
- if (addr & ~huge_page_mask(h))
- goto out;
- if (new_addr & ~huge_page_mask(h))
- goto out;
+ return true;
+}
+
+/*
+ * We are mremap()'ing without specifying a fixed address to move to, but are
+ * requesting that the VMA's size be increased.
+ *
+ * Try to do so in-place; if this fails, then move the VMA to a new location to
+ * effect the change.
+ */
+static unsigned long expand_vma(struct vma_remap_struct *vrm)
+{
+ unsigned long err;
+ unsigned long addr = vrm->addr;
+
+ err = resize_is_valid(vrm);
+ if (err)
+ return err;
+
+ /*
+ * [addr, addr + old_len) spans precisely to the end of the VMA, so try to
+ * expand it in-place.
+ */
+ if (vrm_can_expand_in_place(vrm)) {
+ err = expand_vma_in_place(vrm);
+ if (err)
+ return err;
/*
- * Don't allow remap expansion, because the underlying hugetlb
- * reservation is not yet capable to handle split reservation.
+ * We want to populate the newly expanded portion of the VMA to
+ * satisfy the expectation that mlock()'ing a VMA maintains all
+ * of its pages in memory.
*/
- if (new_len > old_len)
- goto out;
- }
+ if (vrm->mlocked)
+ vrm->new_addr = addr;
- if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
- ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked, flags, &uf, &uf_unmap_early,
- &uf_unmap);
- goto out;
+ /* OK we're done! */
+ return addr;
}
/*
- * Always allow a shrinking remap: that just unmaps
- * the unnecessary pages..
- * do_vmi_munmap does all the needed commit accounting, and
- * unlocks the mmap_lock if so directed.
+ * We weren't able to just expand or shrink the area, so
+ * we need to create a new one and move it.
*/
- if (old_len >= new_len) {
- VMA_ITERATOR(vmi, mm, addr + new_len);
- if (old_len == new_len) {
- ret = addr;
- goto out;
- }
+ /* We're not allowed to move the VMA, so error out. */
+ if (!(vrm->flags & MREMAP_MAYMOVE))
+ return -ENOMEM;
+
+ /* Find a new location to move the VMA to. */
+ err = vrm_set_new_addr(vrm);
+ if (err)
+ return err;
+
+ return move_vma(vrm);
+}
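A companion sketch (not part of this patch) for the expand path: the page directly after the mapping is deliberately split off with a different protection, so in-place expansion is impossible and MREMAP_MAYMOVE makes the kernel relocate the VMA instead:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p, *q;

	p = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;
	p[0] = 'A';

	/* Split off the second page so nothing can expand into it. */
	mprotect(p + page, page, PROT_NONE);

	q = mremap(p, page, 2 * page, MREMAP_MAYMOVE);
	if (q == MAP_FAILED)
		return 1;
	printf("%p -> %p (%s), q[0]=%c\n", (void *)p, (void *)q,
	       p == q ? "expanded in place" : "moved", q[0]);
	return 0;
}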
+
+/*
+ * Attempt to resize the VMA in-place, if we cannot, then move the VMA to the
+ * first available address to perform the operation.
+ */
+static unsigned long mremap_at(struct vma_remap_struct *vrm)
+{
+ unsigned long res;
- ret = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len,
- &uf_unmap, true);
- if (ret)
- goto out;
+ switch (vrm->remap_type) {
+ case MREMAP_INVALID:
+ break;
+ case MREMAP_NO_RESIZE:
+ /* NO-OP CASE - resizing to the same size. */
+ return vrm->addr;
+ case MREMAP_SHRINK:
+ /*
+ * SHRINK CASE. Can always be done in-place.
+ *
+ * Simply unmap the shrunken portion of the VMA. This does all
+ * the needed commit accounting, and we indicate that the mmap
+ * lock should be dropped.
+ */
+ res = shrink_vma(vrm, /* drop_lock= */true);
+ if (res)
+ return res;
- ret = addr;
- goto out_unlocked;
+ return vrm->addr;
+ case MREMAP_EXPAND:
+ return expand_vma(vrm);
}
- /*
- * Ok, we need to grow..
- */
- ret = resize_is_valid(vma, addr, old_len, new_len, flags);
+ BUG();
+}
+
+static unsigned long do_mremap(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long ret;
+
+ ret = check_mremap_params(vrm);
if (ret)
- goto out;
+ return ret;
- /* old_len exactly to the end of the area..
- */
- if (old_len == vma->vm_end - addr) {
- unsigned long delta = new_len - old_len;
-
- /* can we just expand the current mapping? */
- if (vma_expandable(vma, delta)) {
- long pages = delta >> PAGE_SHIFT;
- VMA_ITERATOR(vmi, mm, vma->vm_end);
- long charged = 0;
-
- if (vma->vm_flags & VM_ACCOUNT) {
- if (security_vm_enough_memory_mm(mm, pages)) {
- ret = -ENOMEM;
- goto out;
- }
- charged = pages;
- }
+ vrm->old_len = PAGE_ALIGN(vrm->old_len);
+ vrm->new_len = PAGE_ALIGN(vrm->new_len);
+ vrm_set_delta(vrm);
- /*
- * Function vma_merge_extend() is called on the
- * extension we are adding to the already existing vma,
- * vma_merge_extend() will merge this extension with the
- * already existing vma (expand operation itself) and
- * possibly also with the next vma if it becomes
- * adjacent to the expanded vma and otherwise
- * compatible.
- */
- vma = vma_merge_extend(&vmi, vma, delta);
- if (!vma) {
- vm_unacct_memory(charged);
- ret = -ENOMEM;
- goto out;
- }
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ vrm->mmap_locked = true;
- vm_stat_account(mm, vma->vm_flags, pages);
- if (vma->vm_flags & VM_LOCKED) {
- mm->locked_vm += pages;
- locked = true;
- new_addr = addr;
- }
- ret = addr;
- goto out;
- }
+ vma = vrm->vma = vma_lookup(mm, vrm->addr);
+ if (!vma) {
+ ret = -EFAULT;
+ goto out;
}
- /*
- * We weren't able to just expand or shrink the area,
- * we need to create a new one and move it..
- */
- ret = -ENOMEM;
- if (flags & MREMAP_MAYMOVE) {
- unsigned long map_flags = 0;
- if (vma->vm_flags & VM_MAYSHARE)
- map_flags |= MAP_SHARED;
-
- new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
- vma->vm_pgoff +
- ((addr - vma->vm_start) >> PAGE_SHIFT),
- map_flags);
- if (IS_ERR_VALUE(new_addr)) {
- ret = new_addr;
- goto out;
- }
+ /* If mseal()'d, mremap() is prohibited. */
+ if (!can_modify_vma(vma)) {
+ ret = -EPERM;
+ goto out;
+ }
- ret = move_vma(vma, addr, old_len, new_len, new_addr,
- &locked, flags, &uf, &uf_unmap);
+ /* Align to hugetlb page size, if required. */
+ if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm)) {
+ ret = -EINVAL;
+ goto out;
}
+
+ vrm->remap_type = vrm_remap_type(vrm);
+
+ /* Actually execute mremap. */
+ ret = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm);
+
out:
- if (offset_in_page(ret))
- locked = false;
- mmap_write_unlock(current->mm);
- if (locked && new_len > old_len)
- mm_populate(new_addr + old_len, new_len - old_len);
-out_unlocked:
- userfaultfd_unmap_complete(mm, &uf_unmap_early);
- mremap_userfaultfd_complete(&uf, addr, ret, old_len);
- userfaultfd_unmap_complete(mm, &uf_unmap);
+ if (vrm->mmap_locked) {
+ mmap_write_unlock(mm);
+ vrm->mmap_locked = false;
+
+ if (!offset_in_page(ret) && vrm->mlocked && vrm->new_len > vrm->old_len)
+ mm_populate(vrm->new_addr + vrm->old_len, vrm->delta);
+ }
+
+ userfaultfd_unmap_complete(mm, vrm->uf_unmap_early);
+ mremap_userfaultfd_complete(vrm->uf, vrm->addr, ret, vrm->old_len);
+ userfaultfd_unmap_complete(mm, vrm->uf_unmap);
+
return ret;
}
+
+/*
+ * Expand (or shrink) an existing mapping, potentially moving it at the
+ * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
+ *
+ * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
+ * This option implies MREMAP_MAYMOVE.
+ */
+SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
+ unsigned long, new_len, unsigned long, flags,
+ unsigned long, new_addr)
+{
+ struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
+ LIST_HEAD(uf_unmap_early);
+ LIST_HEAD(uf_unmap);
+ /*
+ * There is a deliberate asymmetry here: we strip the pointer tag
+ * from the old address but leave the new address alone. This is
+ * for consistency with mmap(), where we prevent the creation of
+ * aliasing mappings in userspace by leaving the tag bits of the
+ * mapping address intact. A non-zero tag will cause the subsequent
+ * range checks to reject the address as invalid.
+ *
+ * See Documentation/arch/arm64/tagged-address-abi.rst for more
+ * information.
+ */
+ struct vma_remap_struct vrm = {
+ .addr = untagged_addr(addr),
+ .old_len = old_len,
+ .new_len = new_len,
+ .flags = flags,
+ .new_addr = new_addr,
+
+ .uf = &uf,
+ .uf_unmap_early = &uf_unmap_early,
+ .uf_unmap = &uf_unmap,
+
+ .remap_type = MREMAP_INVALID, /* We set later. */
+ };
+
+ return do_mremap(&vrm);
+}
diff --git a/mm/mseal.c b/mm/mseal.c
index 81d6e980e8a9..c27197ac04e8 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -217,9 +217,9 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
unsigned long end;
struct mm_struct *mm = current->mm;
- ret = can_do_mseal(flags);
- if (ret)
- return ret;
+ /* Verify flags not set. */
+ if (flags)
+ return -EINVAL;
start = untagged_addr(start);
if (!PAGE_ALIGNED(start))
diff --git a/mm/nommu.c b/mm/nommu.c
index 9cb6e99215e2..b624acec6d2e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -42,18 +42,11 @@
#include <asm/mmu_context.h>
#include "internal.h"
-void *high_memory;
-EXPORT_SYMBOL(high_memory);
-struct page *mem_map;
-unsigned long max_mapnr;
-EXPORT_SYMBOL(max_mapnr);
unsigned long highest_memmap_pfn;
-int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated;
-EXPORT_SYMBOL(mem_map);
/* list of mapped, potentially shareable regions */
static struct kmem_cache *vm_region_jar;
@@ -207,7 +200,23 @@ void *vmalloc_noprof(unsigned long size)
}
EXPORT_SYMBOL(vmalloc_noprof);
-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof);
+/*
+ * vmalloc_huge_node - allocate virtually contiguous memory, on a node
+ *
+ * @size: allocation size
+ * @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * Due to NOMMU implications the node argument and HUGE page attribute are
+ * ignored.
+ */
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
+{
+ return __vmalloc_noprof(size, gfp_mask);
+}
/*
* vzalloc - allocate virtually contiguous memory with zero fill
@@ -392,8 +401,22 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
return mm->brk = brk;
}
+static int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
+
+static const struct ctl_table nommu_table[] = {
+ {
+ .procname = "nr_trim_pages",
+ .data = &sysctl_nr_trim_pages,
+ .maxlen = sizeof(sysctl_nr_trim_pages),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+};
+
/*
- * initialise the percpu counter for VM and region record slabs
+ * initialise the percpu counter for VM and region record slabs, and initialise
+ * VMA state.
*/
void __init mmap_init(void)
{
@@ -402,6 +425,8 @@ void __init mmap_init(void)
ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
+ register_sysctl_init("vm", nommu_table);
+ vma_state_init();
}
/*
@@ -620,22 +645,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
EXPORT_SYMBOL(find_vma);
/*
- * At least xtensa ends up having protection faults even with no
- * MMU.. No stack expansion, at least.
- */
-struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
- unsigned long addr, struct pt_regs *regs)
-{
- struct vm_area_struct *vma;
-
- mmap_read_lock(mm);
- vma = vma_lookup(mm, addr);
- if (!vma)
- mmap_read_unlock(mm);
- return vma;
-}
-
-/*
* expand a stack to a given address
* - not supported under NOMMU conditions
*/
@@ -1191,7 +1200,7 @@ share:
setup_vma_to_mm(vma, current->mm);
current->mm->map_count++;
/* add the VMA to the tree */
- vma_iter_store(&vmi, vma);
+ vma_iter_store_new(&vmi, vma);
/* we flush the region from the icache only when the first executable
* mapping of it is made */
@@ -1356,7 +1365,7 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
setup_vma_to_mm(vma, mm);
setup_vma_to_mm(new, mm);
- vma_iter_store(vmi, new);
+ vma_iter_store_new(vmi, new);
mm->map_count++;
return 0;
@@ -1701,6 +1710,85 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
}
EXPORT_SYMBOL_GPL(access_process_vm);
+#ifdef CONFIG_BPF_SYSCALL
+/*
+ * Copy a string from another process's address space as given in mm.
+ * If there is any error, return -EFAULT.
+ */
+static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len)
+{
+ unsigned long addr_end;
+ struct vm_area_struct *vma;
+ int ret = -EFAULT;
+
+ *(char *)buf = '\0';
+
+ if (mmap_read_lock_killable(mm))
+ return ret;
+
+ /* the access must start within one of the target process's mappings */
+ vma = find_vma(mm, addr);
+ if (!vma)
+ goto out;
+
+ if (check_add_overflow(addr, len, &addr_end))
+ goto out;
+
+ /* don't overrun this mapping */
+ if (addr_end > vma->vm_end)
+ len = vma->vm_end - addr;
+
+ /* only read mappings where it is permitted */
+ if (vma->vm_flags & VM_MAYREAD) {
+ ret = strscpy(buf, (char *)addr, len);
+ if (ret < 0)
+ ret = len - 1;
+ }
+
+out:
+ mmap_read_unlock(mm);
+ return ret;
+}
+
+/**
+ * copy_remote_vm_str - copy a string from another process's address space.
+ * @tsk: the task of the target address space
+ * @addr: start address to read from
+ * @buf: destination buffer
+ * @len: number of bytes to copy
+ * @gup_flags: flags modifying lookup behaviour (unused)
+ *
+ * The caller must hold a reference on @mm.
+ *
+ * Return: number of bytes copied from @addr (source) to @buf (destination);
+ * not including the trailing NUL. The destination buffer is always guaranteed
+ * to be NUL-terminated. On any error, return -EFAULT.
+ */
+int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ if (unlikely(len == 0))
+ return 0;
+
+ mm = get_task_mm(tsk);
+ if (!mm) {
+ *(char *)buf = '\0';
+ return -EFAULT;
+ }
+
+ ret = __copy_remote_vm_str(mm, addr, buf, len);
+
+ mmput(mm);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(copy_remote_vm_str);
+#endif /* CONFIG_BPF_SYSCALL */
+
/**
* nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
* @inode: The inode to check
@@ -1804,3 +1892,11 @@ static int __meminit init_admin_reserve(void)
return 0;
}
subsys_initcall(init_admin_reserve);
+
+int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ mmap_write_lock(oldmm);
+ dup_mm_exe_file(mm, oldmm);
+ mmap_write_unlock(oldmm);
+ return 0;
+}
diff --git a/mm/numa.c b/mm/numa.c
index e2eec07707d1..7d5e06fe5bd4 100644
--- a/mm/numa.c
+++ b/mm/numa.c
@@ -13,7 +13,6 @@ void __init alloc_node_data(int nid)
{
const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
u64 nd_pa;
- void *nd;
int tnid;
/* Allocate node data. Try node-local memory and then any node. */
@@ -21,7 +20,6 @@ void __init alloc_node_data(int nid)
if (!nd_pa)
panic("Cannot allocate %zu bytes for node %d data\n",
nd_size, nid);
- nd = __va(nd_pa);
/* report and initialize */
pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
@@ -30,20 +28,14 @@ void __init alloc_node_data(int nid)
if (tnid != nid)
pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid);
- node_data[nid] = nd;
+ node_data[nid] = __va(nd_pa);
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
}
void __init alloc_offline_node_data(int nid)
{
pg_data_t *pgdat;
-
- pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES);
- if (!pgdat)
- panic("Cannot allocate %zuB for node %d.\n",
- sizeof(*pgdat), nid);
-
- node_data[nid] = pgdat;
+ node_data[nid] = memblock_alloc_or_panic(sizeof(*pgdat), SMP_CACHE_BYTES);
}
/* Stub functions: */
diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c
index 031fb9961bf7..9d55679d99ce 100644
--- a/mm/numa_emulation.c
+++ b/mm/numa_emulation.c
@@ -8,11 +8,12 @@
#include <linux/memblock.h>
#include <linux/numa_memblks.h>
#include <asm/numa.h>
+#include <acpi/acpi_numa.h>
#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
-static int emu_nid_to_phys[MAX_NUMNODES];
+int emu_nid_to_phys[MAX_NUMNODES];
static char *emu_cmdline __initdata;
int __init numa_emu_cmdline(char *str)
@@ -379,6 +380,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
int max_emu_nid, dfl_phys_nid;
int i, j, ret;
+ nodemask_t physnode_mask = numa_nodes_parsed;
if (!emu_cmdline)
goto no_emu;
@@ -395,7 +397,6 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
* split the system RAM into N fake nodes.
*/
if (strchr(emu_cmdline, 'U')) {
- nodemask_t physnode_mask = numa_nodes_parsed;
unsigned long n;
int nid = 0;
@@ -465,9 +466,6 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
*/
max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);
- /* commit */
- *numa_meminfo = ei;
-
/* Make sure numa_nodes_parsed only contains emulated nodes */
nodes_clear(numa_nodes_parsed);
for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
@@ -475,10 +473,21 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
ei.blk[i].nid != NUMA_NO_NODE)
node_set(ei.blk[i].nid, numa_nodes_parsed);
- numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys));
+ /* fix pxm_to_node_map[] and node_to_pxm_map[] to avoid collision
+ * with faked numa nodes, particularly during later memory hotplug
+ * handling, and also update numa_nodes_parsed accordingly.
+ */
+ ret = fix_pxm_node_maps(max_emu_nid);
+ if (ret < 0)
+ goto no_emu;
+
+ /* commit */
+ *numa_meminfo = ei;
+
+ numa_emu_update_cpu_to_node(emu_nid_to_phys, max_emu_nid + 1);
/* make sure all emulated nodes are mapped to a physical node */
- for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+ for (i = 0; i < max_emu_nid + 1; i++)
if (emu_nid_to_phys[i] == NUMA_NO_NODE)
emu_nid_to_phys[i] = dfl_phys_nid;
@@ -501,12 +510,34 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
numa_set_distance(i, j, dist);
}
}
+ for (i = 0; i < numa_distance_cnt; i++) {
+ for (j = 0; j < numa_distance_cnt; j++) {
+ int physi, physj;
+ u8 dist;
+
+ /* distance between fake nodes is already ok */
+ if (emu_nid_to_phys[i] != NUMA_NO_NODE &&
+ emu_nid_to_phys[j] != NUMA_NO_NODE)
+ continue;
+ if (emu_nid_to_phys[i] != NUMA_NO_NODE)
+ physi = emu_nid_to_phys[i];
+ else
+ physi = i - max_emu_nid;
+ if (emu_nid_to_phys[j] != NUMA_NO_NODE)
+ physj = emu_nid_to_phys[j];
+ else
+ physj = j - max_emu_nid;
+ dist = phys_dist[physi * numa_dist_cnt + physj];
+ numa_set_distance(i, j, dist);
+ }
+ }
/* free the copied physical distance table */
memblock_free(phys_dist, phys_size);
return;
no_emu:
+ numa_nodes_parsed = physnode_mask;
/* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
emu_nid_to_phys[i] = i;
diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c
index a3877e9bc878..541a99c4071a 100644
--- a/mm/numa_memblks.c
+++ b/mm/numa_memblks.c
@@ -7,7 +7,7 @@
#include <linux/numa.h>
#include <linux/numa_memblks.h>
-static int numa_distance_cnt;
+int numa_distance_cnt;
static u8 *numa_distance;
nodemask_t numa_nodes_parsed __initdata;
@@ -201,6 +201,28 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
}
/**
+ * numa_add_reserved_memblk - Add one numa_memblk to numa_reserved_meminfo
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end: End address of the new memblk
+ *
+ * Add a new memblk to the numa_reserved_meminfo.
+ *
+ * Usage Case: numa_cleanup_meminfo() reconciles all numa_memblk instances
+ * against memblock_type information and moves any that intersect reserved
+ * ranges to numa_reserved_meminfo. However, when that information is known
+ * ahead of time, we use numa_add_reserved_memblk() to add the numa_memblk
+ * to numa_reserved_meminfo directly.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_add_reserved_memblk(int nid, u64 start, u64 end)
+{
+ return numa_add_memblk_to(nid, start, end, &numa_reserved_meminfo);
+}
+
+/**
* numa_cleanup_meminfo - Cleanup a numa_meminfo
* @mi: numa_meminfo to clean up
*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1c485beb0b93..25923cfec9c6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,6 +44,7 @@
#include <linux/init.h>
#include <linux/mmu_notifier.h>
#include <linux/cred.h>
+#include <linux/nmi.h>
#include <asm/tlb.h>
#include "internal.h"
@@ -430,10 +431,15 @@ static void dump_tasks(struct oom_control *oc)
mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
else {
struct task_struct *p;
+ int i = 0;
rcu_read_lock();
- for_each_process(p)
+ for_each_process(p) {
+ /* Avoid potential softlockup warning */
+ if ((++i & 1023) == 0)
+ touch_softlockup_watchdog();
dump_task(p, oc);
+ }
rcu_read_unlock();
}
}
@@ -557,7 +563,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm)
}
/*
- * Reaps the address space of the give task.
+ * Reaps the address space of the given task.
*
* Returns true on success and false if none or part of the address space
* has been reclaimed and the caller should retry later.
@@ -699,7 +705,7 @@ static void queue_oom_reaper(struct task_struct *tsk)
}
#ifdef CONFIG_SYSCTL
-static struct ctl_table vm_oom_kill_table[] = {
+static const struct ctl_table vm_oom_kill_table[] = {
{
.procname = "panic_on_oom",
.data = &sysctl_panic_on_oom,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d213ead95675..72b0ff0d4bae 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -41,6 +41,7 @@
#include <trace/events/writeback.h>
#include "internal.h"
+#include "swap.h"
/*
* Sleep at most 200ms at a time in balance_dirty_pages().
@@ -120,29 +121,6 @@ EXPORT_SYMBOL(laptop_mode);
struct wb_domain global_wb_domain;
-/* consolidated parameters for balance_dirty_pages() and its subroutines */
-struct dirty_throttle_control {
-#ifdef CONFIG_CGROUP_WRITEBACK
- struct wb_domain *dom;
- struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */
-#endif
- struct bdi_writeback *wb;
- struct fprop_local_percpu *wb_completions;
-
- unsigned long avail; /* dirtyable */
- unsigned long dirty; /* file_dirty + write + nfs */
- unsigned long thresh; /* dirty threshold */
- unsigned long bg_thresh; /* dirty background threshold */
-
- unsigned long wb_dirty; /* per-wb counterparts */
- unsigned long wb_thresh;
- unsigned long wb_bg_thresh;
-
- unsigned long pos_ratio;
- bool freerun;
- bool dirty_exceeded;
-};
-
/*
* Length of period for aging writeout fractions of bdis. This is an
* arbitrarily chosen number. The longer the period, the slower fractions will
@@ -543,8 +521,8 @@ static int dirty_ratio_handler(const struct ctl_table *table, int write, void *b
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- writeback_set_ratelimit();
vm_dirty_bytes = 0;
+ writeback_set_ratelimit();
}
return ret;
}
@@ -630,7 +608,7 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc);
*/
static void writeout_period(struct timer_list *t)
{
- struct wb_domain *dom = from_timer(dom, t, period_timer);
+ struct wb_domain *dom = timer_container_of(dom, t, period_timer);
int miss_periods = (jiffies - dom->period_time) /
VM_COMPLETIONS_PERIOD_LEN;
@@ -663,7 +641,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom)
{
- del_timer_sync(&dom->period_timer);
+ timer_delete_sync(&dom->period_timer);
fprop_global_destroy(&dom->completions);
}
#endif
@@ -692,6 +670,8 @@ static unsigned long bdi_ratio_from_pages(unsigned long pages)
unsigned long ratio;
global_dirty_limits(&background_thresh, &dirty_thresh);
+ if (!dirty_thresh)
+ return -EINVAL;
ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh);
return ratio;
@@ -790,13 +770,15 @@ int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes)
{
int ret;
unsigned long pages = min_bytes >> PAGE_SHIFT;
- unsigned long min_ratio;
+ long min_ratio;
ret = bdi_check_pages_limit(pages);
if (ret)
return ret;
min_ratio = bdi_ratio_from_pages(pages);
+ if (min_ratio < 0)
+ return min_ratio;
return __bdi_set_min_ratio(bdi, min_ratio);
}
@@ -809,13 +791,15 @@ int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes)
{
int ret;
unsigned long pages = max_bytes >> PAGE_SHIFT;
- unsigned long max_ratio;
+ long max_ratio;
ret = bdi_check_pages_limit(pages);
if (ret)
return ret;
max_ratio = bdi_ratio_from_pages(pages);
+ if (max_ratio < 0)
+ return max_ratio;
return __bdi_set_max_ratio(bdi, max_ratio);
}
@@ -936,26 +920,25 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc,
wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio);
wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE);
- wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
- if (wb_thresh > wb_max_thresh)
- wb_thresh = wb_max_thresh;
/*
- * With strictlimit flag, the wb_thresh is treated as
- * a hard limit in balance_dirty_pages() and wb_position_ratio().
- * It's possible that wb_thresh is close to zero, not because
- * the device is slow, but because it has been inactive.
- * To prevent occasional writes from being blocked, we raise wb_thresh.
+ * It's very possible that wb_thresh is close to 0 not because the
+ * device is slow, but because it has remained inactive for a long time.
+ * Honour such devices with a reasonably good (hopefully IO-efficient)
+ * threshold, so that occasional writes won't be blocked and active
+ * writes can ramp up the threshold quickly.
*/
- if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
- unsigned long limit = hard_dirty_limit(dom, dtc->thresh);
- u64 wb_scale_thresh = 0;
-
- if (limit > dtc->dirty)
- wb_scale_thresh = (limit - dtc->dirty) / 100;
- wb_thresh = max(wb_thresh, min(wb_scale_thresh, wb_max_thresh / 4));
+ if (thresh > dtc->dirty) {
+ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT))
+ wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 100);
+ else
+ wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 8);
}
+ wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
+ if (wb_thresh > wb_max_thresh)
+ wb_thresh = wb_max_thresh;
+
return wb_thresh;
}
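A standalone arithmetic sketch (not part of this patch) of the reordered logic above, using made-up numbers: the idle-device bump is applied first and the per-wb max_ratio cap last, so even a heavily bumped wb_thresh can never exceed the cap. The scale constant is assumed to mirror the one used by this file but is redefined locally:

#include <stdio.h>

#define BDI_RATIO_SCALE 10000UL		/* assumed to mirror the kernel value */

int main(void)
{
	unsigned long thresh = 100000, dirty = 20000;	/* pages, made up */
	unsigned long wb_thresh = 10;			/* long-idle wb */
	unsigned long wb_max_ratio = 1 * BDI_RATIO_SCALE;	/* 1% cap */
	unsigned long wb_max_thresh;

	/* Bump an inactive wb towards a useful threshold first... */
	if (thresh > dirty && wb_thresh < (thresh - dirty) / 8)
		wb_thresh = (thresh - dirty) / 8;		/* -> 10000 */

	/* ...then apply the max_ratio cap, which always wins. */
	wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
	if (wb_thresh > wb_max_thresh)
		wb_thresh = wb_max_thresh;			/* -> 1000 */

	printf("wb_thresh=%lu (cap %lu)\n", wb_thresh, wb_max_thresh);
	return 0;
}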
@@ -963,6 +946,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{
struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
+ domain_dirty_avail(&gdtc, true);
return __wb_calc_thresh(&gdtc, thresh);
}
@@ -1089,7 +1073,7 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
struct bdi_writeback *wb = dtc->wb;
unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
- unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
+ unsigned long limit = dtc->limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
unsigned long wb_thresh = dtc->wb_thresh;
unsigned long x_intercept;
unsigned long setpoint; /* dirty pages' target balance point */
@@ -1139,12 +1123,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
long long wb_pos_ratio;
- if (dtc->wb_dirty < 8) {
- dtc->pos_ratio = min_t(long long, pos_ratio * 2,
- 2 << RATELIMIT_CALC_SHIFT);
- return;
- }
-
if (dtc->wb_dirty >= wb_thresh)
return;
@@ -1216,14 +1194,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
if (unlikely(wb_thresh > dtc->thresh))
wb_thresh = dtc->thresh;
/*
- * It's very possible that wb_thresh is close to 0 not because the
- * device is slow, but that it has remained inactive for long time.
- * Honour such devices a reasonable good (hopefully IO efficient)
- * threshold, so that the occasional writes won't be blocked and active
- * writes can rampup the threshold quickly.
- */
- wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
- /*
* scale global setpoint to wb's:
* wb_setpoint = setpoint * wb_thresh / thresh
*/
@@ -1478,17 +1448,10 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
* balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
* Hence, to calculate "step" properly, we have to use wb_dirty as
* "dirty" and wb_setpoint as "setpoint".
- *
- * We rampup dirty_ratelimit forcibly if wb_dirty is low because
- * it's possible that wb_thresh is close to zero due to inactivity
- * of backing device.
*/
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
dirty = dtc->wb_dirty;
- if (dtc->wb_dirty < 8)
- setpoint = dtc->wb_dirty + 1;
- else
- setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
+ setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
}
if (dirty < setpoint) {
@@ -1977,11 +1940,7 @@ free_running:
*/
if (pause < min_pause) {
trace_balance_dirty_pages(wb,
- sdtc->thresh,
- sdtc->bg_thresh,
- sdtc->dirty,
- sdtc->wb_thresh,
- sdtc->wb_dirty,
+ sdtc,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
@@ -2006,11 +1965,7 @@ free_running:
pause:
trace_balance_dirty_pages(wb,
- sdtc->thresh,
- sdtc->bg_thresh,
- sdtc->dirty,
- sdtc->wb_thresh,
- sdtc->wb_dirty,
+ sdtc,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
@@ -2248,7 +2203,7 @@ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int
void laptop_mode_timer_fn(struct timer_list *t)
{
struct backing_dev_info *backing_dev_info =
- from_timer(backing_dev_info, t, laptop_mode_wb_timer);
+ timer_container_of(backing_dev_info, t, laptop_mode_wb_timer);
wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
}
@@ -2275,7 +2230,7 @@ void laptop_sync_completion(void)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
- del_timer(&bdi->laptop_mode_wb_timer);
+ timer_delete(&bdi->laptop_mode_wb_timer);
rcu_read_unlock();
}
@@ -2313,7 +2268,7 @@ static int page_writeback_cpu_online(unsigned int cpu)
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
-static struct ctl_table vm_page_writeback_sysctls[] = {
+static const struct ctl_table vm_page_writeback_sysctls[] = {
{
.procname = "dirty_background_ratio",
.data = &dirty_background_ratio,
@@ -2610,11 +2565,11 @@ struct folio *writeback_iter(struct address_space *mapping,
if (!folio) {
/*
* To avoid deadlocks between range_cyclic writeback and callers
- * that hold pages in PageWriteback to aggregate I/O until
+ * that hold folios in writeback to aggregate I/O until
* the writeback iteration finishes, we do not loop back to the
- * start of the file. Doing so causes a page lock/page
+ * start of the file. Doing so causes a folio lock/folio
* writeback access order inversion - we should only ever lock
- * multiple pages in ascending page->index order, and looping
+ * multiple folios in ascending folio->index order, and looping
* back to the start of the file violates that rule and causes
* deadlocks.
*/
@@ -2667,27 +2622,6 @@ int write_cache_pages(struct address_space *mapping,
}
EXPORT_SYMBOL(write_cache_pages);
-static int writeback_use_writepage(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct folio *folio = NULL;
- struct blk_plug plug;
- int err;
-
- blk_start_plug(&plug);
- while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
- err = mapping->a_ops->writepage(&folio->page, wbc);
- if (err == AOP_WRITEPAGE_ACTIVATE) {
- folio_unlock(folio);
- err = 0;
- }
- mapping_set_error(mapping, err);
- }
- blk_finish_plug(&plug);
-
- return err;
-}
-
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
@@ -2698,14 +2632,11 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
wb = inode_to_wb_wbc(mapping->host, wbc);
wb_bandwidth_estimate_start(wb);
while (1) {
- if (mapping->a_ops->writepages) {
+ if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
- } else if (mapping->a_ops->writepage) {
- ret = writeback_use_writepage(mapping, wbc);
- } else {
+ else
/* deal with chardevs and other special files */
ret = 0;
- }
if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
break;
@@ -3124,6 +3055,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
int access_ret;
VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
XA_STATE(xas, &mapping->i_pages, folio_index(folio));
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1cb4b8c8886d..2ef3c07266b3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -88,6 +88,9 @@ typedef int __bitwise fpi_t;
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+/* Free the page without taking locks. Rely on trylock only. */
+#define FPI_TRYLOCK ((__force fpi_t)BIT(2))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@ -273,6 +276,7 @@ int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
static int watermark_boost_factor __read_mostly = 15000;
static int watermark_scale_factor = 10;
+int defrag_mode;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -286,7 +290,8 @@ EXPORT_SYMBOL(nr_online_nodes);
#endif
static bool page_contains_unaccepted(struct page *page, unsigned int order);
-static bool cond_accept_memory(struct zone *zone, unsigned int order);
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags);
static bool __free_unaccepted(struct page *page);
int page_group_by_mobility_disabled __read_mostly;
@@ -508,9 +513,9 @@ out:
static inline unsigned int order_to_pindex(int migratetype, int order)
{
- bool __maybe_unused movable;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bool movable;
if (order > PAGE_ALLOC_COSTLY_ORDER) {
VM_BUG_ON(order != HPAGE_PMD_ORDER);
@@ -614,6 +619,10 @@ compaction_capture(struct capture_control *capc, struct page *page,
capc->cc->migratetype != MIGRATE_MOVABLE)
return false;
+ if (migratetype != capc->cc->migratetype)
+ trace_mm_page_alloc_extfrag(page, capc->cc->order, order,
+ capc->cc->migratetype, migratetype);
+
capc->page = page;
return true;
}
@@ -655,16 +664,20 @@ static inline void __add_to_free_list(struct page *page, struct zone *zone,
bool tail)
{
struct free_area *area = &zone->free_area[order];
+ int nr_pages = 1 << order;
VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
"page type is %lu, passed migratetype is %d (nr=%d)\n",
- get_pageblock_migratetype(page), migratetype, 1 << order);
+ get_pageblock_migratetype(page), migratetype, nr_pages);
if (tail)
list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
else
list_add(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
+
+ if (order >= pageblock_order && !is_migrate_isolate(migratetype))
+ __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
}
/*
@@ -676,24 +689,34 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int old_mt, int new_mt)
{
struct free_area *area = &zone->free_area[order];
+ int nr_pages = 1 << order;
/* Free page moving can fail, so it happens before the type update */
VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt,
"page type is %lu, passed migratetype is %d (nr=%d)\n",
- get_pageblock_migratetype(page), old_mt, 1 << order);
+ get_pageblock_migratetype(page), old_mt, nr_pages);
list_move_tail(&page->buddy_list, &area->free_list[new_mt]);
- account_freepages(zone, -(1 << order), old_mt);
- account_freepages(zone, 1 << order, new_mt);
+ account_freepages(zone, -nr_pages, old_mt);
+ account_freepages(zone, nr_pages, new_mt);
+
+ if (order >= pageblock_order &&
+ is_migrate_isolate(old_mt) != is_migrate_isolate(new_mt)) {
+ if (!is_migrate_isolate(old_mt))
+ nr_pages = -nr_pages;
+ __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
+ }
}
static inline void __del_page_from_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
+ int nr_pages = 1 << order;
+
VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
"page type is %lu, passed migratetype is %d (nr=%d)\n",
- get_pageblock_migratetype(page), migratetype, 1 << order);
+ get_pageblock_migratetype(page), migratetype, nr_pages);
/* clear reported state and update reported page count */
if (page_reported(page))
@@ -703,6 +726,9 @@ static inline void __del_page_from_free_list(struct page *page, struct zone *zon
__ClearPageBuddy(page);
set_page_private(page, 0);
zone->free_area[order].nr_free--;
+
+ if (order >= pageblock_order && !is_migrate_isolate(migratetype))
+ __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, -nr_pages);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -872,9 +898,7 @@ static inline bool page_expected_state(struct page *page,
#ifdef CONFIG_MEMCG
page->memcg_data |
#endif
-#ifdef CONFIG_PAGE_POOL
- ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) |
-#endif
+ page_pool_page_is_pp(page) |
(page->flags & check_flags)))
return false;
@@ -901,26 +925,18 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
-#ifdef CONFIG_PAGE_POOL
- if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE))
+ if (unlikely(page_pool_page_is_pp(page)))
bad_reason = "page_pool leak";
-#endif
return bad_reason;
}
-static void free_page_is_bad_report(struct page *page)
-{
- bad_page(page,
- page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
-}
-
static inline bool free_page_is_bad(struct page *page)
{
if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
return false;
/* Something has gone sideways, find it */
- free_page_is_bad_report(page);
+ bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
return true;
}
@@ -947,21 +963,34 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
switch (page - head_page) {
case 1:
/* the first tail page: these may be in place of ->mapping */
- if (unlikely(folio_entire_mapcount(folio))) {
- bad_page(page, "nonzero entire_mapcount");
- goto out;
- }
if (unlikely(folio_large_mapcount(folio))) {
bad_page(page, "nonzero large_mapcount");
goto out;
}
- if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT) &&
+ unlikely(atomic_read(&folio->_nr_pages_mapped))) {
bad_page(page, "nonzero nr_pages_mapped");
goto out;
}
- if (unlikely(atomic_read(&folio->_pincount))) {
- bad_page(page, "nonzero pincount");
- goto out;
+ if (IS_ENABLED(CONFIG_MM_ID)) {
+ if (unlikely(folio->_mm_id_mapcount[0] != -1)) {
+ bad_page(page, "nonzero mm mapcount 0");
+ goto out;
+ }
+ if (unlikely(folio->_mm_id_mapcount[1] != -1)) {
+ bad_page(page, "nonzero mm mapcount 1");
+ goto out;
+ }
+ }
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
+ bad_page(page, "nonzero entire_mapcount");
+ goto out;
+ }
+ if (unlikely(atomic_read(&folio->_pincount))) {
+ bad_page(page, "nonzero pincount");
+ goto out;
+ }
}
break;
case 2:
@@ -970,7 +999,22 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
bad_page(page, "on deferred list");
goto out;
}
+ if (!IS_ENABLED(CONFIG_64BIT)) {
+ if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
+ bad_page(page, "nonzero entire_mapcount");
+ goto out;
+ }
+ if (unlikely(atomic_read(&folio->_pincount))) {
+ bad_page(page, "nonzero pincount");
+ goto out;
+ }
+ }
break;
+ case 3:
+ /* the third tail page: hugetlb specifics overlap ->mappings */
+ if (IS_ENABLED(CONFIG_HUGETLB_PAGE))
+ break;
+ fallthrough;
default:
if (page->mapping != TAIL_MAPPING) {
bad_page(page, "corrupted mapping in tail page");
@@ -1041,6 +1085,79 @@ static void kernel_init_pages(struct page *page, int numpages)
kasan_enable_current();
}
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+void __clear_page_tag_ref(struct page *page)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ set_codetag_empty(&ref);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline
+void __pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr)
+{
+ if (mem_alloc_profiling_enabled())
+ __pgalloc_tag_add(page, task, nr);
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline
+void __pgalloc_tag_sub(struct page *page, unsigned int nr)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_sub(&ref, PAGE_SIZE * nr);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
+{
+ if (mem_alloc_profiling_enabled())
+ __pgalloc_tag_sub(page, nr);
+}
+
+/* When tag is not NULL, assuming mem_alloc_profiling_enabled */
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
+{
+ if (tag)
+ this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
+}
+
+#else /* CONFIG_MEM_ALLOC_PROFILING */
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr) {}
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
+
+#endif /* CONFIG_MEM_ALLOC_PROFILING */
+
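The tagging helpers above follow a common fast-path pattern: a small inline wrapper checks mem_alloc_profiling_enabled() and only then calls a noinline worker, so the disabled case stays cheap. A minimal userspace sketch of that pattern, with invented names (this is not the kernel implementation):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static bool profiling_enabled;	/* stand-in for mem_alloc_profiling_enabled() */

/* slow path, kept out of line so the common case stays small */
static __attribute__((noinline)) void __account_alloc(size_t bytes)
{
	printf("accounted %zu bytes\n", bytes);
}

/* fast path: one predictable branch when profiling is off */
static inline void account_alloc(size_t bytes)
{
	if (profiling_enabled)
		__account_alloc(bytes);
}

int main(void)
{
	account_alloc(4096);		/* profiling off: no work done */
	profiling_enabled = true;
	account_alloc(4096);		/* profiling on: slow path runs */
	return 0;
}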
__always_inline bool free_pages_prepare(struct page *page,
unsigned int order)
{
@@ -1096,8 +1213,12 @@ __always_inline bool free_pages_prepare(struct page *page,
if (unlikely(order)) {
int i;
- if (compound)
+ if (compound) {
page[1].flags &= ~PAGE_FLAGS_SECOND;
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+ folio->_nr_pages = 0;
+#endif
+ }
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_page_prepare(page, page + i);
@@ -1238,22 +1359,56 @@ static void split_large_buddy(struct zone *zone, struct page *page,
if (order > pageblock_order)
order = pageblock_order;
- while (pfn != end) {
+ do {
int mt = get_pfnblock_migratetype(page, pfn);
__free_one_page(page, pfn, zone, order, mt, fpi);
pfn += 1 << order;
+ if (pfn == end)
+ break;
page = pfn_to_page(pfn);
- }
+ } while (1);
+}
+
+static void add_page_to_zone_llist(struct zone *zone, struct page *page,
+ unsigned int order)
+{
+ /* Remember the order */
+ page->order = order;
+ /* Add the page to the free list */
+ llist_add(&page->pcp_llist, &zone->trylock_free_pages);
}
static void free_one_page(struct zone *zone, struct page *page,
unsigned long pfn, unsigned int order,
fpi_t fpi_flags)
{
+ struct llist_head *llhead;
unsigned long flags;
- spin_lock_irqsave(&zone->lock, flags);
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
+ add_page_to_zone_llist(zone, page, order);
+ return;
+ }
+ } else {
+ spin_lock_irqsave(&zone->lock, flags);
+ }
+
+ /* The lock succeeded. Process deferred pages. */
+ llhead = &zone->trylock_free_pages;
+ if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) {
+ struct llist_node *llnode;
+ struct page *p, *tmp;
+
+ llnode = llist_del_all(llhead);
+ llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
+ unsigned int p_order = p->order;
+
+ split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
+ __count_vm_events(PGFREE, 1 << p_order);
+ }
+ }
split_large_buddy(zone, page, pfn, order, fpi_flags);
spin_unlock_irqrestore(&zone->lock, flags);
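The FPI_TRYLOCK handling above is a trylock-or-defer scheme: when zone->lock cannot be taken, the page is parked on a lock-free llist and drained by a later caller that does acquire the lock. A rough userspace analog of the idea, using a mutex plus a C11 atomic stack (all names are invented for illustration, and the drain policy is simplified):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct deferred {
	struct deferred *next;
	int payload;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic(struct deferred *) deferred_head;

/* lock-free push, analogous to llist_add() */
static void defer_work(int payload)
{
	struct deferred *d = malloc(sizeof(*d));

	if (!d)
		return;
	d->payload = payload;
	d->next = atomic_load(&deferred_head);
	while (!atomic_compare_exchange_weak(&deferred_head, &d->next, d))
		;
}

static void process(int payload)
{
	printf("processed %d\n", payload);
}

/* trylock callers defer on contention; lock holders drain what was deferred */
static void submit(int payload, int may_block)
{
	struct deferred *d;

	if (may_block) {
		pthread_mutex_lock(&lock);
	} else if (pthread_mutex_trylock(&lock)) {
		defer_work(payload);	/* lock busy: park it for later */
		return;
	}

	/* analogous to llist_del_all() plus the drain loop above */
	d = atomic_exchange(&deferred_head, NULL);
	while (d) {
		struct deferred *next = d->next;

		process(d->payload);
		free(d);
		d = next;
	}

	process(payload);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	submit(1, 1);
	submit(2, 0);
	return 0;
}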
@@ -1293,12 +1448,6 @@ void __meminit __free_pages_core(struct page *page, unsigned int order,
set_page_count(p, 0);
}
- /*
- * Freeing the page with debug_pagealloc enabled will try to
- * unmap it; some archs don't like double-unmappings, so
- * map it first.
- */
- debug_pagealloc_map_pages(page, nr_pages);
adjust_managed_page_count(page, nr_pages);
} else {
for (loop = 0; loop < nr_pages; loop++, p++) {
@@ -1431,7 +1580,7 @@ static __always_inline void page_del_and_expand(struct zone *zone,
static void check_new_page_bad(struct page *page)
{
- if (unlikely(page->flags & __PG_HWPOISON)) {
+ if (unlikely(PageHWPoison(page))) {
/* Don't complain about hwpoisoned pages */
if (PageBuddy(page))
__ClearPageBuddy(page);
@@ -1506,7 +1655,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
int i;
set_page_private(page, 0);
- set_page_refcounted(page);
arch_alloc_page(page, order);
debug_pagealloc_map_pages(page, 1 << order);
@@ -1830,39 +1978,6 @@ static void change_pageblock_range(struct page *pageblock_page,
}
}
-/*
- * When we are falling back to another migratetype during allocation, try to
- * steal extra free pages from the same pageblocks to satisfy further
- * allocations, instead of polluting multiple pageblocks.
- *
- * If we are stealing a relatively large buddy page, it is likely there will
- * be more free pages in the pageblock, so try to steal them all. For
- * reclaimable and unmovable allocations, we steal regardless of page size,
- * as fragmentation caused by those allocations polluting movable pageblocks
- * is worse than movable allocations stealing from unmovable and reclaimable
- * pageblocks.
- */
-static bool can_steal_fallback(unsigned int order, int start_mt)
-{
- /*
- * Leaving this order check is intended, although there is
- * relaxed order check in next check. The reason is that
- * we can actually steal whole pageblock if this condition met,
- * but, below check doesn't guarantee it and that is just heuristic
- * so could be changed anytime.
- */
- if (order >= pageblock_order)
- return true;
-
- if (order >= pageblock_order / 2 ||
- start_mt == MIGRATE_RECLAIMABLE ||
- start_mt == MIGRATE_UNMOVABLE ||
- page_group_by_mobility_disabled)
- return true;
-
- return false;
-}
-
static inline bool boost_watermark(struct zone *zone)
{
unsigned long max_boost;
@@ -1901,30 +2016,93 @@ static inline bool boost_watermark(struct zone *zone)
}
/*
- * This function implements actual steal behaviour. If order is large enough, we
- * can claim the whole pageblock for the requested migratetype. If not, we check
- * the pageblock for constituent pages; if at least half of the pages are free
- * or compatible, we can still claim the whole block, so pages freed in the
- * future will be put on the correct free list. Otherwise, we isolate exactly
- * the order we need from the fallback block and leave its migratetype alone.
+ * When we are falling back to another migratetype during allocation, should we
+ * try to claim an entire block to satisfy further allocations, instead of
+ * polluting multiple pageblocks?
*/
-static struct page *
-steal_suitable_fallback(struct zone *zone, struct page *page,
- int current_order, int order, int start_type,
- unsigned int alloc_flags, bool whole_block)
+static bool should_try_claim_block(unsigned int order, int start_mt)
{
- int free_pages, movable_pages, alike_pages;
- unsigned long start_pfn;
- int block_type;
+ /*
+	 * Keeping this order check is intentional, even though the next
+	 * check is more relaxed. The reason is that we can actually claim
+	 * the whole pageblock if this condition is met, while the check
+	 * below doesn't guarantee it and is only a heuristic, so it could
+	 * change at any time.
+ */
+ if (order >= pageblock_order)
+ return true;
+
+ /*
+ * Above a certain threshold, always try to claim, as it's likely there
+ * will be more free pages in the pageblock.
+ */
+ if (order >= pageblock_order / 2)
+ return true;
+
+ /*
+ * Unmovable/reclaimable allocations would cause permanent
+	 * fragmentation if they fell back to allocating from a movable block
+ * (polluting it), so we try to claim the whole block regardless of the
+ * allocation size. Later movable allocations can always steal from this
+ * block, which is less problematic.
+ */
+ if (start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE)
+ return true;
- block_type = get_pageblock_migratetype(page);
+ if (page_group_by_mobility_disabled)
+ return true;
/*
- * This can happen due to races and we want to prevent broken
- * highatomic accounting.
+	 * Movable pages won't cause permanent fragmentation, so for small
+	 * allocations we just need to temporarily steal unmovable or
+ * reclaimable pages that are closest to the request size. After a
+ * while, memory compaction may occur to form large contiguous pages,
+ * and the next movable allocation may not need to steal.
*/
- if (is_migrate_highatomic(block_type))
- goto single_page;
+ return false;
+}
+
+/*
+ * Check whether there is a suitable fallback freepage with requested order.
+ * If claimable is true, this function returns fallback_mt only if
+ * we would do this whole-block claiming. This would help to reduce
+ * fragmentation due to mixed migratetype pages in one pageblock.
+ */
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool claimable)
+{
+ int i;
+
+ if (claimable && !should_try_claim_block(order, migratetype))
+ return -2;
+
+ if (area->nr_free == 0)
+ return -1;
+
+ for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
+ int fallback_mt = fallbacks[migratetype][i];
+
+ if (!free_area_empty(area, fallback_mt))
+ return fallback_mt;
+ }
+
+ return -1;
+}
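The claim-vs-steal heuristic above reduces to a few comparisons on the order and the requesting migratetype. A standalone toy version (the pageblock order and migratetype values are assumptions for the demo, and the page_group_by_mobility_disabled case is omitted) shows which requests try to claim a whole block:

#include <stdbool.h>
#include <stdio.h>

enum { MOVABLE, RECLAIMABLE, UNMOVABLE };

#define PAGEBLOCK_ORDER 9	/* typical value; an assumption for the demo */

static bool should_try_claim_block(unsigned int order, int start_mt)
{
	if (order >= PAGEBLOCK_ORDER)
		return true;
	if (order >= PAGEBLOCK_ORDER / 2)
		return true;
	if (start_mt == RECLAIMABLE || start_mt == UNMOVABLE)
		return true;
	return false;	/* small movable requests just steal one page */
}

int main(void)
{
	printf("order 0 movable   -> %d\n", should_try_claim_block(0, MOVABLE));
	printf("order 0 unmovable -> %d\n", should_try_claim_block(0, UNMOVABLE));
	printf("order 5 movable   -> %d\n", should_try_claim_block(5, MOVABLE));
	return 0;
}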
+
+/*
+ * This function implements actual block claiming behaviour. If order is large
+ * enough, we can claim the whole pageblock for the requested migratetype. If
+ * not, we check the pageblock for constituent pages; if at least half of the
+ * pages are free or compatible, we can still claim the whole block, so pages
+ * freed in the future will be put on the correct free list.
+ */
+static struct page *
+try_to_claim_block(struct zone *zone, struct page *page,
+ int current_order, int order, int start_type,
+ int block_type, unsigned int alloc_flags)
+{
+ int free_pages, movable_pages, alike_pages;
+ unsigned long start_pfn;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
@@ -1945,14 +2123,10 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
- /* We are not allowed to try stealing from the whole block */
- if (!whole_block)
- goto single_page;
-
/* moving whole block can fail due to zone boundary conditions */
if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages,
&movable_pages))
- goto single_page;
+ return NULL;
/*
* Determine how many pages are compatible with our allocation.
@@ -1985,201 +2159,19 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
return __rmqueue_smallest(zone, order, start_type);
}
-single_page:
- page_del_and_expand(zone, page, order, current_order, block_type);
- return page;
-}
-
-/*
- * Check whether there is a suitable fallback freepage with requested order.
- * If only_stealable is true, this function returns fallback_mt only if
- * we can steal other freepages all together. This would help to reduce
- * fragmentation due to mixed migratetype pages in one pageblock.
- */
-int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool only_stealable, bool *can_steal)
-{
- int i;
- int fallback_mt;
-
- if (area->nr_free == 0)
- return -1;
-
- *can_steal = false;
- for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
- fallback_mt = fallbacks[migratetype][i];
- if (free_area_empty(area, fallback_mt))
- continue;
-
- if (can_steal_fallback(order, migratetype))
- *can_steal = true;
-
- if (!only_stealable)
- return fallback_mt;
-
- if (*can_steal)
- return fallback_mt;
- }
-
- return -1;
-}
-
-/*
- * Reserve the pageblock(s) surrounding an allocation request for
- * exclusive use of high-order atomic allocations if there are no
- * empty page blocks that contain a page with a suitable order
- */
-static void reserve_highatomic_pageblock(struct page *page, int order,
- struct zone *zone)
-{
- int mt;
- unsigned long max_managed, flags;
-
- /*
- * The number reserved as: minimum is 1 pageblock, maximum is
- * roughly 1% of a zone. But if 1% of a zone falls below a
- * pageblock size, then don't reserve any pageblocks.
- * Check is race-prone but harmless.
- */
- if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages)
- return;
- max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages);
- if (zone->nr_reserved_highatomic >= max_managed)
- return;
-
- spin_lock_irqsave(&zone->lock, flags);
-
- /* Recheck the nr_reserved_highatomic limit under the lock */
- if (zone->nr_reserved_highatomic >= max_managed)
- goto out_unlock;
-
- /* Yoink! */
- mt = get_pageblock_migratetype(page);
- /* Only reserve normal pageblocks (i.e., they can merge with others) */
- if (!migratetype_is_mergeable(mt))
- goto out_unlock;
-
- if (order < pageblock_order) {
- if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
- goto out_unlock;
- zone->nr_reserved_highatomic += pageblock_nr_pages;
- } else {
- change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
- zone->nr_reserved_highatomic += 1 << order;
- }
-
-out_unlock:
- spin_unlock_irqrestore(&zone->lock, flags);
-}
-
-/*
- * Used when an allocation is about to fail under memory pressure. This
- * potentially hurts the reliability of high-order allocations when under
- * intense memory pressure but failed atomic allocations should be easier
- * to recover from than an OOM.
- *
- * If @force is true, try to unreserve pageblocks even though highatomic
- * pageblock is exhausted.
- */
-static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
- bool force)
-{
- struct zonelist *zonelist = ac->zonelist;
- unsigned long flags;
- struct zoneref *z;
- struct zone *zone;
- struct page *page;
- int order;
- int ret;
-
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
- ac->nodemask) {
- /*
- * Preserve at least one pageblock unless memory pressure
- * is really high.
- */
- if (!force && zone->nr_reserved_highatomic <=
- pageblock_nr_pages)
- continue;
-
- spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order < NR_PAGE_ORDERS; order++) {
- struct free_area *area = &(zone->free_area[order]);
- int mt;
-
- page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
- if (!page)
- continue;
-
- mt = get_pageblock_migratetype(page);
- /*
- * In page freeing path, migratetype change is racy so
- * we can counter several free pages in a pageblock
- * in this loop although we changed the pageblock type
- * from highatomic to ac->migratetype. So we should
- * adjust the count once.
- */
- if (is_migrate_highatomic(mt)) {
- unsigned long size;
- /*
- * It should never happen but changes to
- * locking could inadvertently allow a per-cpu
- * drain to add pages to MIGRATE_HIGHATOMIC
- * while unreserving so be safe and watch for
- * underflows.
- */
- size = max(pageblock_nr_pages, 1UL << order);
- size = min(size, zone->nr_reserved_highatomic);
- zone->nr_reserved_highatomic -= size;
- }
-
- /*
- * Convert to ac->migratetype and avoid the normal
- * pageblock stealing heuristics. Minimally, the caller
- * is doing the work and needs the pages. More
- * importantly, if the block was always converted to
- * MIGRATE_UNMOVABLE or another type then the number
- * of pageblocks that cannot be completely freed
- * may increase.
- */
- if (order < pageblock_order)
- ret = move_freepages_block(zone, page, mt,
- ac->migratetype);
- else {
- move_to_free_list(page, zone, order, mt,
- ac->migratetype);
- change_pageblock_range(page, order,
- ac->migratetype);
- ret = 1;
- }
- /*
- * Reserving the block(s) already succeeded,
- * so this should not fail on zone boundaries.
- */
- WARN_ON_ONCE(ret == -1);
- if (ret > 0) {
- spin_unlock_irqrestore(&zone->lock, flags);
- return ret;
- }
- }
- spin_unlock_irqrestore(&zone->lock, flags);
- }
-
- return false;
+ return NULL;
}
/*
- * Try finding a free buddy page on the fallback list and put it on the free
- * list of requested migratetype, possibly along with other pages from the same
- * block, depending on fragmentation avoidance heuristics. Returns true if
- * fallback was found so that __rmqueue_smallest() can grab it.
+ * Try to allocate from some fallback migratetype by claiming the entire block,
+ * i.e. converting it to the allocation's start migratetype.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
*/
static __always_inline struct page *
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+__rmqueue_claim(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
@@ -2187,7 +2179,6 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
int min_order = order;
struct page *page;
int fallback_mt;
- bool can_steal;
/*
* Do not steal pages from freelists belonging to other pageblocks
@@ -2206,62 +2197,73 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &can_steal);
+ start_migratetype, true);
+
+ /* No block in that order */
if (fallback_mt == -1)
continue;
- /*
- * We cannot steal all free pages from the pageblock and the
- * requested migratetype is movable. In that case it's better to
- * steal and split the smallest available page instead of the
- * largest available page, because even if the next movable
- * allocation falls back into a different pageblock than this
- * one, it won't cause permanent fragmentation.
- */
- if (!can_steal && start_migratetype == MIGRATE_MOVABLE
- && current_order > order)
- goto find_smallest;
+ /* Advanced into orders too low to claim, abort */
+ if (fallback_mt == -2)
+ break;
- goto do_steal;
+ page = get_page_from_free_area(area, fallback_mt);
+ page = try_to_claim_block(zone, page, current_order, order,
+ start_migratetype, fallback_mt,
+ alloc_flags);
+ if (page) {
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
+ }
}
return NULL;
+}
+
+/*
+ * Try to steal a single page from some fallback migratetype. Leave the rest of
+ * the block as its current migratetype, potentially causing fragmentation.
+ */
+static __always_inline struct page *
+__rmqueue_steal(struct zone *zone, int order, int start_migratetype)
+{
+ struct free_area *area;
+ int current_order;
+ struct page *page;
+ int fallback_mt;
-find_smallest:
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &can_steal);
- if (fallback_mt != -1)
- break;
- }
-
- /*
- * This should not happen - we already found a suitable fallback
- * when looking for the largest page.
- */
- VM_BUG_ON(current_order > MAX_PAGE_ORDER);
-
-do_steal:
- page = get_page_from_free_area(area, fallback_mt);
-
- /* take off list, maybe claim block, expand remainder */
- page = steal_suitable_fallback(zone, page, current_order, order,
- start_migratetype, alloc_flags, can_steal);
+ start_migratetype, false);
+ if (fallback_mt == -1)
+ continue;
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, fallback_mt);
+ page = get_page_from_free_area(area, fallback_mt);
+ page_del_and_expand(zone, page, order, current_order, fallback_mt);
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
+ }
- return page;
+ return NULL;
}
+enum rmqueue_mode {
+ RMQUEUE_NORMAL,
+ RMQUEUE_CMA,
+ RMQUEUE_CLAIM,
+ RMQUEUE_STEAL,
+};
+
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
- unsigned int alloc_flags)
+ unsigned int alloc_flags, enum rmqueue_mode *mode)
{
struct page *page;
@@ -2280,16 +2282,48 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
}
}
- page = __rmqueue_smallest(zone, order, migratetype);
- if (unlikely(!page)) {
- if (alloc_flags & ALLOC_CMA)
+ /*
+ * First try the freelists of the requested migratetype, then try
+	 * fallback modes with increasing levels of fragmentation risk.
+	 *
+	 * The fallback logic is expensive and rmqueue_bulk() calls it in
+ * a loop with the zone->lock held, meaning the freelists are
+ * not subject to any outside changes. Remember in *mode where
+ * we found pay dirt, to save us the search on the next call.
+ */
+ switch (*mode) {
+ case RMQUEUE_NORMAL:
+ page = __rmqueue_smallest(zone, order, migratetype);
+ if (page)
+ return page;
+ fallthrough;
+ case RMQUEUE_CMA:
+ if (alloc_flags & ALLOC_CMA) {
page = __rmqueue_cma_fallback(zone, order);
-
- if (!page)
- page = __rmqueue_fallback(zone, order, migratetype,
- alloc_flags);
+ if (page) {
+ *mode = RMQUEUE_CMA;
+ return page;
+ }
+ }
+ fallthrough;
+ case RMQUEUE_CLAIM:
+ page = __rmqueue_claim(zone, order, migratetype, alloc_flags);
+ if (page) {
+ /* Replenished preferred freelist, back to normal mode. */
+ *mode = RMQUEUE_NORMAL;
+ return page;
+ }
+ fallthrough;
+ case RMQUEUE_STEAL:
+ if (!(alloc_flags & ALLOC_NOFRAGMENT)) {
+ page = __rmqueue_steal(zone, order, migratetype);
+ if (page) {
+ *mode = RMQUEUE_STEAL;
+ return page;
+ }
+ }
}
- return page;
+ return NULL;
}
/*
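__rmqueue() above now threads an enum rmqueue_mode through repeated calls so that rmqueue_bulk(), which loops under zone->lock, resumes at the fallback level that last produced a page instead of rescanning the cheaper ones. The same caching idea in a self-contained sketch (the sources and counts are made up for illustration):

#include <stdio.h>

enum source { SRC_FAST, SRC_MEDIUM, SRC_SLOW, NR_SOURCES };

/* pretend availability of each source; 0 = empty */
static int stock[NR_SOURCES] = { 0, 0, 3 };

static int take_from(enum source s)
{
	if (stock[s] > 0) {
		stock[s]--;
		return 1;
	}
	return 0;
}

/* try sources in order of preference, starting at the cached level */
static int take_one(enum source *mode)
{
	switch (*mode) {
	case SRC_FAST:
		if (take_from(SRC_FAST))
			return 1;
		/* fall through */
	case SRC_MEDIUM:
		if (take_from(SRC_MEDIUM)) {
			*mode = SRC_MEDIUM;
			return 1;
		}
		/* fall through */
	case SRC_SLOW:
		if (take_from(SRC_SLOW)) {
			*mode = SRC_SLOW;
			return 1;
		}
		/* fall through */
	default:
		return 0;
	}
}

int main(void)
{
	enum source mode = SRC_FAST;
	int got = 0;

	while (take_one(&mode))
		got++;
	printf("got %d items, ended in mode %d\n", got, (int)mode);
	return 0;
}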
@@ -2301,13 +2335,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
unsigned long flags;
int i;
- spin_lock_irqsave(&zone->lock, flags);
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags))
+ return 0;
+ } else {
+ spin_lock_irqsave(&zone->lock, flags);
+ }
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
- alloc_flags);
+ alloc_flags, &rmqm);
if (unlikely(page == NULL))
break;
@@ -2590,9 +2630,9 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
return high;
}
-static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
- struct page *page, int migratetype,
- unsigned int order)
+static void free_frozen_page_commit(struct zone *zone,
+ struct per_cpu_pages *pcp, struct page *page, int migratetype,
+ unsigned int order, fpi_t fpi_flags)
{
int high, batch;
int pindex;
@@ -2617,16 +2657,24 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
* stops will be drained from vmstat refresh context.
*/
if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
- free_high = (pcp->free_count >= batch &&
+ free_high = (pcp->free_count >= (batch + pcp->high_min / 2) &&
(pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
(!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
- pcp->count >= READ_ONCE(batch)));
+ pcp->count >= batch));
pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
}
if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
pcp->free_count += (1 << order);
+
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ /*
+ * Do not attempt to take a zone lock. Let pcp->count get
+ * over high mark temporarily.
+ */
+ return;
+ }
high = nr_pcp_high(pcp, zone, batch, free_high);
if (pcp->count >= high) {
free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
@@ -2641,7 +2689,8 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
/*
* Free a pcp page
*/
-void free_unref_page(struct page *page, unsigned int order)
+static void __free_frozen_pages(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
unsigned long __maybe_unused UP_flags;
struct per_cpu_pages *pcp;
@@ -2650,7 +2699,7 @@ void free_unref_page(struct page *page, unsigned int order)
int migratetype;
if (!pcp_allowed_order(order)) {
- __free_pages_ok(page, order, FPI_NONE);
+ __free_pages_ok(page, order, fpi_flags);
return;
}
@@ -2664,27 +2713,37 @@ void free_unref_page(struct page *page, unsigned int order)
* get those areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
+ zone = page_zone(page);
migratetype = get_pfnblock_migratetype(page, pfn);
if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(page_zone(page), page, pfn, order, FPI_NONE);
+ free_one_page(zone, page, pfn, order, fpi_flags);
return;
}
migratetype = MIGRATE_MOVABLE;
}
- zone = page_zone(page);
+ if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT)
+ && (in_nmi() || in_hardirq()))) {
+ add_page_to_zone_llist(zone, page, order);
+ return;
+ }
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
- free_unref_page_commit(zone, pcp, page, migratetype, order);
+ free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
pcp_spin_unlock(pcp);
} else {
- free_one_page(zone, page, pfn, order, FPI_NONE);
+ free_one_page(zone, page, pfn, order, fpi_flags);
}
pcp_trylock_finish(UP_flags);
}
+void free_frozen_pages(struct page *page, unsigned int order)
+{
+ __free_frozen_pages(page, order, FPI_NONE);
+}
+
/*
* Free a batch of folios
*/
@@ -2741,7 +2800,7 @@ void free_unref_folios(struct folio_batch *folios)
/*
* Free isolated pages directly to the
- * allocator, see comment in free_unref_page.
+ * allocator, see comment in free_frozen_pages.
*/
if (is_migrate_isolate(migratetype)) {
free_one_page(zone, &folio->page, pfn,
@@ -2772,8 +2831,8 @@ void free_unref_folios(struct folio_batch *folios)
migratetype = MIGRATE_MOVABLE;
trace_mm_page_free_batched(&folio->page);
- free_unref_page_commit(zone, pcp, &folio->page, migratetype,
- order);
+ free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
+ order, FPI_NONE);
}
if (pcp) {
@@ -2802,7 +2861,7 @@ void split_page(struct page *page, unsigned int order)
set_page_refcounted(page + i);
split_page_owner(page, order, 0);
pgalloc_tag_split(page_folio(page), order, 0);
- split_page_memcg(page, order, 0);
+ split_page_memcg(page, order);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -2904,11 +2963,18 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
do {
page = NULL;
- spin_lock_irqsave(&zone->lock, flags);
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags))
+ return NULL;
+ } else {
+ spin_lock_irqsave(&zone->lock, flags);
+ }
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
- page = __rmqueue(zone, order, migratetype, alloc_flags);
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
+
+ page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm);
/*
* If the allocation fails, allow OOM handling and
@@ -3092,6 +3158,142 @@ out:
return page;
}
+/*
+ * Reserve the pageblock(s) surrounding an allocation request for
+ * exclusive use of high-order atomic allocations if there are no
+ * empty page blocks that contain a page with a suitable order
+ */
+static void reserve_highatomic_pageblock(struct page *page, int order,
+ struct zone *zone)
+{
+ int mt;
+ unsigned long max_managed, flags;
+
+ /*
+ * The number reserved as: minimum is 1 pageblock, maximum is
+ * roughly 1% of a zone. But if 1% of a zone falls below a
+ * pageblock size, then don't reserve any pageblocks.
+ * Check is race-prone but harmless.
+ */
+ if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages)
+ return;
+ max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages);
+ if (zone->nr_reserved_highatomic >= max_managed)
+ return;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ /* Recheck the nr_reserved_highatomic limit under the lock */
+ if (zone->nr_reserved_highatomic >= max_managed)
+ goto out_unlock;
+
+ /* Yoink! */
+ mt = get_pageblock_migratetype(page);
+ /* Only reserve normal pageblocks (i.e., they can merge with others) */
+ if (!migratetype_is_mergeable(mt))
+ goto out_unlock;
+
+ if (order < pageblock_order) {
+ if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
+ goto out_unlock;
+ zone->nr_reserved_highatomic += pageblock_nr_pages;
+ } else {
+ change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
+ zone->nr_reserved_highatomic += 1 << order;
+ }
+
+out_unlock:
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/*
+ * Used when an allocation is about to fail under memory pressure. This
+ * potentially hurts the reliability of high-order allocations when under
+ * intense memory pressure but failed atomic allocations should be easier
+ * to recover from than an OOM.
+ *
+ * If @force is true, try to unreserve pageblocks even though highatomic
+ * pageblock is exhausted.
+ */
+static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
+ bool force)
+{
+ struct zonelist *zonelist = ac->zonelist;
+ unsigned long flags;
+ struct zoneref *z;
+ struct zone *zone;
+ struct page *page;
+ int order;
+ int ret;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
+ ac->nodemask) {
+ /*
+ * Preserve at least one pageblock unless memory pressure
+ * is really high.
+ */
+ if (!force && zone->nr_reserved_highatomic <=
+ pageblock_nr_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < NR_PAGE_ORDERS; order++) {
+ struct free_area *area = &(zone->free_area[order]);
+ unsigned long size;
+
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
+ if (!page)
+ continue;
+
+ size = max(pageblock_nr_pages, 1UL << order);
+ /*
+ * It should never happen but changes to
+ * locking could inadvertently allow a per-cpu
+ * drain to add pages to MIGRATE_HIGHATOMIC
+ * while unreserving so be safe and watch for
+ * underflows.
+ */
+ if (WARN_ON_ONCE(size > zone->nr_reserved_highatomic))
+ size = zone->nr_reserved_highatomic;
+ zone->nr_reserved_highatomic -= size;
+
+ /*
+ * Convert to ac->migratetype and avoid the normal
+ * pageblock stealing heuristics. Minimally, the caller
+ * is doing the work and needs the pages. More
+ * importantly, if the block was always converted to
+ * MIGRATE_UNMOVABLE or another type then the number
+ * of pageblocks that cannot be completely freed
+ * may increase.
+ */
+ if (order < pageblock_order)
+ ret = move_freepages_block(zone, page,
+ MIGRATE_HIGHATOMIC,
+ ac->migratetype);
+ else {
+ move_to_free_list(page, zone, order,
+ MIGRATE_HIGHATOMIC,
+ ac->migratetype);
+ change_pageblock_range(page, order,
+ ac->migratetype);
+ ret = 1;
+ }
+ /*
+ * Reserving the block(s) already succeeded,
+ * so this should not fail on zone boundaries.
+ */
+ WARN_ON_ONCE(ret == -1);
+ if (ret > 0) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+ return false;
+}
+
static inline long __zone_watermark_unusable_free(struct zone *z,
unsigned int order, unsigned int alloc_flags)
{
@@ -3249,18 +3451,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
return false;
}
-bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int highest_zoneidx)
-{
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
-
- if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
- free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
-
- return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
- free_pages);
-}
-
#ifdef CONFIG_NUMA
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
@@ -3295,6 +3485,11 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
*/
alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
+ if (defrag_mode) {
+ alloc_flags |= ALLOC_NOFRAGMENT;
+ return alloc_flags;
+ }
+
#ifdef CONFIG_ZONE_DMA32
if (!zone)
return alloc_flags;
@@ -3344,7 +3539,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
retry:
/*
* Scan zonelist, looking for a zone with enough free.
- * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
+ * See also cpuset_current_node_allowed() comment in kernel/cgroup/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
@@ -3386,7 +3581,7 @@ retry:
continue;
}
- if (no_fallback && nr_online_nodes > 1 &&
+ if (no_fallback && !defrag_mode && nr_online_nodes > 1 &&
zone != zonelist_zone(ac->preferred_zoneref)) {
int local_nid;
@@ -3402,7 +3597,7 @@ retry:
}
}
- cond_accept_memory(zone, order);
+ cond_accept_memory(zone, order, alloc_flags);
/*
* Detect whether the number of free pages is below high
@@ -3429,7 +3624,7 @@ check_alloc_wmark:
gfp_mask)) {
int ret;
- if (cond_accept_memory(zone, order))
+ if (cond_accept_memory(zone, order, alloc_flags))
goto try_this_zone;
/*
@@ -3482,7 +3677,7 @@ try_this_zone:
return page;
} else {
- if (cond_accept_memory(zone, order))
+ if (cond_accept_memory(zone, order, alloc_flags))
goto try_this_zone;
/* Try again if zone has deferred pages */
@@ -3497,7 +3692,7 @@ try_this_zone:
* It's possible on a UMA machine to get through all zones that are
* fragmented. If avoiding fragmentation, reset and try again.
*/
- if (no_fallback) {
+ if (no_fallback && !defrag_mode) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
@@ -3565,7 +3760,6 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
if (!page)
page = get_page_from_freelist(gfp_mask, order,
alloc_flags, ac);
-
return page;
}
@@ -3977,15 +4171,21 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
struct zone *zone;
pg_data_t *last_pgdat = NULL;
enum zone_type highest_zoneidx = ac->highest_zoneidx;
+ unsigned int reclaim_order;
+
+ if (defrag_mode)
+ reclaim_order = max(order, pageblock_order);
+ else
+ reclaim_order = order;
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
ac->nodemask) {
if (!managed_zone(zone))
continue;
- if (last_pgdat != zone->zone_pgdat) {
- wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
- last_pgdat = zone->zone_pgdat;
- }
+ if (last_pgdat == zone->zone_pgdat)
+ continue;
+ wakeup_kswapd(zone, gfp_mask, reclaim_order, highest_zoneidx);
+ last_pgdat = zone->zone_pgdat;
}
}
@@ -4026,7 +4226,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
/*
* Ignore cpuset mems for non-blocking __GFP_HIGH (probably
* GFP_ATOMIC) rather than fail, see the comment for
- * cpuset_node_allowed().
+ * cpuset_current_node_allowed().
*/
if (alloc_flags & ALLOC_MIN_RESERVE)
alloc_flags &= ~ALLOC_CPUSET;
@@ -4035,6 +4235,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
+ if (defrag_mode)
+ alloc_flags |= ALLOC_NOFRAGMENT;
+
return alloc_flags;
}
@@ -4241,6 +4444,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
restart:
compaction_retries = 0;
no_progress_loops = 0;
+ compact_result = COMPACT_SKIPPED;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist_iter_cookie = zonelist_iter_begin();
@@ -4343,6 +4547,14 @@ restart:
}
retry:
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * infinite retries.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
+
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
@@ -4416,6 +4628,11 @@ retry:
&compaction_retries))
goto retry;
+ /* Reclaim/compaction failed to prevent the fallback */
+ if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT)) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
+ goto retry;
+ }
/*
* Deal with possible cpuset update races or zonelist updates to avoid
@@ -4509,7 +4726,12 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
might_alloc(gfp_mask);
- if (should_fail_alloc_page(gfp_mask, order))
+ /*
+ * Don't invoke should_fail logic, since it may call
+ * get_random_u32() and printk() which need to spin_lock.
+ */
+ if (!(*alloc_flags & ALLOC_TRYLOCK) &&
+ should_fail_alloc_page(gfp_mask, order))
return false;
*alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
@@ -4529,28 +4751,23 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
}
/*
- * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
+ * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
* @gfp: GFP flags for the allocation
* @preferred_nid: The preferred NUMA node ID to allocate from
* @nodemask: Set of nodes to allocate from, may be NULL
- * @nr_pages: The number of pages desired on the list or array
- * @page_list: Optional list to store the allocated pages
- * @page_array: Optional array to store the pages
+ * @nr_pages: The number of pages desired in the array
+ * @page_array: Array to store the pages
*
* This is a batched version of the page allocator that attempts to
- * allocate nr_pages quickly. Pages are added to page_list if page_list
- * is not NULL, otherwise it is assumed that the page_array is valid.
- *
- * For lists, nr_pages is the number of pages that should be allocated.
+ * allocate nr_pages quickly. Pages are added to the page_array.
*
- * For arrays, only NULL elements are populated with pages and nr_pages
+ * Note that only NULL elements are populated with pages and nr_pages
* is the maximum number of pages that will be stored in the array.
*
- * Returns the number of pages on the list or array.
+ * Returns the number of pages in the array.
*/
unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,
- struct list_head *page_list,
struct page **page_array)
{
struct page *page;
@@ -4568,7 +4785,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
* Skip populated array elements to determine if any pages need
* to be allocated before disabling IRQs.
*/
- while (page_array && nr_populated < nr_pages && page_array[nr_populated])
+ while (nr_populated < nr_pages && page_array[nr_populated])
nr_populated++;
/* No pages requested? */
@@ -4576,7 +4793,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
goto out;
/* Already populated array? */
- if (unlikely(page_array && nr_pages - nr_populated == 0))
+ if (unlikely(nr_pages - nr_populated == 0))
goto out;
/* Bulk allocator does not support memcg accounting. */
@@ -4621,7 +4838,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
goto failed;
}
- cond_accept_memory(zone, 0);
+ cond_accept_memory(zone, 0, alloc_flags);
retry_this_zone:
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
if (zone_watermark_fast(zone, 0, mark,
@@ -4630,7 +4847,7 @@ retry_this_zone:
break;
}
- if (cond_accept_memory(zone, 0))
+ if (cond_accept_memory(zone, 0, alloc_flags))
goto retry_this_zone;
/* Try again if zone has deferred pages */
@@ -4658,7 +4875,7 @@ retry_this_zone:
while (nr_populated < nr_pages) {
/* Skip existing pages */
- if (page_array && page_array[nr_populated]) {
+ if (page_array[nr_populated]) {
nr_populated++;
continue;
}
@@ -4676,11 +4893,8 @@ retry_this_zone:
nr_account++;
prep_new_page(page, 0, gfp, 0);
- if (page_list)
- list_add(&page->lru, page_list);
- else
- page_array[nr_populated] = page;
- nr_populated++;
+ set_page_refcounted(page);
+ page_array[nr_populated++] = page;
}
pcp_spin_unlock(pcp);
@@ -4697,14 +4911,8 @@ failed_irq:
failed:
page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask);
- if (page) {
- if (page_list)
- list_add(&page->lru, page_list);
- else
- page_array[nr_populated] = page;
- nr_populated++;
- }
-
+ if (page)
+ page_array[nr_populated++] = page;
goto out;
}
EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
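With the list variant removed, callers of the bulk allocator hand in an array and only its NULL slots are filled. A hedged, kernel-style usage sketch against the signature shown above (calling the _noprof variant directly, the GFP choice and the error handling are illustrative assumptions):

#include <linux/gfp.h>
#include <linux/mm.h>

/* illustrative only: assumes a sleepable context and an array of NULLs */
static int fill_page_array(struct page **pages, int want)
{
	unsigned long got;

	/* only NULL entries of pages[] are populated */
	got = alloc_pages_bulk_noprof(GFP_KERNEL, NUMA_NO_NODE, NULL,
				      want, pages);
	if (got < (unsigned long)want)
		return -ENOMEM;		/* caller may retry or fall back */

	return 0;
}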
@@ -4712,8 +4920,8 @@ EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
/*
* This is the 'heart' of the zoned buddy allocator.
*/
-struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
- int preferred_nid, nodemask_t *nodemask)
+struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order,
+ int preferred_nid, nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
@@ -4766,7 +4974,7 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
out:
if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
- __free_pages(page, order);
+ free_frozen_pages(page, order);
page = NULL;
}
@@ -4775,6 +4983,18 @@ out:
return page;
}
+EXPORT_SYMBOL(__alloc_frozen_pages_noprof);
+
+struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
+ int preferred_nid, nodemask_t *nodemask)
+{
+ struct page *page;
+
+ page = __alloc_frozen_pages_noprof(gfp, order, preferred_nid, nodemask);
+ if (page)
+ set_page_refcounted(page);
+ return page;
+}
EXPORT_SYMBOL(__alloc_pages_noprof);
struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
@@ -4809,9 +5029,10 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
EXPORT_SYMBOL(get_zeroed_page_noprof);
/**
- * __free_pages - Free pages allocated with alloc_pages().
+ * ___free_pages - Free pages allocated with alloc_pages().
* @page: The page pointer returned from alloc_pages().
* @order: The order of the allocation.
+ * @fpi_flags: Free Page Internal flags.
*
* This function can free multi-page allocations that are not compound
* pages. It does not check that the @order passed in matches that of
@@ -4828,22 +5049,38 @@ EXPORT_SYMBOL(get_zeroed_page_noprof);
* Context: May be called in interrupt context or while holding a normal
* spinlock, but not in NMI context or while holding a raw spinlock.
*/
-void __free_pages(struct page *page, unsigned int order)
+static void ___free_pages(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
/* get PageHead before we drop reference */
int head = PageHead(page);
+ /* get alloc tag in case the page is released by others */
struct alloc_tag *tag = pgalloc_tag_get(page);
if (put_page_testzero(page))
- free_unref_page(page, order);
+ __free_frozen_pages(page, order, fpi_flags);
else if (!head) {
pgalloc_tag_sub_pages(tag, (1 << order) - 1);
while (order-- > 0)
- free_unref_page(page + (1 << order), order);
+ __free_frozen_pages(page + (1 << order), order,
+ fpi_flags);
}
}
+void __free_pages(struct page *page, unsigned int order)
+{
+ ___free_pages(page, order, FPI_NONE);
+}
EXPORT_SYMBOL(__free_pages);
+/*
+ * Can be called while holding raw_spin_lock or from IRQ and NMI for any
+ * page type (not only those that came from alloc_pages_nolock)
+ */
+void free_pages_nolock(struct page *page, unsigned int order)
+{
+ ___free_pages(page, order, FPI_TRYLOCK);
+}
+
void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
@@ -4864,7 +5101,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
split_page_owner(page, order, 0);
pgalloc_tag_split(page_folio(page), order, 0);
- split_page_memcg(page, order, 0);
+ split_page_memcg(page, order);
while (page < --last)
set_page_refcounted(last);
@@ -5159,13 +5396,6 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
zonerefs->zone_idx = 0;
}
-/*
- * Build zonelists ordered by zone and nodes within zones.
- * This results in conserving DMA zone[s] until all Normal memory is
- * exhausted, but results in overflowing to remote node while memory
- * may still exist in local DMA zone.
- */
-
static void build_zonelists(pg_data_t *pgdat)
{
static int node_order[MAX_NUMNODES];
@@ -5690,10 +5920,13 @@ __meminit void zone_pcp_init(struct zone *zone)
zone->present_pages, zone_batchsize(zone));
}
+static void setup_per_zone_lowmem_reserve(void);
+
void adjust_managed_page_count(struct page *page, long count)
{
atomic_long_add(count, &page_zone(page)->managed_pages);
totalram_pages_add(count);
+ setup_per_zone_lowmem_reserve();
}
EXPORT_SYMBOL(adjust_managed_page_count);
@@ -5831,6 +6064,7 @@ static void calculate_totalreserve_pages(void)
}
}
totalreserve_pages = reserve_pages;
+ trace_mm_calculate_totalreserve_pages(totalreserve_pages);
}
/*
@@ -5853,14 +6087,15 @@ static void setup_per_zone_lowmem_reserve(void)
for (j = i + 1; j < MAX_NR_ZONES; j++) {
struct zone *upper_zone = &pgdat->node_zones[j];
- bool empty = !zone_managed_pages(upper_zone);
managed_pages += zone_managed_pages(upper_zone);
- if (clear || empty)
+ if (clear)
zone->lowmem_reserve[j] = 0;
else
zone->lowmem_reserve[j] = managed_pages / ratio;
+ trace_mm_setup_per_zone_lowmem_reserve(zone, upper_zone,
+ zone->lowmem_reserve[j]);
}
}
}
@@ -5924,6 +6159,7 @@ static void __setup_per_zone_wmarks(void)
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
+ trace_mm_setup_per_zone_wmarks(zone);
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -6170,7 +6406,7 @@ out:
return ret;
}
-static struct ctl_table page_alloc_sysctl_table[] = {
+static const struct ctl_table page_alloc_sysctl_table[] = {
{
.procname = "min_free_kbytes",
.data = &min_free_kbytes,
@@ -6197,6 +6433,15 @@ static struct ctl_table page_alloc_sysctl_table[] = {
.extra2 = SYSCTL_THREE_THOUSAND,
},
{
+ .procname = "defrag_mode",
+ .data = &defrag_mode,
+ .maxlen = sizeof(defrag_mode),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "percpu_pagelist_high_fraction",
.data = &percpu_pagelist_high_fraction,
.maxlen = sizeof(percpu_pagelist_high_fraction),
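The new defrag_mode knob is a plain boolean sysctl. Assuming the table above is registered under the usual vm hierarchy, it would surface as /proc/sys/vm/defrag_mode; a small userspace reader, with that path being an assumption based on the convention:

#include <stdio.h>

int main(void)
{
	/* assumed location; page_alloc sysctls normally live under vm */
	FILE *f = fopen("/proc/sys/vm/defrag_mode", "r");
	int val;

	if (!f) {
		perror("defrag_mode");
		return 1;
	}
	if (fscanf(f, "%d", &val) == 1)
		printf("defrag_mode = %d\n", val);
	fclose(f);
	return 0;
}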
@@ -6265,9 +6510,8 @@ static void alloc_contig_dump_pages(struct list_head *page_list)
* @migratetype: using migratetype to filter the type of migration in
* trace_mm_alloc_contig_migrate_range_info.
*/
-int __alloc_contig_migrate_range(struct compact_control *cc,
- unsigned long start, unsigned long end,
- int migratetype)
+static int __alloc_contig_migrate_range(struct compact_control *cc,
+ unsigned long start, unsigned long end, int migratetype)
{
/* This function is based on compact_zone() from compaction.c. */
unsigned int nr_reclaimed;
@@ -6276,7 +6520,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
int ret = 0;
struct migration_target_control mtc = {
.nid = zone_to_nid(cc->zone),
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .gfp_mask = cc->gfp_mask,
.reason = MR_CONTIG_RANGE,
};
struct page *page;
@@ -6346,7 +6590,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
return (ret < 0) ? ret : 0;
}
-static void split_free_pages(struct list_head *list)
+static void split_free_pages(struct list_head *list, gfp_t gfp_mask)
{
int order;
@@ -6357,7 +6601,8 @@ static void split_free_pages(struct list_head *list)
list_for_each_entry_safe(page, next, &list[order], lru) {
int i;
- post_alloc_hook(page, order, __GFP_MOVABLE);
+ post_alloc_hook(page, order, gfp_mask);
+ set_page_refcounted(page);
if (!order)
continue;
@@ -6371,6 +6616,40 @@ static void split_free_pages(struct list_head *list)
}
}
+static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
+{
+ const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
+ const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN |
+ __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO;
+ const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
+
+ /*
+ * We are given the range to allocate; node, mobility and placement
+ * hints are irrelevant at this point. We'll simply ignore them.
+ */
+ gfp_mask &= ~(GFP_ZONEMASK | __GFP_RECLAIMABLE | __GFP_WRITE |
+ __GFP_HARDWALL | __GFP_THISNODE | __GFP_MOVABLE);
+
+ /*
+ * We only support most reclaim flags (but not NOFAIL/NORETRY), and
+ * selected action flags.
+ */
+ if (gfp_mask & ~(reclaim_mask | action_mask))
+ return -EINVAL;
+
+ /*
+ * Flags to control page compaction/migration/reclaim, to free up our
+ * page range. Migratable pages are movable, __GFP_MOVABLE is implied
+ * for them.
+ *
+ * Traditionally we always had __GFP_RETRY_MAYFAIL set, keep doing that
+ * to not degrade callers.
+ */
+ *gfp_cc_mask = (gfp_mask & (reclaim_mask | cc_action_mask)) |
+ __GFP_MOVABLE | __GFP_RETRY_MAYFAIL;
+ return 0;
+}
+
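As a worked example of the filtering above: GFP_KERNEL expands to __GFP_RECLAIM | __GFP_IO | __GFP_FS, all of which fall inside reclaim_mask, so the -EINVAL check passes and cc->gfp_mask ends up as __GFP_IO | __GFP_FS | __GFP_RECLAIM | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL. A caller passing __GFP_NOFAIL, by contrast, would be rejected, since that flag is in neither reclaim_mask nor action_mask.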
/**
* alloc_contig_range() -- tries to allocate given range of pages
* @start: start PFN to allocate
@@ -6379,7 +6658,9 @@ static void split_free_pages(struct list_head *list)
* #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
* in range must have the same migratetype and it must
* be either of the two.
- * @gfp_mask: GFP mask to use during compaction
+ * @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some
+ * action and reclaim modifiers are supported. Reclaim modifiers
+ * control allocation behavior during compaction/migration/reclaim.
*
* The PFN range does not have to be pageblock aligned. The PFN range must
* belong to a single zone.
@@ -6405,11 +6686,14 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
.no_set_skip_hint = true,
- .gfp_mask = current_gfp_context(gfp_mask),
.alloc_contig = true,
};
INIT_LIST_HEAD(&cc.migratepages);
+ gfp_mask = current_gfp_context(gfp_mask);
+ if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask))
+ return -EINVAL;
+
/*
* What we do here is we mark all pageblocks in range as
* MIGRATE_ISOLATE. Because pageblock and max order pages may
@@ -6431,7 +6715,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
* put back to page allocator so that buddy can use them.
*/
- ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
+ ret = start_isolate_page_range(start, end, migratetype, 0);
if (ret)
goto done;
@@ -6450,7 +6734,17 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
ret = __alloc_contig_migrate_range(&cc, start, end, migratetype);
if (ret && ret != -EBUSY)
goto done;
- ret = 0;
+
+ /*
+ * When in-use hugetlb pages are migrated, they may simply be released
+ * back into the free hugepage pool instead of being returned to the
+ * buddy system. After the migration of in-use huge pages is completed,
+ * we will invoke replace_free_hugepage_folios() to ensure that these
+ * hugepages are properly released to the buddy system.
+ */
+ ret = replace_free_hugepage_folios(start, end);
+ if (ret)
+ goto done;
/*
* Pages from [start, end) are within a pageblock_nr_pages
@@ -6484,7 +6778,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
}
if (!(gfp_mask & __GFP_COMP)) {
- split_free_pages(cc.freepages);
+ split_free_pages(cc.freepages, gfp_mask);
/* Free head and tail (if any) */
if (start != outer_start)
@@ -6497,6 +6791,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
check_new_pages(head, order);
prep_new_page(head, order, gfp_mask, 0);
+ set_page_refcounted(head);
} else {
ret = -EINVAL;
WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n",
@@ -6551,7 +6846,9 @@ static bool zone_spans_last_pfn(const struct zone *zone,
/**
* alloc_contig_pages() -- tries to find and allocate contiguous range of pages
* @nr_pages: Number of contiguous pages to allocate
- * @gfp_mask: GFP mask to limit search and used during compaction
+ * @gfp_mask: GFP mask. Node/zone/placement hints limit the search; only some
+ * action and reclaim modifiers are supported. Reclaim modifiers
+ * control allocation behavior during compaction/migration/reclaim.
* @nid: Target node
* @nodemask: Mask for other possible nodes
*
@@ -6868,9 +7165,6 @@ bool has_managed_dma(void)
#ifdef CONFIG_UNACCEPTED_MEMORY
-/* Counts number of zones with unaccepted pages. */
-static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
-
static bool lazy_accept = true;
static int __init accept_memory_parse(char *p)
@@ -6897,11 +7191,7 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order)
static void __accept_page(struct zone *zone, unsigned long *flags,
struct page *page)
{
- bool last;
-
list_del(&page->lru);
- last = list_empty(&zone->unaccepted_pages);
-
account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
__ClearPageUnaccepted(page);
@@ -6910,9 +7200,6 @@ static void __accept_page(struct zone *zone, unsigned long *flags,
accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER);
__free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
-
- if (last)
- static_branch_dec(&zones_with_unaccepted_pages);
}
void accept_page(struct page *page)
@@ -6949,24 +7236,31 @@ static bool try_to_accept_memory_one(struct zone *zone)
return true;
}
-static inline bool has_unaccepted_memory(void)
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags)
{
- return static_branch_unlikely(&zones_with_unaccepted_pages);
-}
-
-static bool cond_accept_memory(struct zone *zone, unsigned int order)
-{
- long to_accept;
+ long to_accept, wmark;
bool ret = false;
- if (!has_unaccepted_memory())
+ if (list_empty(&zone->unaccepted_pages))
return false;
- if (list_empty(&zone->unaccepted_pages))
+ /* Bailout, since try_to_accept_memory_one() needs to take a lock */
+ if (alloc_flags & ALLOC_TRYLOCK)
return false;
+ wmark = promo_wmark_pages(zone);
+
+ /*
+ * Watermarks have not been initialized yet.
+ *
+ * Accepting one MAX_ORDER page to ensure progress.
+ */
+ if (!wmark)
+ return try_to_accept_memory_one(zone);
+
/* How much to accept to get to promo watermark? */
- to_accept = promo_wmark_pages(zone) -
+ to_accept = wmark -
(zone_page_state(zone, NR_FREE_PAGES) -
__zone_watermark_unusable_free(zone, order, 0) -
zone_page_state(zone, NR_UNACCEPTED));
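To make the arithmetic concrete: with a promo watermark of 16384 pages, 8192 free pages of which 1024 are unusable for this request and 4096 still unaccepted, to_accept = 16384 - (8192 - 1024 - 4096) = 13312 pages, i.e. the zone keeps accepting MAX_ORDER chunks until the usable, already-accepted free memory reaches the promo watermark (these figures are invented for illustration).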
@@ -6985,22 +7279,17 @@ static bool __free_unaccepted(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long flags;
- bool first = false;
if (!lazy_accept)
return false;
spin_lock_irqsave(&zone->lock, flags);
- first = list_empty(&zone->unaccepted_pages);
list_add_tail(&page->lru, &zone->unaccepted_pages);
account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
__SetPageUnaccepted(page);
spin_unlock_irqrestore(&zone->lock, flags);
- if (first)
- static_branch_inc(&zones_with_unaccepted_pages);
-
return true;
}
@@ -7011,7 +7300,8 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order)
return false;
}
-static bool cond_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags)
{
return false;
}
@@ -7023,3 +7313,93 @@ static bool __free_unaccepted(struct page *page)
}
#endif /* CONFIG_UNACCEPTED_MEMORY */
+
+/**
+ * alloc_pages_nolock - opportunistic reentrant allocation from any context
+ * @nid: node to allocate from
+ * @order: allocation order size
+ *
+ * Allocates pages of a given order from the given node. This is safe to
+ * call from any context (from atomic, NMI, and also reentrant
+ * allocator -> tracepoint -> alloc_pages_nolock_noprof).
+ * Allocation is best effort and expected to fail easily, so nobody should
+ * rely on its success. Failures are not reported via warn_alloc().
+ * See the always-fail conditions below.
+ *
+ * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN.
+ * It means ENOMEM. There is no reason to call it again and expect !NULL.
+ */
+struct page *alloc_pages_nolock_noprof(int nid, unsigned int order)
+{
+ /*
+	 * Do not specify __GFP_DIRECT_RECLAIM, since direct reclaim is not allowed.
+ * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd
+ * is not safe in arbitrary context.
+ *
+ * These two are the conditions for gfpflags_allow_spinning() being true.
+ *
+ * Specify __GFP_NOWARN since failing alloc_pages_nolock() is not a reason
+ * to warn. Also warn would trigger printk() which is unsafe from
+	 * to warn. Also, warning would trigger printk(), which is unsafe from
+ * since the running context is unknown.
+ *
+ * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below
+ * is safe in any context. Also zeroing the page is mandatory for
+ * BPF use cases.
+ *
+ * Though __GFP_NOMEMALLOC is not checked in the code path below,
+ * specify it here to highlight that alloc_pages_nolock()
+ * doesn't want to deplete reserves.
+ */
+ gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC
+ | __GFP_ACCOUNT;
+ unsigned int alloc_flags = ALLOC_TRYLOCK;
+ struct alloc_context ac = { };
+ struct page *page;
+
+ /*
+ * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
+ * unsafe in NMI. If spin_trylock() is called from hard IRQ the current
+ * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will
+ * mark the task as the owner of another rt_spin_lock which will
+	 * confuse PI logic, so return immediately if called from hard IRQ or
+ * NMI.
+ *
+	 * Note that the irqs_disabled() case is OK. This function can be called
+	 * from a raw_spin_lock_irqsave() region.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
+ return NULL;
+ if (!pcp_allowed_order(order))
+ return NULL;
+
+	/* Bail out, since _deferred_grow_zone() needs to take a lock */
+ if (deferred_pages_enabled())
+ return NULL;
+
+ if (nid == NUMA_NO_NODE)
+ nid = numa_node_id();
+
+ prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
+ &alloc_gfp, &alloc_flags);
+
+ /*
+ * Best effort allocation from percpu free list.
+ * If it's empty attempt to spin_trylock zone->lock.
+ */
+ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
+
+ /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
+
+ if (page)
+ set_page_refcounted(page);
+
+ if (memcg_kmem_online() && page &&
+ unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) {
+ free_pages_nolock(page, order);
+ page = NULL;
+ }
+ trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
+ kmsan_alloc_page(page, order, alloc_gfp);
+ return page;
+}
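
A minimal usage sketch of the API documented above — not part of this patch, and assuming the usual alloc_hooks()-style wrapper alloc_pages_nolock() with the same (nid, order) arguments as the _noprof variant; the helper names are hypothetical:

#include <linux/gfp.h>

/* Hypothetical helper: grab one zeroed page from an arbitrary context. */
static struct page *grab_scratch_page(void)
{
	/* Best-effort, trylock-only allocation; expected to fail easily. */
	struct page *page = alloc_pages_nolock(NUMA_NO_NODE, 0);

	/* NULL means ENOMEM here; retrying in a loop is pointless. */
	return page;
}

/* Hypothetical helper: release the page from the same kinds of contexts. */
static void release_scratch_page(struct page *page)
{
	if (page)
		free_pages_nolock(page, 0);
}
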
diff --git a/mm/page_counter.c b/mm/page_counter.c
index b249d15af9dd..661e0f2a5127 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -121,6 +121,7 @@ bool page_counter_try_charge(struct page_counter *counter,
{
struct page_counter *c;
bool protection = track_protection(counter);
+ bool track_failcnt = counter->track_failcnt;
for (c = counter; c; c = c->parent) {
long new;
@@ -146,7 +147,8 @@ bool page_counter_try_charge(struct page_counter *counter,
* inaccuracy in the failcnt which is only used
* to report stats.
*/
- data_race(c->failcnt++);
+ if (track_failcnt)
+ data_race(c->failcnt++);
*fail = c;
goto failed;
}
@@ -288,7 +290,7 @@ int page_counter_memparse(const char *buf, const char *max,
}
-#ifdef CONFIG_MEMCG
+#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
/*
* This function calculates an individual page counter's effective
* protection which is derived from its own memory.min/low, its
@@ -460,4 +462,4 @@ void page_counter_calculate_protection(struct page_counter *root,
atomic_long_read(&parent->children_low_usage),
recursive_protection));
}
-#endif /* CONFIG_MEMCG */
+#endif /* CONFIG_MEMCG || CONFIG_CGROUP_DMEM */
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 641d93f6af4c..c351fdfe9e9a 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -508,6 +508,19 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
#endif
/**
+ * page_ext_lookup() - Lookup a page extension for a PFN.
+ * @pfn: PFN of the page we're interested in.
+ *
+ * Must be called with RCU read lock taken and @pfn must be valid.
+ *
+ * Return: NULL if no page_ext exists for this page.
+ */
+struct page_ext *page_ext_lookup(unsigned long pfn)
+{
+ return lookup_page_ext(pfn_to_page(pfn));
+}
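
A hedged sketch of how a caller could honour the locking rule stated above (RCU read lock held, PFN known to be valid); the helper name is illustrative only:

#include <linux/page_ext.h>

/* Illustrative only: check whether extension data exists for a valid PFN. */
static bool pfn_has_page_ext(unsigned long pfn)
{
	struct page_ext *page_ext;
	bool ret;

	rcu_read_lock();		/* required by page_ext_lookup() */
	page_ext = page_ext_lookup(pfn);
	ret = page_ext != NULL;
	rcu_read_unlock();

	return ret;
}
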
+
+/**
* page_ext_get() - Get the extended information for a page.
* @page: The page we're interested in.
*
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 3f7a203d35c6..d2423f30577e 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -86,7 +86,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
if (page_ref_sub_and_test(page, count))
- free_unref_page(page, compound_order(page));
+ free_frozen_pages(page, compound_order(page));
}
EXPORT_SYMBOL(__page_frag_cache_drain);
@@ -138,7 +138,7 @@ refill:
goto refill;
if (unlikely(encoded_page_decode_pfmemalloc(encoded_page))) {
- free_unref_page(page,
+ free_frozen_pages(page,
encoded_page_decode_order(encoded_page));
goto refill;
}
@@ -166,6 +166,6 @@ void page_frag_free(void *addr)
struct page *page = virt_to_head_page(addr);
if (unlikely(put_page_testzero(page)))
- free_unref_page(page, compound_order(page));
+ free_frozen_pages(page, compound_order(page));
}
EXPORT_SYMBOL(page_frag_free);
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 41ea77f22011..408aaf29a3ea 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -62,9 +62,14 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio,
/*
* For PTE-mapped THP, one sub page is referenced,
* the whole THP is referenced.
+ *
+ * PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages are "old" from a CPU perspective.
+ * The MMU notifier takes care of any device aspects.
*/
- if (ptep_clear_young_notify(vma, addr, pvmw.pte))
- referenced = true;
+ if (likely(pte_present(ptep_get(pvmw.pte))))
+ referenced |= ptep_test_and_clear_young(vma, addr, pvmw.pte);
+ referenced |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE);
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (pmdp_clear_young_notify(vma, addr, pvmw.pmd))
referenced = true;
@@ -112,7 +117,7 @@ static void page_idle_clear_pte_refs(struct folio *folio)
}
static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *attr, char *buf,
+ const struct bin_attribute *attr, char *buf,
loff_t pos, size_t count)
{
u64 *out = (u64 *)buf;
@@ -157,7 +162,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
}
static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
- struct bin_attribute *attr, char *buf,
+ const struct bin_attribute *attr, char *buf,
loff_t pos, size_t count)
{
const u64 *in = (u64 *)buf;
@@ -193,17 +198,17 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
return (char *)in - buf;
}
-static struct bin_attribute page_idle_bitmap_attr =
+static const struct bin_attribute page_idle_bitmap_attr =
__BIN_ATTR(bitmap, 0600,
page_idle_bitmap_read, page_idle_bitmap_write, 0);
-static struct bin_attribute *page_idle_bin_attrs[] = {
+static const struct bin_attribute *const page_idle_bin_attrs[] = {
&page_idle_bitmap_attr,
NULL,
};
static const struct attribute_group page_idle_attr_group = {
- .bin_attrs = page_idle_bin_attrs,
+ .bin_attrs_new = page_idle_bin_attrs,
.name = "page_idle",
};
diff --git a/mm/page_io.c b/mm/page_io.c
index 4b4ea8e49cf6..f7716b6569fa 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -163,7 +163,6 @@ reprobe:
page_no = 1; /* force Empty message */
sis->max = page_no;
sis->pages = page_no - 1;
- sis->highest_bit = page_no - 1;
out:
return ret;
bad_bmap:
@@ -238,9 +237,8 @@ static void swap_zeromap_folio_clear(struct folio *folio)
* We may have stale swap cache pages in memory: notice
* them here and get rid of the unnecessary final write.
*/
-int swap_writepage(struct page *page, struct writeback_control *wbc)
+int swap_writeout(struct folio *folio, struct writeback_control *wbc)
{
- struct folio *folio = page_folio(page);
int ret;
if (folio_free_swap(folio)) {
@@ -639,11 +637,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
if (swap_read_folio_zeromap(folio)) {
folio_unlock(folio);
goto finish;
- } else if (zswap_load(folio)) {
- folio_unlock(folio);
- goto finish;
}
+ if (zswap_load(folio) != -ENOENT)
+ goto finish;
+
/* We have to read from slower devices. Increase zswap protection. */
zswap_folio_swapin(folio);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 7e04047977cf..b2fc5266e3d2 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -83,7 +83,14 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
unsigned int skip_pages;
if (PageHuge(page)) {
- if (!hugepage_migration_supported(folio_hstate(folio)))
+ struct hstate *h;
+
+ /*
+			 * The huge page may be freed, so we cannot
+			 * use folio_hstate() directly.
+ */
+ h = size_to_hstate(folio_size(folio));
+ if (h && !hugepage_migration_supported(h))
return page;
} else if (!folio_test_lru(folio) && !__folio_test_movable(folio)) {
return page;
@@ -286,7 +293,6 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* within a free or in-use page.
* @boundary_pfn: pageblock-aligned pfn that a page might cross
* @flags: isolation flags
- * @gfp_flags: GFP flags used for migrating pages
* @isolate_before: isolate the pageblock before the boundary_pfn
* @skip_isolation: the flag to skip the pageblock isolation in second
* isolate_single_pageblock()
@@ -306,8 +312,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* the in-use page then splitting the free page.
*/
static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
- gfp_t gfp_flags, bool isolate_before, bool skip_isolation,
- int migratetype)
+ bool isolate_before, bool skip_isolation, int migratetype)
{
unsigned long start_pfn;
unsigned long isolate_pageblock;
@@ -444,8 +449,6 @@ failed:
* and PageOffline() pages.
* REPORT_FAILURE - report details about the failure to
* isolate the range
- * @gfp_flags: GFP flags used for migrating pages that sit across the
- * range boundaries.
*
* Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
* the range will never be allocated. Any free pages and pages freed in the
@@ -478,7 +481,7 @@ failed:
* Return: 0 on success and -EBUSY if any part of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- int migratetype, int flags, gfp_t gfp_flags)
+ int migratetype, int flags)
{
unsigned long pfn;
struct page *page;
@@ -489,7 +492,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
bool skip_isolation = false;
/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
- ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false,
+ ret = isolate_single_pageblock(isolate_start, flags, false,
skip_isolation, migratetype);
if (ret)
return ret;
@@ -498,7 +501,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
skip_isolation = true;
/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
- ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true,
+ ret = isolate_single_pageblock(isolate_end, flags, true,
skip_isolation, migratetype);
if (ret) {
unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
@@ -612,6 +615,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
int ret;
/*
+	 * Due to the deferred freeing of hugetlb folios, the hugetlb folios may
+	 * not be released to the buddy system immediately. This can cause
+	 * PageBuddy() to fail in __test_page_isolated_in_pageblock(). To
+	 * ensure that the hugetlb folios are properly released back to the
+	 * buddy system, invoke wait_for_freed_hugetlb_folios() to wait for
+	 * the release to complete.
+ */
+ wait_for_freed_hugetlb_folios();
+
+ /*
* Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free
* pages are not aligned to pageblock_nr_pages.
* Then we just check migratetype first.
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 2d6360eaccbb..9928c9ac8c31 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -229,17 +229,19 @@ static void dec_stack_record_count(depot_stack_handle_t handle,
handle);
}
-static inline void __update_page_owner_handle(struct page_ext *page_ext,
+static inline void __update_page_owner_handle(struct page *page,
depot_stack_handle_t handle,
unsigned short order,
gfp_t gfp_mask,
short last_migrate_reason, u64 ts_nsec,
pid_t pid, pid_t tgid, char *comm)
{
- int i;
+ struct page_ext_iter iter;
+ struct page_ext *page_ext;
struct page_owner *page_owner;
- for (i = 0; i < (1 << order); i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, 1 << order, page_ext, iter) {
page_owner = get_page_owner(page_ext);
page_owner->handle = handle;
page_owner->order = order;
@@ -252,20 +254,22 @@ static inline void __update_page_owner_handle(struct page_ext *page_ext,
sizeof(page_owner->comm));
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
- page_ext = page_ext_next(page_ext);
}
+ rcu_read_unlock();
}
-static inline void __update_page_owner_free_handle(struct page_ext *page_ext,
+static inline void __update_page_owner_free_handle(struct page *page,
depot_stack_handle_t handle,
unsigned short order,
pid_t pid, pid_t tgid,
u64 free_ts_nsec)
{
- int i;
+ struct page_ext_iter iter;
+ struct page_ext *page_ext;
struct page_owner *page_owner;
- for (i = 0; i < (1 << order); i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, 1 << order, page_ext, iter) {
page_owner = get_page_owner(page_ext);
/* Only __reset_page_owner() wants to clear the bit */
if (handle) {
@@ -275,8 +279,8 @@ static inline void __update_page_owner_free_handle(struct page_ext *page_ext,
page_owner->free_ts_nsec = free_ts_nsec;
page_owner->free_pid = current->pid;
page_owner->free_tgid = current->tgid;
- page_ext = page_ext_next(page_ext);
}
+ rcu_read_unlock();
}
void __reset_page_owner(struct page *page, unsigned short order)
@@ -293,11 +297,17 @@ void __reset_page_owner(struct page *page, unsigned short order)
page_owner = get_page_owner(page_ext);
alloc_handle = page_owner->handle;
+ page_ext_put(page_ext);
- handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
- __update_page_owner_free_handle(page_ext, handle, order, current->pid,
+ /*
+	 * Do not specify GFP_NOWAIT, so that gfpflags_allow_spinning() == false,
+	 * which prevents issues in stack_depot_save().
+	 * This is similar to the alloc_pages_nolock() gfp flags, but is only
+	 * used to signal stack_depot to avoid spinlocks.
+ */
+ handle = save_stack(__GFP_NOWARN);
+ __update_page_owner_free_handle(page, handle, order, current->pid,
current->tgid, free_ts_nsec);
- page_ext_put(page_ext);
if (alloc_handle != early_handle)
/*
@@ -313,19 +323,13 @@ void __reset_page_owner(struct page *page, unsigned short order)
noinline void __set_page_owner(struct page *page, unsigned short order,
gfp_t gfp_mask)
{
- struct page_ext *page_ext;
u64 ts_nsec = local_clock();
depot_stack_handle_t handle;
handle = save_stack(gfp_mask);
-
- page_ext = page_ext_get(page);
- if (unlikely(!page_ext))
- return;
- __update_page_owner_handle(page_ext, handle, order, gfp_mask, -1,
+ __update_page_owner_handle(page, handle, order, gfp_mask, -1,
ts_nsec, current->pid, current->tgid,
current->comm);
- page_ext_put(page_ext);
inc_stack_record_count(handle, gfp_mask, 1 << order);
}
@@ -344,44 +348,42 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
void __split_page_owner(struct page *page, int old_order, int new_order)
{
- int i;
- struct page_ext *page_ext = page_ext_get(page);
+ struct page_ext_iter iter;
+ struct page_ext *page_ext;
struct page_owner *page_owner;
- if (unlikely(!page_ext))
- return;
-
- for (i = 0; i < (1 << old_order); i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, 1 << old_order, page_ext, iter) {
page_owner = get_page_owner(page_ext);
page_owner->order = new_order;
- page_ext = page_ext_next(page_ext);
}
- page_ext_put(page_ext);
+ rcu_read_unlock();
}
void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
- int i;
- struct page_ext *old_ext;
- struct page_ext *new_ext;
+ struct page_ext *page_ext;
+ struct page_ext_iter iter;
struct page_owner *old_page_owner;
struct page_owner *new_page_owner;
depot_stack_handle_t migrate_handle;
- old_ext = page_ext_get(&old->page);
- if (unlikely(!old_ext))
+ page_ext = page_ext_get(&old->page);
+ if (unlikely(!page_ext))
return;
- new_ext = page_ext_get(&newfolio->page);
- if (unlikely(!new_ext)) {
- page_ext_put(old_ext);
+ old_page_owner = get_page_owner(page_ext);
+ page_ext_put(page_ext);
+
+ page_ext = page_ext_get(&newfolio->page);
+ if (unlikely(!page_ext))
return;
- }
- old_page_owner = get_page_owner(old_ext);
- new_page_owner = get_page_owner(new_ext);
+ new_page_owner = get_page_owner(page_ext);
+ page_ext_put(page_ext);
+
migrate_handle = new_page_owner->handle;
- __update_page_owner_handle(new_ext, old_page_owner->handle,
+ __update_page_owner_handle(&newfolio->page, old_page_owner->handle,
old_page_owner->order, old_page_owner->gfp_mask,
old_page_owner->last_migrate_reason,
old_page_owner->ts_nsec, old_page_owner->pid,
@@ -391,7 +393,7 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
* will be freed after migration. Keep them until then as they may be
* useful.
*/
- __update_page_owner_free_handle(new_ext, 0, old_page_owner->order,
+ __update_page_owner_free_handle(&newfolio->page, 0, old_page_owner->order,
old_page_owner->free_pid,
old_page_owner->free_tgid,
old_page_owner->free_ts_nsec);
@@ -400,14 +402,12 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
* for the new one and the old folio otherwise there will be an imbalance
* when subtracting those pages from the stack.
*/
- for (i = 0; i < (1 << new_page_owner->order); i++) {
+ rcu_read_lock();
+ for_each_page_ext(&old->page, 1 << new_page_owner->order, page_ext, iter) {
+ old_page_owner = get_page_owner(page_ext);
old_page_owner->handle = migrate_handle;
- old_ext = page_ext_next(old_ext);
- old_page_owner = get_page_owner(old_ext);
}
-
- page_ext_put(new_ext);
- page_ext_put(old_ext);
+ rcu_read_unlock();
}
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -507,7 +507,7 @@ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
rcu_read_lock();
memcg_data = READ_ONCE(page->memcg_data);
- if (!memcg_data)
+ if (!memcg_data || PageTail(page))
goto out_unlock;
if (memcg_data & MEMCG_DATA_OBJEXTS)
@@ -813,7 +813,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
goto ext_put_continue;
/* Found early allocated page */
- __update_page_owner_handle(page_ext, early_handle, 0, 0,
+ __update_page_owner_handle(page, early_handle, 0, 0,
-1, local_clock(), current->pid,
current->tgid, current->comm);
count++;
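
The page_owner conversions above all follow the same iteration idiom; a condensed sketch (it would live inside mm/page_owner.c, since get_page_owner() and struct page_owner are private to that file), with the loop body left abstract:

/* Sketch: visit the page_ext of every base page in a 2^order range. */
static void visit_page_owner_range(struct page *page, unsigned int order)
{
	struct page_ext_iter iter;
	struct page_ext *page_ext;

	rcu_read_lock();	/* replaces the page_ext_get()/page_ext_put() pair */
	for_each_page_ext(page, 1 << order, page_ext, iter) {
		struct page_owner *page_owner = get_page_owner(page_ext);

		/* ... read or update *page_owner here ... */
		(void)page_owner;
	}
	rcu_read_unlock();
}
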
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 509c6ef8de40..4eeca782b888 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -62,24 +62,20 @@ static struct page_table_check *get_page_table_check(struct page_ext *page_ext)
*/
static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt)
{
+ struct page_ext_iter iter;
struct page_ext *page_ext;
struct page *page;
- unsigned long i;
bool anon;
if (!pfn_valid(pfn))
return;
page = pfn_to_page(pfn);
- page_ext = page_ext_get(page);
-
- if (!page_ext)
- return;
-
BUG_ON(PageSlab(page));
anon = PageAnon(page);
- for (i = 0; i < pgcnt; i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, pgcnt, page_ext, iter) {
struct page_table_check *ptc = get_page_table_check(page_ext);
if (anon) {
@@ -89,9 +85,8 @@ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt)
BUG_ON(atomic_read(&ptc->anon_map_count));
BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
}
- page_ext = page_ext_next(page_ext);
}
- page_ext_put(page_ext);
+ rcu_read_unlock();
}
/*
@@ -102,24 +97,20 @@ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt)
static void page_table_check_set(unsigned long pfn, unsigned long pgcnt,
bool rw)
{
+ struct page_ext_iter iter;
struct page_ext *page_ext;
struct page *page;
- unsigned long i;
bool anon;
if (!pfn_valid(pfn))
return;
page = pfn_to_page(pfn);
- page_ext = page_ext_get(page);
-
- if (!page_ext)
- return;
-
BUG_ON(PageSlab(page));
anon = PageAnon(page);
- for (i = 0; i < pgcnt; i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, pgcnt, page_ext, iter) {
struct page_table_check *ptc = get_page_table_check(page_ext);
if (anon) {
@@ -129,9 +120,8 @@ static void page_table_check_set(unsigned long pfn, unsigned long pgcnt,
BUG_ON(atomic_read(&ptc->anon_map_count));
BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0);
}
- page_ext = page_ext_next(page_ext);
}
- page_ext_put(page_ext);
+ rcu_read_unlock();
}
/*
@@ -140,24 +130,19 @@ static void page_table_check_set(unsigned long pfn, unsigned long pgcnt,
*/
void __page_table_check_zero(struct page *page, unsigned int order)
{
+ struct page_ext_iter iter;
struct page_ext *page_ext;
- unsigned long i;
BUG_ON(PageSlab(page));
- page_ext = page_ext_get(page);
-
- if (!page_ext)
- return;
-
- for (i = 0; i < (1ul << order); i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, 1 << order, page_ext, iter) {
struct page_table_check *ptc = get_page_table_check(page_ext);
BUG_ON(atomic_read(&ptc->anon_map_count));
BUG_ON(atomic_read(&ptc->file_map_count));
- page_ext = page_ext_next(page_ext);
}
- page_ext_put(page_ext);
+ rcu_read_unlock();
}
void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
@@ -196,9 +181,8 @@ EXPORT_SYMBOL(__page_table_check_pud_clear);
/* Whether the swap entry cached writable information */
static inline bool swap_cached_writable(swp_entry_t entry)
{
- return is_writable_device_exclusive_entry(entry) ||
- is_writable_device_private_entry(entry) ||
- is_writable_migration_entry(entry);
+ return is_writable_device_private_entry(entry) ||
+ is_writable_migration_entry(entry);
}
static inline void page_table_check_pte_flags(pte_t pte)
@@ -234,33 +218,39 @@ static inline void page_table_check_pmd_flags(pmd_t pmd)
WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd)));
}
-void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd)
+void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd,
+ unsigned int nr)
{
+ unsigned long stride = PMD_SIZE >> PAGE_SHIFT;
+ unsigned int i;
+
if (&init_mm == mm)
return;
page_table_check_pmd_flags(pmd);
- __page_table_check_pmd_clear(mm, *pmdp);
- if (pmd_user_accessible_page(pmd)) {
- page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT,
- pmd_write(pmd));
- }
+ for (i = 0; i < nr; i++)
+ __page_table_check_pmd_clear(mm, *(pmdp + i));
+ if (pmd_user_accessible_page(pmd))
+ page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd));
}
-EXPORT_SYMBOL(__page_table_check_pmd_set);
+EXPORT_SYMBOL(__page_table_check_pmds_set);
-void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud)
+void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud,
+ unsigned int nr)
{
+ unsigned long stride = PUD_SIZE >> PAGE_SHIFT;
+ unsigned int i;
+
if (&init_mm == mm)
return;
- __page_table_check_pud_clear(mm, *pudp);
- if (pud_user_accessible_page(pud)) {
- page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT,
- pud_write(pud));
- }
+ for (i = 0; i < nr; i++)
+ __page_table_check_pud_clear(mm, *(pudp + i));
+ if (pud_user_accessible_page(pud))
+ page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud));
}
-EXPORT_SYMBOL(__page_table_check_pud_set);
+EXPORT_SYMBOL(__page_table_check_puds_set);
void __page_table_check_pte_clear_range(struct mm_struct *mm,
unsigned long addr,
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 81839a9e74f1..e463c3be934a 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -84,6 +84,7 @@ again:
* mapped at the @pvmw->pte
* @pvmw: page_vma_mapped_walk struct, includes a pair pte and pfn range
* for checking
+ * @pte_nr: the number of small pages described by @pvmw->pte.
*
* page_vma_mapped_walk() found a place where pfn range is *potentially*
* mapped. check_pte() has to validate this.
@@ -100,7 +101,7 @@ again:
* Otherwise, return false.
*
*/
-static bool check_pte(struct page_vma_mapped_walk *pvmw)
+static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr)
{
unsigned long pfn;
pte_t ptent = ptep_get(pvmw->pte);
@@ -111,8 +112,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
return false;
entry = pte_to_swp_entry(ptent);
- if (!is_migration_entry(entry) &&
- !is_device_exclusive_entry(entry))
+ if (!is_migration_entry(entry))
return false;
pfn = swp_offset_pfn(entry);
@@ -133,7 +133,11 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
pfn = pte_pfn(ptent);
}
- return (pfn - pvmw->pfn) < pvmw->nr_pages;
+ if ((pfn + pte_nr - 1) < pvmw->pfn)
+ return false;
+ if (pfn > (pvmw->pfn + pvmw->nr_pages - 1))
+ return false;
+ return true;
}
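
The two early returns added above form a closed-interval overlap test between [pfn, pfn + pte_nr - 1] and [pvmw->pfn, pvmw->pfn + pvmw->nr_pages - 1]; restated standalone for clarity (illustrative, not part of the patch):

#include <linux/types.h>

/* True if [a, a + a_len - 1] overlaps [b, b + b_len - 1]. */
static inline bool pfn_ranges_overlap(unsigned long a, unsigned long a_len,
				      unsigned long b, unsigned long b_len)
{
	if (a + a_len - 1 < b)		/* a ends before b starts */
		return false;
	if (a > b + b_len - 1)		/* a starts after b ends */
		return false;
	return true;
}
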
/* Returns true if the two ranges overlap. Careful to not overflow. */
@@ -208,7 +212,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
return false;
pvmw->ptl = huge_pte_lock(hstate, mm, pvmw->pte);
- if (!check_pte(pvmw))
+ if (!check_pte(pvmw, pages_per_huge_page(hstate)))
return not_found(pvmw);
return true;
}
@@ -291,7 +295,7 @@ restart:
goto next_pte;
}
this_pte:
- if (check_pte(pvmw))
+ if (check_pte(pvmw, 1))
return true;
next_pte:
do {
diff --git a/mm/percpu.c b/mm/percpu.c
index d8dd31a2e407..b35494c8ede2 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1359,10 +1359,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
/* allocate chunk */
alloc_size = struct_size(chunk, populated,
BITS_TO_LONGS(region_size >> PAGE_SHIFT));
- chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!chunk)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ chunk = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
INIT_LIST_HEAD(&chunk->list);
@@ -1374,24 +1371,14 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
region_bits = pcpu_chunk_map_bits(chunk);
alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
- chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!chunk->alloc_map)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ chunk->alloc_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
alloc_size =
BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
- chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!chunk->bound_map)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ chunk->bound_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
- chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!chunk->md_blocks)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
-
+ chunk->md_blocks = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
#ifdef NEED_PCPUOBJ_EXT
/* first chunk is free to use */
chunk->obj_exts = NULL;
@@ -1758,7 +1745,7 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
gfp = current_gfp_context(gfp);
/* whitelisted flags that can be passed to the backing allocators */
pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
- is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
+ is_atomic = !gfpflags_allow_blocking(gfp);
do_warn = !(gfp & __GFP_NOWARN);
/*
@@ -2204,7 +2191,12 @@ static void pcpu_balance_workfn(struct work_struct *work)
* to grow other chunks. This then gives pcpu_reclaim_populated() time
* to move fully free chunks to the active list to be freed if
* appropriate.
+ *
+ * Enforce GFP_NOIO allocations because we have pcpu_alloc users
+	 * constrained to GFP_NOIO/NOFS contexts, and they could form a lock
+	 * dependency through pcpu_alloc_mutex.
*/
+ unsigned int flags = memalloc_noio_save();
mutex_lock(&pcpu_alloc_mutex);
spin_lock_irq(&pcpu_lock);
@@ -2215,6 +2207,7 @@ static void pcpu_balance_workfn(struct work_struct *work)
spin_unlock_irq(&pcpu_lock);
mutex_unlock(&pcpu_alloc_mutex);
+ memalloc_noio_restore(flags);
}
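
The scoping used above is the generic memalloc_noio_save()/memalloc_noio_restore() pairing; a minimal sketch of the idiom, independent of the percpu internals (the worker callback is hypothetical):

#include <linux/sched/mm.h>

/* Sketch: run a callback with __GFP_IO implicitly stripped from allocations. */
static void run_with_noio_scope(void (*work)(void *), void *arg)
{
	unsigned int noio_flags = memalloc_noio_save();

	/* Every allocation in this scope behaves as if GFP_NOIO were passed. */
	work(arg);

	memalloc_noio_restore(noio_flags);
}
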
/**
@@ -2595,28 +2588,16 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
/* process group information and build config tables accordingly */
alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
- group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!group_offsets)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ group_offsets = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
- group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!group_sizes)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ group_sizes = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
- unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!unit_map)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ unit_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
- unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!unit_off)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ unit_off = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = UINT_MAX;
@@ -2685,12 +2666,9 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_free_slot = pcpu_sidelined_slot + 1;
pcpu_to_depopulate_slot = pcpu_free_slot + 1;
pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
- pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
+ pcpu_chunk_lists = memblock_alloc_or_panic(pcpu_nr_slots *
sizeof(pcpu_chunk_lists[0]),
SMP_CACHE_BYTES);
- if (!pcpu_chunk_lists)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]));
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_chunk_lists[i]);
@@ -3099,7 +3077,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
continue;
}
/* copy and return the unused part */
- memcpy(ptr, __per_cpu_load, ai->static_size);
+ memcpy(ptr, __per_cpu_start, ai->static_size);
pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
}
}
@@ -3155,25 +3133,19 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
pmd_t *pmd;
if (pgd_none(*pgd)) {
- p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
- if (!p4d)
- goto err_alloc;
+ p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
pgd_populate(&init_mm, pgd, p4d);
}
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
- pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
- if (!pud)
- goto err_alloc;
+ pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
p4d_populate(&init_mm, p4d, pud);
}
pud = pud_offset(p4d, addr);
if (pud_none(*pud)) {
- pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
- if (!pmd)
- goto err_alloc;
+ pmd = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
pud_populate(&init_mm, pud, pmd);
}
@@ -3181,16 +3153,11 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
if (!pmd_present(*pmd)) {
pte_t *new;
- new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
- if (!new)
- goto err_alloc;
+ new = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
pmd_populate_kernel(&init_mm, pmd, new);
}
return;
-
-err_alloc:
- panic("%s: Failed to allocate memory\n", __func__);
}
/**
@@ -3237,10 +3204,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
sizeof(pages[0]));
- pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
- if (!pages)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- pages_size);
+ pages = memblock_alloc_or_panic(pages_size, SMP_CACHE_BYTES);
/* allocate pages */
j = 0;
@@ -3282,7 +3246,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t
flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size);
/* copy static data */
- memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
+ memcpy((void *)unit_addr, __per_cpu_start, ai->static_size);
}
/* we're ready, commit */
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 5297dcc38c37..5a882f2b10f9 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -279,7 +279,7 @@ static unsigned long pmdp_get_lockless_start(void) { return 0; }
static void pmdp_get_lockless_end(unsigned long irqflags) { }
#endif
-pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
+pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
{
unsigned long irqflags;
pmd_t pmdval;
diff --git a/mm/pt_reclaim.c b/mm/pt_reclaim.c
new file mode 100644
index 000000000000..7e9455a18aae
--- /dev/null
+++ b/mm/pt_reclaim.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/hugetlb.h>
+#include <asm-generic/tlb.h>
+#include <asm/pgalloc.h>
+
+#include "internal.h"
+
+bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
+ struct zap_details *details)
+{
+ return details && details->reclaim_pt && (end - start >= PMD_SIZE);
+}
+
+bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval)
+{
+ spinlock_t *pml = pmd_lockptr(mm, pmd);
+
+ if (!spin_trylock(pml))
+ return false;
+
+ *pmdval = pmdp_get_lockless(pmd);
+ pmd_clear(pmd);
+ spin_unlock(pml);
+
+ return true;
+}
+
+void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb,
+ pmd_t pmdval)
+{
+ pte_free_tlb(tlb, pmd_pgtable(pmdval), addr);
+ mm_dec_nr_ptes(mm);
+}
+
+void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+ struct mmu_gather *tlb)
+{
+ pmd_t pmdval;
+ spinlock_t *pml, *ptl = NULL;
+ pte_t *start_pte, *pte;
+ int i;
+
+ pml = pmd_lock(mm, pmd);
+ start_pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl);
+ if (!start_pte)
+ goto out_ptl;
+ if (ptl != pml)
+ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+
+ /* Check if it is empty PTE page */
+ for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) {
+ if (!pte_none(ptep_get(pte)))
+ goto out_ptl;
+ }
+ pte_unmap(start_pte);
+
+ pmd_clear(pmd);
+
+ if (ptl != pml)
+ spin_unlock(ptl);
+ spin_unlock(pml);
+
+ free_pte(mm, addr, tlb, pmdval);
+
+ return;
+out_ptl:
+ if (start_pte)
+ pte_unmap_unlock(start_pte, ptl);
+ if (ptl != pml)
+ spin_unlock(pml);
+}
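
A hedged sketch of how a zap path might drive the helpers above; the real call site lives in mm/memory.c and is not part of this hunk, so the wrapper and its parameters are assumptions:

/* Sketch: after zapping [addr, end) under a given pmd, try to drop the PTE page. */
static void maybe_reclaim_pte_page(struct mm_struct *mm, pmd_t *pmd,
				   unsigned long addr, unsigned long end,
				   struct mmu_gather *tlb,
				   struct zap_details *details)
{
	if (!reclaim_pt_is_enabled(addr, end, details))
		return;

	/* Frees the PTE page only if every entry turned out to be pte_none(). */
	try_to_free_pte(mm, pmd, addr, tlb);
}
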
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 106e1d66e9f9..9374f29cdc6f 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -18,7 +18,7 @@ static inline int note_kasan_page_table(struct mm_walk *walk,
{
struct ptdump_state *st = walk->private;
- st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
+ st->note_page_pte(st, addr, kasan_early_shadow_pte[0]);
walk->action = ACTION_CONTINUE;
@@ -38,11 +38,11 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 0, pgd_val(val));
+ if (st->effective_prot_pgd)
+ st->effective_prot_pgd(st, val);
if (pgd_leaf(val)) {
- st->note_page(st, addr, 0, pgd_val(val));
+ st->note_page_pgd(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -61,11 +61,11 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 1, p4d_val(val));
+ if (st->effective_prot_p4d)
+ st->effective_prot_p4d(st, val);
if (p4d_leaf(val)) {
- st->note_page(st, addr, 1, p4d_val(val));
+ st->note_page_p4d(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -84,11 +84,11 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 2, pud_val(val));
+ if (st->effective_prot_pud)
+ st->effective_prot_pud(st, val);
if (pud_leaf(val)) {
- st->note_page(st, addr, 2, pud_val(val));
+ st->note_page_pud(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -106,10 +106,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 3, pmd_val(val));
+ if (st->effective_prot_pmd)
+ st->effective_prot_pmd(st, val);
if (pmd_leaf(val)) {
- st->note_page(st, addr, 3, pmd_val(val));
+ st->note_page_pmd(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -122,10 +122,10 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
struct ptdump_state *st = walk->private;
pte_t val = ptep_get_lockless(pte);
- if (st->effective_prot)
- st->effective_prot(st, 4, pte_val(val));
+ if (st->effective_prot_pte)
+ st->effective_prot_pte(st, val);
- st->note_page(st, addr, 4, pte_val(val));
+ st->note_page_pte(st, addr, val);
return 0;
}
@@ -134,9 +134,31 @@ static int ptdump_hole(unsigned long addr, unsigned long next,
int depth, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
-
- st->note_page(st, addr, depth, 0);
-
+ pte_t pte_zero = {0};
+ pmd_t pmd_zero = {0};
+ pud_t pud_zero = {0};
+ p4d_t p4d_zero = {0};
+ pgd_t pgd_zero = {0};
+
+ switch (depth) {
+ case 4:
+ st->note_page_pte(st, addr, pte_zero);
+ break;
+ case 3:
+ st->note_page_pmd(st, addr, pmd_zero);
+ break;
+ case 2:
+ st->note_page_pud(st, addr, pud_zero);
+ break;
+ case 1:
+ st->note_page_p4d(st, addr, p4d_zero);
+ break;
+ case 0:
+ st->note_page_pgd(st, addr, pgd_zero);
+ break;
+ default:
+ break;
+ }
return 0;
}
@@ -162,7 +184,7 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
mmap_write_unlock(mm);
/* Flush out the last page */
- st->note_page(st, 0, -1, 0);
+ st->note_page_flush(st);
}
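
Consumers of ptdump now provide one note_page callback per level plus a flush hook instead of a single note_page(); a hedged sketch of a minimal ptdump_state, assuming the struct fields match the call sites above (void return, (state, addr, value) arguments) — the bodies are placeholders, and in practice every level plus the flush hook would be populated because the walker invokes them unconditionally:

#include <linux/ptdump.h>

static void my_note_pte(struct ptdump_state *st, unsigned long addr, pte_t pte)
{
	/* record or print the PTE-level entry */
}

static void my_note_pmd(struct ptdump_state *st, unsigned long addr, pmd_t pmd)
{
	/* record or print the PMD-level entry */
}

static void my_note_flush(struct ptdump_state *st)
{
	/* flush out the last pending range */
}

static struct ptdump_state my_ptdump_state = {
	.note_page_pte	 = my_note_pte,
	.note_page_pmd	 = my_note_pmd,
	/* .note_page_pud / .note_page_p4d / .note_page_pgd analogously */
	.note_page_flush = my_note_flush,
};
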
static int check_wx_show(struct seq_file *m, void *v)
diff --git a/mm/readahead.c b/mm/readahead.c
index 8f1cf599b572..20d36d6b055e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,20 +158,10 @@ static void read_pages(struct readahead_control *rac)
if (aops->readahead) {
aops->readahead(rac);
- /*
- * Clean up the remaining folios. The sizes in ->ra
- * may be used to size the next readahead, so make sure
- * they accurately reflect what happened.
- */
+ /* Clean up the remaining folios. */
while ((folio = readahead_folio(rac)) != NULL) {
- unsigned long nr = folio_nr_pages(folio);
-
folio_get(folio);
- rac->ra->size -= nr;
- if (rac->ra->async_size >= nr) {
- rac->ra->async_size -= nr;
- filemap_remove_folio(folio);
- }
+ filemap_remove_folio(folio);
folio_unlock(folio);
folio_put(folio);
}
@@ -188,6 +178,18 @@ static void read_pages(struct readahead_control *rac)
BUG_ON(readahead_count(rac));
}
+static struct folio *ractl_alloc_folio(struct readahead_control *ractl,
+ gfp_t gfp_mask, unsigned int order)
+{
+ struct folio *folio;
+
+ folio = filemap_alloc_folio(gfp_mask, order);
+ if (folio && ractl->dropbehind)
+ __folio_set_dropbehind(folio);
+
+ return folio;
+}
+
/**
* page_cache_ra_unbounded - Start unchecked readahead.
* @ractl: Readahead control.
@@ -265,8 +267,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
continue;
}
- folio = filemap_alloc_folio(gfp_mask,
- mapping_min_folio_order(mapping));
+ folio = ractl_alloc_folio(ractl, gfp_mask,
+ mapping_min_folio_order(mapping));
if (!folio)
break;
@@ -436,7 +438,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
pgoff_t mark, unsigned int order, gfp_t gfp)
{
int err;
- struct folio *folio = filemap_alloc_folio(gfp, order);
+ struct folio *folio = ractl_alloc_folio(ractl, gfp, order);
if (!folio)
return -ENOMEM;
@@ -517,12 +519,18 @@ void page_cache_ra_order(struct readahead_control *ractl,
/*
* If there were already pages in the page cache, then we may have
* left some gaps. Let the regular readahead code take care of this
- * situation.
+ * situation below.
*/
if (!err)
return;
fallback:
- do_page_cache_ra(ractl, ra->size - (index - start), ra->async_size);
+ /*
+	 * ->readahead() may have updated the readahead window size, so we have
+	 * to check that there's still something to read.
+ */
+ if (ra->size > index - start)
+ do_page_cache_ra(ractl, ra->size - (index - start),
+ ra->async_size);
}
static unsigned long ractl_max_pages(struct readahead_control *ractl,
@@ -647,7 +655,11 @@ void page_cache_async_ra(struct readahead_control *ractl,
1UL << order);
if (index == expected) {
ra->start += ra->size;
- ra->size = get_next_ra_size(ra, max_pages);
+ /*
+ * In the case of MADV_HUGEPAGE, the actual size might exceed
+ * the readahead window.
+ */
+ ra->size = max(ra->size, get_next_ra_size(ra, max_pages));
ra->async_size = ra->size;
goto readit;
}
@@ -678,9 +690,15 @@ EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
+ struct file *file;
+ const struct inode *inode;
+
CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
- if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+ file = fd_file(f);
+ if (!(file->f_mode & FMODE_READ))
return -EBADF;
/*
@@ -688,9 +706,15 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
* that can execute readahead. If readahead is not possible
* on this file, then we must return -EINVAL.
*/
- if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
- (!S_ISREG(file_inode(fd_file(f))->i_mode) &&
- !S_ISBLK(file_inode(fd_file(f))->i_mode)))
+ if (!file->f_mapping)
+ return -EINVAL;
+ if (!file->f_mapping->a_ops)
+ return -EINVAL;
+
+ inode = file_inode(file);
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ return -EINVAL;
+ if (IS_ANON_FILE(inode))
return -EINVAL;
return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
@@ -751,7 +775,7 @@ void readahead_expand(struct readahead_control *ractl,
if (folio && !xa_is_value(folio))
return; /* Folio apparently present */
- folio = filemap_alloc_folio(gfp_mask, min_order);
+ folio = ractl_alloc_folio(ractl, gfp_mask, min_order);
if (!folio)
return;
@@ -780,7 +804,7 @@ void readahead_expand(struct readahead_control *ractl,
if (folio && !xa_is_value(folio))
return; /* Folio apparently present */
- folio = filemap_alloc_folio(gfp_mask, min_order);
+ folio = ractl_alloc_folio(ractl, gfp_mask, min_order);
if (!folio)
return;
diff --git a/mm/rmap.c b/mm/rmap.c
index c6c4d4ea29a7..fb63d9256f09 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -672,7 +672,7 @@ void try_to_unmap_flush_dirty(void)
(TLB_FLUSH_BATCH_PENDING_MASK / 2)
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
- unsigned long uaddr)
+ unsigned long start, unsigned long end)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
int batch;
@@ -681,7 +681,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
if (!pte_accessible(mm, pteval))
return;
- arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr);
+ arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, start, end);
tlb_ubc->flush_required = true;
/*
@@ -757,7 +757,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
- unsigned long uaddr)
+ unsigned long start, unsigned long end)
{
}
@@ -774,7 +774,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
* @vma: The VMA we need to know the address in.
*
* Calculates the user virtual address of this page in the specified VMA.
- * It is the caller's responsibililty to check the page is actually
+ * It is the caller's responsibility to check the page is actually
* within the VMA. There may not currently be a PTE pointing at this
* page, but if a page fault occurs at this address, this is the page
* which will be accessed.
@@ -789,13 +789,13 @@ unsigned long page_address_in_vma(const struct folio *folio,
const struct page *page, const struct vm_area_struct *vma)
{
if (folio_test_anon(folio)) {
- struct anon_vma *page__anon_vma = folio_anon_vma(folio);
+ struct anon_vma *anon_vma = folio_anon_vma(folio);
/*
* Note: swapoff's unuse_vma() is more efficient with this
* check, and needs it to match anon_vma when KSM is active.
*/
- if (!vma->anon_vma || !page__anon_vma ||
- vma->anon_vma->root != page__anon_vma->root)
+ if (!vma->anon_vma || !anon_vma ||
+ vma->anon_vma->root != anon_vma->root)
return -EFAULT;
} else if (!vma->vm_file) {
return -EFAULT;
@@ -803,7 +803,7 @@ unsigned long page_address_in_vma(const struct folio *folio,
return -EFAULT;
}
- /* KSM folios don't reach here because of the !page__anon_vma check */
+ /* KSM folios don't reach here because of the !anon_vma check */
return vma_address(vma, page_pgoff(folio, page), 1);
}
@@ -889,7 +889,7 @@ static bool folio_referenced_one(struct folio *folio,
if ((!atomic_read(&vma->vm_mm->mm_users) ||
check_stable_address_space(vma->vm_mm)) &&
folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_likely_mapped_shared(folio)) {
+ !folio_maybe_mapped_shared(folio)) {
pra->referenced = -1;
page_vma_mapped_walk_done(&pvmw);
return false;
@@ -1044,6 +1044,14 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
pte_t *pte = pvmw->pte;
pte_t entry = ptep_get(pte);
+ /*
+ * PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages are clean and not writable from a
+ * CPU perspective. The MMU notifier takes care of any
+ * device aspects.
+ */
+ if (!pte_present(entry))
+ continue;
if (!pte_dirty(entry) && !pte_write(entry))
continue;
@@ -1127,6 +1135,80 @@ int folio_mkclean(struct folio *folio)
}
EXPORT_SYMBOL_GPL(folio_mkclean);
+struct wrprotect_file_state {
+ int cleaned;
+ pgoff_t pgoff;
+ unsigned long pfn;
+ unsigned long nr_pages;
+};
+
+static bool mapping_wrprotect_range_one(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long address, void *arg)
+{
+ struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg;
+ struct page_vma_mapped_walk pvmw = {
+ .pfn = state->pfn,
+ .nr_pages = state->nr_pages,
+ .pgoff = state->pgoff,
+ .vma = vma,
+ .address = address,
+ .flags = PVMW_SYNC,
+ };
+
+ state->cleaned += page_vma_mkclean_one(&pvmw);
+
+ return true;
+}
+
+static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
+ pgoff_t pgoff_start, unsigned long nr_pages,
+ struct rmap_walk_control *rwc, bool locked);
+
+/**
+ * mapping_wrprotect_range() - Write-protect all mappings in a specified range.
+ *
+ * @mapping: The mapping whose reverse mapping should be traversed.
+ * @pgoff: The page offset at which @pfn is mapped within @mapping.
+ * @pfn: The PFN of the page mapped in @mapping at @pgoff.
+ * @nr_pages: The number of physically contiguous base pages spanned.
+ *
+ * Traverses the reverse mapping, finding all VMAs which contain a shared
+ * mapping of the pages in the specified range in @mapping, and write-protects
+ * them (that is, updates the page tables to mark the mappings read-only such
+ * that a write protection fault arises when the mappings are written to).
+ *
+ * The @pfn value need not refer to a folio, but rather can reference a kernel
+ * allocation which is mapped into userland. We therefore do not require that
+ * the page maps to a folio with a valid mapping or index field; rather, the
+ * caller specifies these in @mapping and @pgoff.
+ *
+ * Return: the number of write-protected PTEs, or an error.
+ */
+int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
+ unsigned long pfn, unsigned long nr_pages)
+{
+ struct wrprotect_file_state state = {
+ .cleaned = 0,
+ .pgoff = pgoff,
+ .pfn = pfn,
+ .nr_pages = nr_pages,
+ };
+ struct rmap_walk_control rwc = {
+ .arg = (void *)&state,
+ .rmap_one = mapping_wrprotect_range_one,
+ .invalid_vma = invalid_mkclean_vma,
+ };
+
+ if (!mapping)
+ return 0;
+
+ __rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc,
+ /* locked = */false);
+
+ return state.cleaned;
+}
+EXPORT_SYMBOL_GPL(mapping_wrprotect_range);
+
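
A hedged usage sketch for the API documented above; the scenario (write-protecting a kernel-owned buffer that has been mapped into userspace) and all names are assumptions, not taken from this patch:

/* Sketch: make future userspace writes to the mapped range fault. */
static int wrprotect_mapped_buffer(struct address_space *mapping,
				   pgoff_t first_pgoff, unsigned long first_pfn,
				   unsigned long nr_pages)
{
	int cleaned = mapping_wrprotect_range(mapping, first_pgoff,
					      first_pfn, nr_pages);

	if (cleaned < 0)
		return cleaned;		/* error from the rmap walk */

	/* 'cleaned' mappings were downgraded to read-only. */
	return 0;
}
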
/**
* pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
* [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
@@ -1160,8 +1242,8 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
}
static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
- struct page *page, int nr_pages, enum rmap_level level,
- int *nr_pmdmapped)
+ struct page *page, int nr_pages, struct vm_area_struct *vma,
+ enum rmap_level level, int *nr_pmdmapped)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
const int orig_nr_pages = nr_pages;
@@ -1176,6 +1258,16 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
break;
}
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ nr = folio_add_return_large_mapcount(folio, orig_nr_pages, vma);
+ if (nr == orig_nr_pages)
+ /* Was completely unmapped. */
+ nr = folio_large_nr_pages(folio);
+ else
+ nr = 0;
+ break;
+ }
+
do {
first += atomic_inc_and_test(&page->_mapcount);
} while (page++, --nr_pages > 0);
@@ -1184,15 +1276,34 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED)
nr = first;
- atomic_add(orig_nr_pages, &folio->_large_mapcount);
+ folio_add_large_mapcount(folio, orig_nr_pages, vma);
break;
case RMAP_LEVEL_PMD:
+ case RMAP_LEVEL_PUD:
first = atomic_inc_and_test(&folio->_entire_mapcount);
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ if (level == RMAP_LEVEL_PMD && first)
+ *nr_pmdmapped = folio_large_nr_pages(folio);
+ nr = folio_inc_return_large_mapcount(folio, vma);
+ if (nr == 1)
+ /* Was completely unmapped. */
+ nr = folio_large_nr_pages(folio);
+ else
+ nr = 0;
+ break;
+ }
+
if (first) {
nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) {
- *nr_pmdmapped = folio_nr_pages(folio);
- nr = *nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
+ nr_pages = folio_large_nr_pages(folio);
+ /*
+ * We only track PMD mappings of PMD-sized
+ * folios separately.
+ */
+ if (level == RMAP_LEVEL_PMD)
+ *nr_pmdmapped = nr_pages;
+ nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of a remove and another add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1201,7 +1312,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
nr = 0;
}
}
- atomic_inc(&folio->_large_mapcount);
+ folio_inc_large_mapcount(folio, vma);
break;
}
return nr;
@@ -1322,7 +1433,7 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio,
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
+ nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
if (likely(!folio_test_ksm(folio)))
__page_check_anon_rmap(folio, page, vma, address);
@@ -1338,15 +1449,32 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio,
case RMAP_LEVEL_PMD:
SetPageAnonExclusive(page);
break;
+ case RMAP_LEVEL_PUD:
+ /*
+ * Keep the compiler happy, we don't support anonymous
+ * PUD mappings.
+ */
+ WARN_ON_ONCE(1);
+ break;
}
}
+
+ VM_WARN_ON_FOLIO(!folio_test_large(folio) && PageAnonExclusive(page) &&
+ atomic_read(&folio->_mapcount) > 0, folio);
for (i = 0; i < nr_pages; i++) {
struct page *cur_page = page + i;
- /* While PTE-mapping a THP we have a PMD and a PTE mapping. */
- VM_WARN_ON_FOLIO((atomic_read(&cur_page->_mapcount) > 0 ||
- (folio_test_large(folio) &&
- folio_entire_mapcount(folio) > 1)) &&
+ VM_WARN_ON_FOLIO(folio_test_large(folio) &&
+ folio_entire_mapcount(folio) > 1 &&
+ PageAnonExclusive(cur_page), folio);
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
+ continue;
+
+ /*
+ * While PTE-mapping a THP we have a PMD and a PTE
+ * mapping.
+ */
+ VM_WARN_ON_FOLIO(atomic_read(&cur_page->_mapcount) > 0 &&
PageAnonExclusive(cur_page), folio);
}
@@ -1426,14 +1554,11 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, rmap_t flags)
{
- const int nr = folio_nr_pages(folio);
const bool exclusive = flags & RMAP_EXCLUSIVE;
- int nr_pmdmapped = 0;
+ int nr = 1, nr_pmdmapped = 0;
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio);
- VM_BUG_ON_VMA(address < vma->vm_start ||
- address + (nr << PAGE_SHIFT) > vma->vm_end, vma);
/*
* VM_DROPPABLE mappings don't swap; instead they're just dropped when
@@ -1451,29 +1576,35 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
} else if (!folio_test_pmd_mappable(folio)) {
int i;
+ nr = folio_large_nr_pages(folio);
for (i = 0; i < nr; i++) {
struct page *page = folio_page(folio, i);
- /* increment count (starts at -1) */
- atomic_set(&page->_mapcount, 0);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ /* increment count (starts at -1) */
+ atomic_set(&page->_mapcount, 0);
if (exclusive)
SetPageAnonExclusive(page);
}
- /* increment count (starts at -1) */
- atomic_set(&folio->_large_mapcount, nr - 1);
- atomic_set(&folio->_nr_pages_mapped, nr);
+ folio_set_large_mapcount(folio, nr, vma);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ atomic_set(&folio->_nr_pages_mapped, nr);
} else {
+ nr = folio_large_nr_pages(folio);
/* increment count (starts at -1) */
atomic_set(&folio->_entire_mapcount, 0);
- /* increment count (starts at -1) */
- atomic_set(&folio->_large_mapcount, 0);
- atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
+ folio_set_large_mapcount(folio, 1, vma);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
if (exclusive)
SetPageAnonExclusive(&folio->page);
nr_pmdmapped = nr;
}
+ VM_WARN_ON_ONCE(address < vma->vm_start ||
+ address + (nr << PAGE_SHIFT) > vma->vm_end);
+
__folio_mod_stat(folio, nr, nr_pmdmapped);
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
}
@@ -1486,7 +1617,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio,
VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
+ nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
__folio_mod_stat(folio, nr, nr_pmdmapped);
/* See comments in folio_add_anon_rmap_*() */
@@ -1531,6 +1662,27 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
#endif
}
+/**
+ * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio
+ * @folio: The folio to add the mapping to
+ * @page: The first page to add
+ * @vma: The vm area in which the mapping is added
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
+ struct vm_area_struct *vma)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+#else
+ WARN_ON_ONCE(true);
+#endif
+}
+
static __always_inline void __folio_remove_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
enum rmap_level level)
@@ -1548,7 +1700,20 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
break;
}
- atomic_sub(nr_pages, &folio->_large_mapcount);
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ nr = folio_sub_return_large_mapcount(folio, nr_pages, vma);
+ if (!nr) {
+ /* Now completely unmapped. */
+ nr = folio_nr_pages(folio);
+ } else {
+ partially_mapped = nr < folio_large_nr_pages(folio) &&
+ !folio_entire_mapcount(folio);
+ nr = 0;
+ }
+ break;
+ }
+
+ folio_sub_large_mapcount(folio, nr_pages, vma);
do {
last += atomic_add_negative(-1, &page->_mapcount);
} while (page++, --nr_pages > 0);
@@ -1560,13 +1725,32 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
partially_mapped = nr && atomic_read(mapped);
break;
case RMAP_LEVEL_PMD:
- atomic_dec(&folio->_large_mapcount);
+ case RMAP_LEVEL_PUD:
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ last = atomic_add_negative(-1, &folio->_entire_mapcount);
+ if (level == RMAP_LEVEL_PMD && last)
+ nr_pmdmapped = folio_large_nr_pages(folio);
+ nr = folio_dec_return_large_mapcount(folio, vma);
+ if (!nr) {
+ /* Now completely unmapped. */
+ nr = folio_large_nr_pages(folio);
+ } else {
+ partially_mapped = last &&
+ nr < folio_large_nr_pages(folio);
+ nr = 0;
+ }
+ break;
+ }
+
+ folio_dec_large_mapcount(folio, vma);
last = atomic_add_negative(-1, &folio->_entire_mapcount);
if (last) {
nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
if (likely(nr < ENTIRELY_MAPPED)) {
- nr_pmdmapped = folio_nr_pages(folio);
- nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
+ nr_pages = folio_large_nr_pages(folio);
+ if (level == RMAP_LEVEL_PMD)
+ nr_pmdmapped = nr_pages;
+ nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of another remove and an add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1640,6 +1824,46 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
#endif
}
+/**
+ * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio
+ * @folio: The folio to remove the mapping from
+ * @page: The first page to remove
+ * @vma: The vm area from which the mapping is removed
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_remove_rmap_pud(struct folio *folio, struct page *page,
+ struct vm_area_struct *vma)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+#else
+ WARN_ON_ONCE(true);
+#endif
+}
+
+/* We support batch unmapping of PTEs for lazyfree large folios */
+static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
+ struct folio *folio, pte_t *ptep)
+{
+ const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+ int max_nr = folio_nr_pages(folio);
+ pte_t pte = ptep_get(ptep);
+
+ if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
+ return false;
+ if (pte_unused(pte))
+ return false;
+ if (pte_pfn(pte) != folio_pfn(folio))
+ return false;
+
+ return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
+ NULL, NULL) == max_nr;
+}
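The batch path above only triggers for lazyfree anonymous large folios: anonymous, not swap-backed, with all PTEs still pointing at consecutive pages of the folio. A minimal userspace illustration of how such folios come about (MADV_HUGEPAGE and MADV_FREE are standard madvise flags; whether the region is actually backed by a large folio depends on THP configuration):

#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2UL << 20;			/* one PMD-sized region on x86-64 */
	char *p = aligned_alloc(len, len);

	if (!p)
		return 1;
	madvise(p, len, MADV_HUGEPAGE);		/* ask for a large folio */
	memset(p, 0xaa, len);			/* fault it in */
	madvise(p, len, MADV_FREE);		/* clean anon memory becomes lazyfree */
	/*
	 * Under memory pressure, reclaim can now discard the folio; if it is
	 * PTE-mapped, try_to_unmap_one() unmaps all of its PTEs in one batch.
	 */
	return 0;
}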
+
/*
* @arg: enum ttu_flags will be passed to this argument
*/
@@ -1648,11 +1872,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ bool anon_exclusive, ret = true;
pte_t pteval;
struct page *subpage;
- bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
+ unsigned long nr_pages = 1, end_addr;
unsigned long pfn;
unsigned long hsz = 0;
@@ -1702,9 +1927,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
}
if (!pvmw.pte) {
- if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd,
- folio))
- goto walk_done;
+ if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
+ if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
+ goto walk_done;
+ /*
+ * unmap_huge_pmd_locked has either already marked
+ * the folio as swap-backed or decided to retain it
+ * due to GUP or speculative references.
+ */
+ goto walk_abort;
+ }
if (flags & TTU_SPLIT_HUGE_PMD) {
/*
@@ -1712,7 +1944,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* restart so we can process the PTE-mapped THP.
*/
split_huge_pmd_locked(vma, pvmw.address,
- pvmw.pmd, false, folio);
+ pvmw.pmd, false);
flags &= ~TTU_SPLIT_HUGE_PMD;
page_vma_mapped_walk_restart(&pvmw);
continue;
@@ -1722,7 +1954,18 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
- pfn = pte_pfn(ptep_get(pvmw.pte));
+ /*
+ * Handle PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages.
+ */
+ pteval = ptep_get(pvmw.pte);
+ if (likely(pte_present(pteval))) {
+ pfn = pte_pfn(pteval);
+ } else {
+ pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
+ }
+
subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
@@ -1778,24 +2021,33 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
hugetlb_vma_unlock_write(vma);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
- } else {
- flush_cache_page(vma, address, pfn);
- /* Nuke the page table entry. */
- if (should_defer_flush(mm, flags)) {
- /*
- * We clear the PTE but do not flush so potentially
- * a remote CPU could still be writing to the folio.
- * If the entry was previously clean then the
- * architecture must guarantee that a clear->dirty
- * transition on a cached TLB entry is written through
- * and traps if the PTE is unmapped.
- */
- pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ } else if (likely(pte_present(pteval))) {
+ if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
+ can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
+ nr_pages = folio_nr_pages(folio);
+ end_addr = address + nr_pages * PAGE_SIZE;
+ flush_cache_range(vma, address, end_addr);
- set_tlb_ubc_flush_pending(mm, pteval, address);
- } else {
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
- }
+ /* Nuke the page table entry. */
+ pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
+ /*
+ * We clear the PTE but do not flush so potentially
+ * a remote CPU could still be writing to the folio.
+ * If the entry was previously clean then the
+ * architecture must guarantee that a clear->dirty
+ * transition on a cached TLB entry is written through
+ * and traps if the PTE is unmapped.
+ */
+ if (should_defer_flush(mm, flags))
+ set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
+ else
+ flush_tlb_range(vma, address, end_addr);
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ } else {
+ pte_clear(mm, address, pvmw.pte);
}
/*
@@ -1805,10 +2057,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
- /* Set the dirty flag on the folio now the pte is gone. */
- if (pte_dirty(pteval))
- folio_mark_dirty(folio);
-
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
@@ -1822,8 +2070,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
-
- } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+ } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
+ !userfaultfd_armed(vma)) {
/*
* The guest indicated that the page content is of no
* interest anymore. Simply discard the pte, vmscan
@@ -1868,40 +2116,41 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
smp_rmb();
- /*
- * The only page refs must be one from isolation
- * plus the rmap(s) (dropped by discard:).
- */
- if (ref_count == 1 + map_count &&
- (!folio_test_dirty(folio) ||
- /*
- * Unlike MADV_FREE mappings, VM_DROPPABLE
- * ones can be dropped even if they've
- * been dirtied.
- */
- (vma->vm_flags & VM_DROPPABLE))) {
- dec_mm_counter(mm, MM_ANONPAGES);
- goto discard;
- }
-
- /*
- * If the folio was redirtied, it cannot be
- * discarded. Remap the page to page table.
- */
- set_pte_at(mm, address, pvmw.pte, pteval);
- /*
- * Unlike MADV_FREE mappings, VM_DROPPABLE ones
- * never get swap backed on failure to drop.
- */
- if (!(vma->vm_flags & VM_DROPPABLE))
+ if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
+ /*
+ * The folio was redirtied, either via the page table or a
+ * previously obtained GUP reference.
+ */
+ set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
folio_set_swapbacked(folio);
- goto walk_abort;
+ goto walk_abort;
+ } else if (ref_count != 1 + map_count) {
+ /*
+ * Additional reference. Could be a GUP reference or any
+ * speculative reference. GUP users must mark the folio
+ * dirty if there was a modification. This folio cannot be
+ * reclaimed right now either way, so act just like nothing
+ * happened.
+ * We'll come back here later and detect if the folio was
+ * dirtied when the additional reference is gone.
+ */
+ set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
+ goto walk_abort;
+ }
+ add_mm_counter(mm, MM_ANONPAGES, -nr_pages);
+ goto discard;
}
if (swap_duplicate(entry) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
goto walk_abort;
}
+
+ /*
+ * arch_unmap_one() is expected to be a NOP on
+ * architectures where we could have PFN swap PTEs,
+ * so we'll not check/care.
+ */
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
swap_free(entry);
set_pte_at(mm, address, pvmw.pte, pteval);
@@ -1926,10 +2175,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
swp_pte = swp_entry_to_pte(entry);
if (anon_exclusive)
swp_pte = pte_swp_mkexclusive(swp_pte);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ if (likely(pte_present(pteval))) {
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ if (pte_swp_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
set_pte_at(mm, address, pvmw.pte, swp_pte);
} else {
/*
@@ -1946,13 +2202,18 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter_file(folio));
}
discard:
- if (unlikely(folio_test_hugetlb(folio)))
+ if (unlikely(folio_test_hugetlb(folio))) {
hugetlb_remove_rmap(folio);
- else
- folio_remove_rmap_pte(folio, subpage, vma);
+ } else {
+ folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
+ folio_ref_sub(folio, nr_pages - 1);
+ }
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
folio_put(folio);
+ /* We have already batched the entire folio */
+ if (nr_pages > 1)
+ goto walk_done;
continue;
walk_abort:
ret = false;
@@ -2013,9 +2274,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ bool anon_exclusive, writable, ret = true;
pte_t pteval;
struct page *subpage;
- bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
unsigned long pfn;
@@ -2031,13 +2292,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
pvmw.flags = PVMW_SYNC;
/*
- * unmap_page() in mm/huge_memory.c is the only user of migration with
- * TTU_SPLIT_HUGE_PMD and it wants to freeze.
- */
- if (flags & TTU_SPLIT_HUGE_PMD)
- split_huge_pmd_address(vma, address, true, folio);
-
- /*
* For THP, we have to assume the worse case ie pmd for invalidation.
* For hugetlb, it could be much worse if we need to do pud
* invalidation in the case of pmd sharing.
@@ -2062,9 +2316,16 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* PMD-mapped THP migration entry */
if (!pvmw.pte) {
+ if (flags & TTU_SPLIT_HUGE_PMD) {
+ split_huge_pmd_locked(vma, pvmw.address,
+ pvmw.pmd, true);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
subpage = folio_page(folio,
pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
@@ -2076,30 +2337,25 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
continue;
- }
#endif
+ }
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
- pfn = pte_pfn(ptep_get(pvmw.pte));
-
- if (folio_is_zone_device(folio)) {
- /*
- * Our PTE is a non-present device exclusive entry and
- * calculating the subpage as for the common case would
- * result in an invalid pointer.
- *
- * Since only PAGE_SIZE pages can currently be
- * migrated, just set it to page. This will need to be
- * changed when hugepage migrations to device private
- * memory are supported.
- */
- VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
- subpage = &folio->page;
+ /*
+ * Handle PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages.
+ */
+ pteval = ptep_get(pvmw.pte);
+ if (likely(pte_present(pteval))) {
+ pfn = pte_pfn(pteval);
} else {
- subpage = folio_page(folio, pfn - folio_pfn(folio));
+ pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
}
+
+ subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
PageAnonExclusive(subpage);
@@ -2155,7 +2411,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
}
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
- } else {
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ writable = pte_write(pteval);
+ } else if (likely(pte_present(pteval))) {
flush_cache_page(vma, address, pfn);
/* Nuke the page table entry. */
if (should_defer_flush(mm, flags)) {
@@ -2169,58 +2428,27 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
*/
pteval = ptep_get_and_clear(mm, address, pvmw.pte);
- set_tlb_ubc_flush_pending(mm, pteval, address);
+ set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
} else {
pteval = ptep_clear_flush(vma, address, pvmw.pte);
}
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ writable = pte_write(pteval);
+ } else {
+ pte_clear(mm, address, pvmw.pte);
+ writable = is_writable_device_private_entry(pte_to_swp_entry(pteval));
}
- /* Set the dirty flag on the folio now the pte is gone. */
- if (pte_dirty(pteval))
- folio_mark_dirty(folio);
+ VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) &&
+ !anon_exclusive, folio);
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
- if (folio_is_device_private(folio)) {
- unsigned long pfn = folio_pfn(folio);
- swp_entry_t entry;
- pte_t swp_pte;
-
- if (anon_exclusive)
- WARN_ON_ONCE(folio_try_share_anon_rmap_pte(folio,
- subpage));
-
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- entry = pte_to_swp_entry(pteval);
- if (is_writable_device_private_entry(entry))
- entry = make_writable_migration_entry(pfn);
- else if (anon_exclusive)
- entry = make_readable_exclusive_migration_entry(pfn);
- else
- entry = make_readable_migration_entry(pfn);
- swp_pte = swp_entry_to_pte(entry);
+ if (PageHWPoison(subpage)) {
+ VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio);
- /*
- * pteval maps a zone device page and is therefore
- * a swap pte.
- */
- if (pte_swp_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_swp_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
- trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
- folio_order(folio));
- /*
- * No need to invalidate here it will synchronize on
- * against the special swap migration pte.
- */
- } else if (PageHWPoison(subpage)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (folio_test_hugetlb(folio)) {
hugetlb_count_sub(folio_nr_pages(folio), mm);
@@ -2230,8 +2458,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
-
- } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+ } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
+ !userfaultfd_armed(vma)) {
/*
* The guest indicated that the page content is of no
* interest anymore. Simply discard the pte, vmscan
@@ -2247,6 +2475,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
swp_entry_t entry;
pte_t swp_pte;
+ /*
+ * arch_unmap_one() is expected to be a NOP on
+ * architectures where we could have PFN swap PTEs,
+ * so we'll not check/care.
+ */
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
if (folio_test_hugetlb(folio))
set_huge_pte_at(mm, address, pvmw.pte,
@@ -2257,8 +2490,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
page_vma_mapped_walk_done(&pvmw);
break;
}
- VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
- !anon_exclusive, subpage);
/* See folio_try_share_anon_rmap_pte(): clear PTE first. */
if (folio_test_hugetlb(folio)) {
@@ -2283,7 +2514,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
- if (pte_write(pteval))
+ if (writable)
entry = make_writable_migration_entry(
page_to_pfn(subpage));
else if (anon_exclusive)
@@ -2292,15 +2523,23 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
else
entry = make_readable_migration_entry(
page_to_pfn(subpage));
- if (pte_young(pteval))
- entry = make_migration_entry_young(entry);
- if (pte_dirty(pteval))
- entry = make_migration_entry_dirty(entry);
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ if (likely(pte_present(pteval))) {
+ if (pte_young(pteval))
+ entry = make_migration_entry_young(entry);
+ if (pte_dirty(pteval))
+ entry = make_migration_entry_dirty(entry);
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
if (folio_test_hugetlb(folio))
set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
hsz);
@@ -2375,190 +2614,139 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
}
#ifdef CONFIG_DEVICE_PRIVATE
-struct make_exclusive_args {
- struct mm_struct *mm;
- unsigned long address;
- void *owner;
- bool valid;
-};
-
-static bool page_make_device_exclusive_one(struct folio *folio,
- struct vm_area_struct *vma, unsigned long address, void *priv)
+/**
+ * make_device_exclusive() - Mark a page for exclusive use by a device
+ * @mm: mm_struct of associated target process
+ * @addr: the virtual address to mark for exclusive device access
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
+ * @foliop: folio pointer will be stored here on success.
+ *
+ * This function looks up the page mapped at the given address, grabs a
+ * folio reference, locks the folio and replaces the PTE with special
+ * device-exclusive PFN swap entry, preventing access through the process
+ * page tables. The function will return with the folio locked and referenced.
+ *
+ * On fault, the device-exclusive entries are replaced with the original PTE
+ * under folio lock, after calling MMU notifiers.
+ *
+ * Only anonymous non-hugetlb folios are supported and the VMA must have
+ * write permissions such that we can fault in the anonymous page writable
+ * in order to mark it exclusive. The caller must hold the mmap_lock in read
+ * mode.
+ *
+ * A driver using this to program access from a device must use an MMU notifier
+ * critical section to hold a device-specific lock during programming. Once
+ * programming is complete, it should drop the folio lock and reference, after
+ * which point CPU access to the page will revoke the exclusive access.
+ *
+ * Notes:
+ * #. This function always operates on individual PTEs mapping individual
+ * pages. PMD-sized THPs are first remapped to be mapped by PTEs before
+ * the conversion happens on a single PTE corresponding to @addr.
+ * #. While concurrent access through the process page tables is prevented,
+ * concurrent access through other page references (e.g., earlier GUP
+ * invocation) is not handled and not supported.
+ * #. device-exclusive entries are considered "clean" and "old" by core-mm.
+ * Device drivers must update the folio state when informed by MMU
+ * notifiers.
+ *
+ * Returns: pointer to the mapped page on success, otherwise an ERR_PTR() value.
+ */
+struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
+ void *owner, struct folio **foliop)
{
- struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
- struct make_exclusive_args *args = priv;
- pte_t pteval;
- struct page *subpage;
- bool ret = true;
struct mmu_notifier_range range;
+ struct folio *folio, *fw_folio;
+ struct vm_area_struct *vma;
+ struct folio_walk fw;
+ struct page *page;
swp_entry_t entry;
pte_t swp_pte;
- pte_t ptent;
-
- mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
- vma->vm_mm, address, min(vma->vm_end,
- address + folio_size(folio)),
- args->owner);
- mmu_notifier_invalidate_range_start(&range);
-
- while (page_vma_mapped_walk(&pvmw)) {
- /* Unexpected PMD-mapped THP? */
- VM_BUG_ON_FOLIO(!pvmw.pte, folio);
-
- ptent = ptep_get(pvmw.pte);
- if (!pte_present(ptent)) {
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
+ int ret;
- subpage = folio_page(folio,
- pte_pfn(ptent) - folio_pfn(folio));
- address = pvmw.address;
-
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(ptent));
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
-
- /* Set the dirty flag on the folio now the pte is gone. */
- if (pte_dirty(pteval))
- folio_mark_dirty(folio);
-
- /*
- * Check that our target page is still mapped at the expected
- * address.
- */
- if (args->mm == mm && args->address == address &&
- pte_write(pteval))
- args->valid = true;
-
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- if (pte_write(pteval))
- entry = make_writable_device_exclusive_entry(
- page_to_pfn(subpage));
- else
- entry = make_readable_device_exclusive_entry(
- page_to_pfn(subpage));
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
-
- set_pte_at(mm, address, pvmw.pte, swp_pte);
+ mmap_assert_locked(mm);
+ addr = PAGE_ALIGN_DOWN(addr);
- /*
- * There is a reference on the page for the swap entry which has
- * been removed, so shouldn't take another.
- */
- folio_remove_rmap_pte(folio, subpage, vma);
+ /*
+ * Fault in the page writable and try to lock it; note that if the
+ * address would already be marked for exclusive use by a device,
+ * the GUP call would undo that first by triggering a fault.
+ *
+ * If any other device would already map this page exclusively, the
+ * fault will trigger a conversion to an ordinary
+ * (non-device-exclusive) PTE and issue an MMU_NOTIFY_EXCLUSIVE.
+ */
+retry:
+ page = get_user_page_vma_remote(mm, addr,
+ FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
+ &vma);
+ if (IS_ERR(page))
+ return page;
+ folio = page_folio(page);
+
+ if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
+ folio_put(folio);
+ return ERR_PTR(-EOPNOTSUPP);
}
- mmu_notifier_invalidate_range_end(&range);
-
- return ret;
-}
-
-/**
- * folio_make_device_exclusive - Mark the folio exclusively owned by a device.
- * @folio: The folio to replace page table entries for.
- * @mm: The mm_struct where the folio is expected to be mapped.
- * @address: Address where the folio is expected to be mapped.
- * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
- *
- * Tries to remove all the page table entries which are mapping this
- * folio and replace them with special device exclusive swap entries to
- * grant a device exclusive access to the folio.
- *
- * Context: Caller must hold the folio lock.
- * Return: false if the page is still mapped, or if it could not be unmapped
- * from the expected address. Otherwise returns true (success).
- */
-static bool folio_make_device_exclusive(struct folio *folio,
- struct mm_struct *mm, unsigned long address, void *owner)
-{
- struct make_exclusive_args args = {
- .mm = mm,
- .address = address,
- .owner = owner,
- .valid = false,
- };
- struct rmap_walk_control rwc = {
- .rmap_one = page_make_device_exclusive_one,
- .done = folio_not_mapped,
- .anon_lock = folio_lock_anon_vma_read,
- .arg = &args,
- };
+ ret = folio_lock_killable(folio);
+ if (ret) {
+ folio_put(folio);
+ return ERR_PTR(ret);
+ }
/*
- * Restrict to anonymous folios for now to avoid potential writeback
- * issues.
+ * Inform secondary MMUs that we are going to convert this PTE to
+ * device-exclusive, such that they unmap it now. Note that the
+ * caller must filter this event out to prevent livelocks.
*/
- if (!folio_test_anon(folio))
- return false;
-
- rmap_walk(folio, &rwc);
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
+ mm, addr, addr + PAGE_SIZE, owner);
+ mmu_notifier_invalidate_range_start(&range);
- return args.valid && !folio_mapcount(folio);
-}
+ /*
+ * Let's do a second walk and make sure we still find the same page
+ * mapped writable. Note that any page of an anonymous folio can
+ * only be mapped writable using exactly one PTE ("exclusive"), so
+ * there cannot be other mappings.
+ */
+ fw_folio = folio_walk_start(&fw, vma, addr, 0);
+ if (fw_folio != folio || fw.page != page ||
+ fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) {
+ if (fw_folio)
+ folio_walk_end(&fw, vma);
+ mmu_notifier_invalidate_range_end(&range);
+ folio_unlock(folio);
+ folio_put(folio);
+ goto retry;
+ }
-/**
- * make_device_exclusive_range() - Mark a range for exclusive use by a device
- * @mm: mm_struct of associated target process
- * @start: start of the region to mark for exclusive device access
- * @end: end address of region
- * @pages: returns the pages which were successfully marked for exclusive access
- * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
- *
- * Returns: number of pages found in the range by GUP. A page is marked for
- * exclusive access only if the page pointer is non-NULL.
- *
- * This function finds ptes mapping page(s) to the given address range, locks
- * them and replaces mappings with special swap entries preventing userspace CPU
- * access. On fault these entries are replaced with the original mapping after
- * calling MMU notifiers.
- *
- * A driver using this to program access from a device must use a mmu notifier
- * critical section to hold a device specific lock during programming. Once
- * programming is complete it should drop the page lock and reference after
- * which point CPU access to the page will revoke the exclusive access.
- */
-int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, struct page **pages,
- void *owner)
-{
- long npages = (end - start) >> PAGE_SHIFT;
- long i;
-
- npages = get_user_pages_remote(mm, start, npages,
- FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
- pages, NULL);
- if (npages < 0)
- return npages;
-
- for (i = 0; i < npages; i++, start += PAGE_SIZE) {
- struct folio *folio = page_folio(pages[i]);
- if (PageTail(pages[i]) || !folio_trylock(folio)) {
- folio_put(folio);
- pages[i] = NULL;
- continue;
- }
+ /* Nuke the page table entry so we get the up-to-date dirty bit. */
+ flush_cache_page(vma, addr, page_to_pfn(page));
+ fw.pte = ptep_clear_flush(vma, addr, fw.ptep);
- if (!folio_make_device_exclusive(folio, mm, start, owner)) {
- folio_unlock(folio);
- folio_put(folio);
- pages[i] = NULL;
- }
- }
+ /* Set the dirty flag on the folio now the PTE is gone. */
+ if (pte_dirty(fw.pte))
+ folio_mark_dirty(folio);
- return npages;
+ /*
+ * Store the pfn of the page in a special device-exclusive PFN swap PTE.
+ * do_swap_page() will trigger the conversion back while holding the
+ * folio lock.
+ */
+ entry = make_device_exclusive_entry(page_to_pfn(page));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(fw.pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ /* The pte is writable, uffd-wp does not apply. */
+ set_pte_at(mm, addr, fw.ptep, swp_pte);
+
+ folio_walk_end(&fw, vma);
+ mmu_notifier_invalidate_range_end(&range);
+ *foliop = folio;
+ return page;
}
-EXPORT_SYMBOL_GPL(make_device_exclusive_range);
+EXPORT_SYMBOL_GPL(make_device_exclusive);
#endif
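To make the calling convention in the new kernel-doc concrete, here is a hedged driver-side sketch; the function name, the device-programming step and the notifier filtering on @owner are assumptions, only the core-mm calls and the locking rules come from this patch.

/* Illustrative only: not a real driver. */
static int example_grant_device_exclusive(struct mm_struct *mm,
					  unsigned long addr, void *owner)
{
	struct folio *folio;
	struct page *page;

	mmap_read_lock(mm);			/* doc: mmap_lock held in read mode */
	page = make_device_exclusive(mm, addr, owner, &folio);
	if (IS_ERR(page)) {
		mmap_read_unlock(mm);
		return PTR_ERR(page);
	}

	/*
	 * Program the device mapping here, inside an MMU-notifier-protected
	 * critical section that filters out its own MMU_NOTIFY_EXCLUSIVE
	 * events via @owner (driver-specific, omitted).
	 */

	folio_unlock(folio);	/* from here, a CPU fault revokes exclusivity */
	folio_put(folio);
	mmap_read_unlock(mm);
	return 0;
}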
void __put_anon_vma(struct anon_vma *anon_vma)
@@ -2653,35 +2841,37 @@ static void rmap_walk_anon(struct folio *folio,
anon_vma_unlock_read(anon_vma);
}
-/*
- * rmap_walk_file - do something to file page using the object-based rmap method
- * @folio: the folio to be handled
- * @rwc: control variable according to each walk type
- * @locked: caller holds relevant rmap lock
+/**
+ * __rmap_walk_file() - Traverse the reverse mapping for a file-backed mapping
+ * of a page mapped within a specified page cache object at a specified offset.
*
- * Find all the mappings of a folio using the mapping pointer and the vma chains
- * contained in the address_space struct it points to.
+ * @folio: Either the folio whose mappings to traverse, or if NULL,
+ * the callbacks specified in @rwc must be written so that they
+ * can look up mappings correctly on their own.
+ * @mapping: The page cache object whose mapping VMAs we intend to
+ * traverse. If @folio is non-NULL, this should be equal to
+ * folio_mapping(folio).
+ * @pgoff_start: The offset within @mapping of the page which we are
+ * looking up. If @folio is non-NULL, this should be equal
+ * to folio_pgoff(folio).
+ * @nr_pages: The number of pages mapped by the mapping. If @folio is
+ * non-NULL, this should be equal to folio_nr_pages(folio).
+ * @rwc: The reverse mapping walk control object describing how
+ * the traversal should proceed.
+ * @locked: Is the @mapping already locked? If not, we acquire the
+ * lock.
*/
-static void rmap_walk_file(struct folio *folio,
- struct rmap_walk_control *rwc, bool locked)
+static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
+ pgoff_t pgoff_start, unsigned long nr_pages,
+ struct rmap_walk_control *rwc, bool locked)
{
- struct address_space *mapping = folio_mapping(folio);
- pgoff_t pgoff_start, pgoff_end;
+ pgoff_t pgoff_end = pgoff_start + nr_pages - 1;
struct vm_area_struct *vma;
- /*
- * The page lock not only makes sure that page->mapping cannot
- * suddenly be NULLified by truncation, it makes sure that the
- * structure at mapping cannot be freed and reused yet,
- * so we can safely take mapping->i_mmap_rwsem.
- */
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio);
+ VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio);
+ VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio);
- if (!mapping)
- return;
-
- pgoff_start = folio_pgoff(folio);
- pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
if (!locked) {
if (i_mmap_trylock_read(mapping))
goto lookup;
@@ -2696,8 +2886,7 @@ static void rmap_walk_file(struct folio *folio,
lookup:
vma_interval_tree_foreach(vma, &mapping->i_mmap,
pgoff_start, pgoff_end) {
- unsigned long address = vma_address(vma, pgoff_start,
- folio_nr_pages(folio));
+ unsigned long address = vma_address(vma, pgoff_start, nr_pages);
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
@@ -2710,12 +2899,38 @@ lookup:
if (rwc->done && rwc->done(folio))
goto done;
}
-
done:
if (!locked)
i_mmap_unlock_read(mapping);
}
+/*
+ * rmap_walk_file - do something to file page using the object-based rmap method
+ * @folio: the folio to be handled
+ * @rwc: control variable according to each walk type
+ * @locked: caller holds relevant rmap lock
+ *
+ * Find all the mappings of a folio using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ */
+static void rmap_walk_file(struct folio *folio,
+ struct rmap_walk_control *rwc, bool locked)
+{
+ /*
+ * The folio lock not only makes sure that folio->mapping cannot
+ * suddenly be NULLified by truncation, it makes sure that the structure
+ * at mapping cannot be freed and reused yet, so we can safely take
+ * mapping->i_mmap_rwsem.
+ */
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (!folio->mapping)
+ return;
+
+ __rmap_walk_file(folio, folio->mapping, folio->index,
+ folio_nr_pages(folio), rwc, locked);
+}
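The value of the split is that __rmap_walk_file() can be driven from just (mapping, pgoff, nr_pages) with no folio at hand. A hedged sketch of such an in-file caller (the callback and its purpose are hypothetical; the rmap_walk_control fields mirror how they are used elsewhere in this file):

/* Illustrative only: visit every VMA mapping a single page-cache offset. */
static bool example_one(struct folio *folio, struct vm_area_struct *vma,
			unsigned long address, void *arg)
{
	/* @folio is NULL here; look up whatever is needed from vma/address. */
	return true;	/* keep walking */
}

static void example_walk_pgoff(struct address_space *mapping, pgoff_t pgoff)
{
	struct rmap_walk_control rwc = {
		.rmap_one = example_one,
	};

	__rmap_walk_file(/* folio = */ NULL, mapping, pgoff, /* nr_pages = */ 1,
			 &rwc, /* locked = */ false);
}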
+
void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
{
if (unlikely(folio_test_ksm(folio)))
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
index 6d783436951f..e7173fcd210c 100644
--- a/mm/rodata_test.c
+++ b/mm/rodata_test.c
@@ -12,7 +12,8 @@
#include <linux/mm.h>
#include <asm/sections.h>
-static const int rodata_test_data = 0xC3;
+#define TEST_VALUE 0xC3
+static const int rodata_test_data = TEST_VALUE;
void rodata_test(void)
{
@@ -20,7 +21,7 @@ void rodata_test(void)
/* test 1: read the value */
/* If this test fails, some previous testrun has clobbered the state */
- if (!rodata_test_data) {
+ if (unlikely(READ_ONCE(rodata_test_data) != TEST_VALUE)) {
pr_err("test 1 fails (start data)\n");
return;
}
@@ -33,7 +34,7 @@ void rodata_test(void)
}
/* test 3: check the value hasn't changed */
- if (rodata_test_data == zero) {
+ if (unlikely(READ_ONCE(rodata_test_data) != TEST_VALUE)) {
pr_err("test data was changed\n");
return;
}
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 399552814fd0..9a11a38a6770 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -120,18 +120,18 @@ static int secretmem_release(struct inode *inode, struct file *file)
return 0;
}
-static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
+static int secretmem_mmap_prepare(struct vm_area_desc *desc)
{
- unsigned long len = vma->vm_end - vma->vm_start;
+ const unsigned long len = desc->end - desc->start;
- if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
+ if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
return -EINVAL;
- if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
+ if (!mlock_future_ok(desc->mm, desc->vm_flags | VM_LOCKED, len))
return -EAGAIN;
- vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP);
- vma->vm_ops = &secretmem_vm_ops;
+ desc->vm_flags |= VM_LOCKED | VM_DONTDUMP;
+ desc->vm_ops = &secretmem_vm_ops;
return 0;
}
@@ -143,7 +143,7 @@ bool vma_is_secretmem(struct vm_area_struct *vma)
static const struct file_operations secretmem_fops = {
.release = secretmem_release,
- .mmap = secretmem_mmap,
+ .mmap_prepare = secretmem_mmap_prepare,
};
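The secretmem change is one instance of the wider .mmap to .mmap_prepare conversion: validation and flag/ops setup move from a live VMA to the vm_area_desc describing the mapping before the VMA exists. A hedged sketch of the same pattern for a hypothetical driver, using only the desc fields visible in this hunk (EXAMPLE_MAX_LEN and example_vm_ops are invented):

/* Illustrative only: converting a trivial driver mmap hook. */
static int example_mmap_prepare(struct vm_area_desc *desc)
{
	const unsigned long len = desc->end - desc->start;

	if (len > EXAMPLE_MAX_LEN)
		return -EINVAL;

	desc->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
	desc->vm_ops = &example_vm_ops;
	return 0;
}

static const struct file_operations example_fops = {
	.mmap_prepare = example_mmap_prepare,
};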
static int secretmem_migrate_folio(struct address_space *mapping,
@@ -195,19 +195,11 @@ static struct file *secretmem_file_create(unsigned long flags)
struct file *file;
struct inode *inode;
const char *anon_name = "[secretmem]";
- const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name));
- int err;
- inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
+ inode = anon_inode_make_secure_inode(secretmem_mnt->mnt_sb, anon_name, NULL);
if (IS_ERR(inode))
return ERR_CAST(inode);
- err = security_inode_init_security_anon(inode, &qname, NULL);
- if (err) {
- file = ERR_PTR(err);
- goto err_free_inode;
- }
-
file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
O_RDWR, &secretmem_fops);
if (IS_ERR(file))
diff --git a/mm/shmem.c b/mm/shmem.c
index ccb9629a0f70..3a5a65b1f41a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -86,7 +86,6 @@ static struct vfsmount *shm_mnt __ro_after_init;
#include "internal.h"
-#define BLOCKS_PER_PAGE (PAGE_SIZE/512)
#define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT)
/* Pretend that each entry is of this size in directory's i_size */
@@ -99,7 +98,7 @@ static struct vfsmount *shm_mnt __ro_after_init;
#define SHORT_SYMLINK_LEN 128
/*
- * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * shmem_fallocate communicates with shmem_fault or shmem_writeout via
* inode->i_private (with i_rwsem making sure that it has only one user at
* a time): we would prefer not to enlarge the shmem inode just for that.
*/
@@ -108,7 +107,7 @@ struct shmem_falloc {
pgoff_t start; /* start of range currently being fallocated */
pgoff_t next; /* the next page offset to be fallocated */
pgoff_t nr_falloced; /* how many new pages have been fallocated */
- pgoff_t nr_unswapped; /* how often writepage refused to swap out */
+ pgoff_t nr_unswapped; /* how often writeout refused to swap out */
};
struct shmem_options {
@@ -447,7 +446,7 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
/*
* Special case: whereas normally shmem_recalc_inode() is called
* after i_mapping->nrpages has already been adjusted (up or down),
- * shmem_writepage() has to raise swapped before nrpages is lowered -
+ * shmem_writeout() has to raise swapped before nrpages is lowered -
* to stop a racing shmem_recalc_inode() from thinking that a page has
* been freed. Compensate here, to avoid the need for a followup call.
*/
@@ -526,9 +525,9 @@ static bool shmem_confirm_swap(struct address_space *mapping,
* enables huge pages for the mount;
* SHMEM_HUGE_WITHIN_SIZE:
* only allocate huge pages if the page will be fully within i_size,
- * also respect fadvise()/madvise() hints;
+ * also respect madvise() hints;
* SHMEM_HUGE_ADVISE:
- * only allocate huge pages if requested with fadvise()/madvise();
+ * only allocate huge pages if requested with madvise();
*/
#define SHMEM_HUGE_NEVER 0
@@ -553,38 +552,119 @@ static bool shmem_confirm_swap(struct address_space *mapping,
/* ifdef here to avoid bloating shmem.o when not necessary */
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
+static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER;
-static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
- loff_t write_end, bool shmem_huge_force,
- unsigned long vm_flags)
+/**
+ * shmem_mapping_size_orders - Get allowable folio orders for the given file size.
+ * @mapping: Target address_space.
+ * @index: The page index.
+ * @write_end: end of a write, could extend inode size.
+ *
+ * This returns huge orders for folios (when supported) based on the file size
+ * which the mapping currently allows at the given index. The index is relevant
+ * due to alignment considerations the mapping might have. The highest
+ * returned order may cover less than the size passed in.
+ *
+ * Return: A bitmask of the allowable folio orders.
+ */
+static inline unsigned int
+shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end)
+{
+ unsigned int order;
+ size_t size;
+
+ if (!mapping_large_folio_support(mapping) || !write_end)
+ return 0;
+
+ /* Calculate the write size based on the write_end */
+ size = write_end - (index << PAGE_SHIFT);
+ order = filemap_get_order(size);
+ if (!order)
+ return 0;
+
+ /* If we're not aligned, allocate a smaller folio */
+ if (index & ((1UL << order) - 1))
+ order = __ffs(index);
+
+ order = min_t(size_t, order, MAX_PAGECACHE_ORDER);
+ return order > 0 ? BIT(order + 1) - 1 : 0;
+}
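As a worked example of the mask this helper returns, here is a small userspace sketch of the same arithmetic; it assumes 4 KiB pages, a floor-of-log2 behavior for the order lookup (filemap_get_order() itself is not shown in this hunk) and ignores the MAX_PAGECACHE_ORDER clamp. A 256 KiB write at an aligned index yields order 6, so the mask BIT(6 + 1) - 1 = 0x7f allows orders 0 through 6; a misaligned index shrinks the order to the index alignment.

#include <stdio.h>

/* Hypothetical mirror of the mask computation, 4 KiB pages assumed. */
static unsigned int size_orders(unsigned long index, unsigned long size)
{
	unsigned int order = 0;

	while ((4096UL << (order + 1)) <= size)		/* highest order that fits */
		order++;
	if (!order)
		return 0;
	if (index & ((1UL << order) - 1))		/* misaligned: follow alignment */
		order = __builtin_ctzl(index);
	return (1U << (order + 1)) - 1;			/* orders 0..order as a mask */
}

int main(void)
{
	printf("%#x\n", size_orders(0, 256 * 1024));	/* 0x7f: orders 0..6 */
	printf("%#x\n", size_orders(8, 256 * 1024));	/* 0xf:  orders 0..3 */
	return 0;
}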
+
+static unsigned int shmem_get_orders_within_size(struct inode *inode,
+ unsigned long within_size_orders, pgoff_t index,
+ loff_t write_end)
{
+ pgoff_t aligned_index;
+ unsigned long order;
loff_t i_size;
- if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
- return false;
+ order = highest_order(within_size_orders);
+ while (within_size_orders) {
+ aligned_index = round_up(index + 1, 1 << order);
+ i_size = max(write_end, i_size_read(inode));
+ i_size = round_up(i_size, PAGE_SIZE);
+ if (i_size >> PAGE_SHIFT >= aligned_index)
+ return within_size_orders;
+
+ order = next_order(&within_size_orders, order);
+ }
+
+ return 0;
+}
+
+static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
+ loff_t write_end, bool shmem_huge_force,
+ struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ?
+ 0 : BIT(HPAGE_PMD_ORDER);
+ unsigned long within_size_orders;
+
if (!S_ISREG(inode->i_mode))
- return false;
+ return 0;
if (shmem_huge == SHMEM_HUGE_DENY)
- return false;
+ return 0;
if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
- return true;
+ return maybe_pmd_order;
+ /*
+ * The huge order allocation for anon shmem is controlled through
+ * the mTHP interface, so we still use PMD-sized huge order to
+ * check whether global control is enabled.
+ *
+ * For tmpfs mmap()'s huge order, we still use PMD-sized order to
+ * allocate huge pages due to lack of a write size hint.
+ *
+ * Otherwise, tmpfs derives a highest-order hint from the write and
+ * fallocate sizes and then tries each allowable huge order.
+ */
switch (SHMEM_SB(inode->i_sb)->huge) {
case SHMEM_HUGE_ALWAYS:
- return true;
+ if (vma)
+ return maybe_pmd_order;
+
+ return shmem_mapping_size_orders(inode->i_mapping, index, write_end);
case SHMEM_HUGE_WITHIN_SIZE:
- index = round_up(index + 1, HPAGE_PMD_NR);
- i_size = max(write_end, i_size_read(inode));
- i_size = round_up(i_size, PAGE_SIZE);
- if (i_size >> PAGE_SHIFT >= index)
- return true;
+ if (vma)
+ within_size_orders = maybe_pmd_order;
+ else
+ within_size_orders = shmem_mapping_size_orders(inode->i_mapping,
+ index, write_end);
+
+ within_size_orders = shmem_get_orders_within_size(inode, within_size_orders,
+ index, write_end);
+ if (within_size_orders > 0)
+ return within_size_orders;
+
fallthrough;
case SHMEM_HUGE_ADVISE:
if (vm_flags & VM_HUGEPAGE)
- return true;
+ return maybe_pmd_order;
fallthrough;
default:
- return false;
+ return 0;
}
}
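In practice the write-size hint means a single large tmpfs write can be backed by folios bigger than a page without any mmap or madvise involvement. A minimal userspace illustration (whether large folios are actually used depends on the mount's huge= option and on CONFIG_TRANSPARENT_HUGEPAGE; the /dev/shm path below is only an example):

#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	const size_t len = 1 << 20;		/* 1 MiB written in one syscall */
	char *buf = calloc(1, len);
	int fd = open("/dev/shm/large-folio-demo", O_CREAT | O_RDWR | O_TRUNC, 0600);

	if (!buf || fd < 0)
		return 1;
	if (write(fd, buf, len) != (ssize_t)len)	/* write_end hint covers 1 MiB */
		return 1;
	close(fd);
	free(buf);
	return 0;
}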
@@ -779,14 +859,23 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
return 0;
}
-static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
- loff_t write_end, bool shmem_huge_force,
- unsigned long vm_flags)
+static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
+ loff_t write_end, bool shmem_huge_force,
+ struct vm_area_struct *vma,
+ unsigned long vm_flags)
{
- return false;
+ return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static void shmem_update_stats(struct folio *folio, int nr_pages)
+{
+ if (folio_test_pmd_mappable(folio))
+ __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages);
+ __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
+ __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages);
+}
+
/*
* Somewhat like filemap_add_folio, but error if expected item has gone.
*/
@@ -821,10 +910,7 @@ static int shmem_add_to_page_cache(struct folio *folio,
xas_store(&xas, folio);
if (xas_error(&xas))
goto unlock;
- if (folio_test_pmd_mappable(folio))
- __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
- __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
- __lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
+ shmem_update_stats(folio, nr);
mapping->nrpages += nr;
unlock:
xas_unlock_irq(&xas);
@@ -852,8 +938,7 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
error = shmem_replace_entry(mapping, folio->index, folio, radswap);
folio->mapping = NULL;
mapping->nrpages -= nr;
- __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
- __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
+ shmem_update_stats(folio, -nr);
xa_unlock_irq(&mapping->i_pages);
folio_put_refs(folio, nr);
BUG_ON(error);
@@ -1176,7 +1261,7 @@ static int shmem_getattr(struct mnt_idmap *idmap,
STATX_ATTR_NODUMP);
generic_fillattr(idmap, request_mask, inode, stat);
- if (shmem_huge_global_enabled(inode, 0, 0, false, 0))
+ if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;
if (request_mask & STATX_BTIME) {
@@ -1308,9 +1393,9 @@ static void shmem_evict_inode(struct inode *inode)
#endif
}
-static int shmem_find_swap_entries(struct address_space *mapping,
- pgoff_t start, struct folio_batch *fbatch,
- pgoff_t *indices, unsigned int type)
+static unsigned int shmem_find_swap_entries(struct address_space *mapping,
+ pgoff_t start, struct folio_batch *fbatch,
+ pgoff_t *indices, unsigned int type)
{
XA_STATE(xas, &mapping->i_pages, start);
struct folio *folio;
@@ -1343,7 +1428,7 @@ static int shmem_find_swap_entries(struct address_space *mapping,
}
rcu_read_unlock();
- return xas.xa_index;
+ return folio_batch_count(fbatch);
}
/*
@@ -1361,8 +1446,6 @@ static int shmem_unuse_swap_entries(struct inode *inode,
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
- if (!xa_is_value(folio))
- continue;
error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
mapping_gfp_mask(mapping), NULL, NULL);
if (error == 0) {
@@ -1390,8 +1473,8 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type)
do {
folio_batch_init(&fbatch);
- shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
- if (folio_batch_count(&fbatch) == 0) {
+ if (!shmem_find_swap_entries(mapping, start, &fbatch,
+ indices, type)) {
ret = 0;
break;
}
@@ -1420,6 +1503,7 @@ int shmem_unuse(unsigned int type)
return 0;
mutex_lock(&shmem_swaplist_mutex);
+start_over:
list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
if (!info->swapped) {
list_del_init(&info->swaplist);
@@ -1438,45 +1522,42 @@ int shmem_unuse(unsigned int type)
cond_resched();
mutex_lock(&shmem_swaplist_mutex);
- next = list_next_entry(info, swaplist);
- if (!info->swapped)
- list_del_init(&info->swaplist);
if (atomic_dec_and_test(&info->stop_eviction))
wake_up_var(&info->stop_eviction);
if (error)
break;
+ if (list_empty(&info->swaplist))
+ goto start_over;
+ next = list_next_entry(info, swaplist);
+ if (!info->swapped)
+ list_del_init(&info->swaplist);
}
mutex_unlock(&shmem_swaplist_mutex);
return error;
}
-/*
- * Move the page from the page cache to the swap cache.
+/**
+ * shmem_writeout - Write the folio to swap
+ * @folio: The folio to write
+ * @wbc: How writeback is to be done
+ *
+ * Move the folio from the page cache to the swap cache.
*/
-static int shmem_writepage(struct page *page, struct writeback_control *wbc)
+int shmem_writeout(struct folio *folio, struct writeback_control *wbc)
{
- struct folio *folio = page_folio(page);
struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- swp_entry_t swap;
pgoff_t index;
int nr_pages;
bool split = false;
- /*
- * Our capabilities prevent regular writeback or sync from ever calling
- * shmem_writepage; but a stacking filesystem might use ->writepage of
- * its underlying filesystem, in which case tmpfs should write out to
- * swap only in response to memory pressure, and not for the writeback
- * threads or sync.
- */
if (WARN_ON_ONCE(!wbc->for_reclaim))
goto redirty;
- if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
+ if ((info->flags & VM_LOCKED) || sbinfo->noswap)
goto redirty;
if (!total_swap_pages)
@@ -1502,9 +1583,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
try_split:
/* Ensure the subpages are still dirty */
folio_test_set_dirty(folio);
- if (split_huge_page_to_list_to_order(page, wbc->list, 0))
+ if (split_folio_to_list(folio, wbc->list))
goto redirty;
- folio = page_folio(page);
folio_clear_dirty(folio);
}
@@ -1531,7 +1611,7 @@ try_split:
!shmem_falloc->waitq &&
index >= shmem_falloc->start &&
index < shmem_falloc->next)
- shmem_falloc->nr_unswapped++;
+ shmem_falloc->nr_unswapped += nr_pages;
else
shmem_falloc = NULL;
spin_unlock(&inode->i_lock);
@@ -1543,14 +1623,6 @@ try_split:
folio_mark_uptodate(folio);
}
- swap = folio_alloc_swap(folio);
- if (!swap.val) {
- if (nr_pages > 1)
- goto try_split;
-
- goto redirty;
- }
-
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
* if it's not already there. Do it now before the folio is
@@ -1563,20 +1635,20 @@ try_split:
if (list_empty(&info->swaplist))
list_add(&info->swaplist, &shmem_swaplist);
- if (add_to_swap_cache(folio, swap,
- __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
- NULL) == 0) {
+ if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
shmem_recalc_inode(inode, 0, nr_pages);
- swap_shmem_alloc(swap, nr_pages);
- shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
+ swap_shmem_alloc(folio->swap, nr_pages);
+ shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));
mutex_unlock(&shmem_swaplist_mutex);
BUG_ON(folio_mapped(folio));
- return swap_writepage(&folio->page, wbc);
+ return swap_writeout(folio, wbc);
}
-
+ if (!info->swapped)
+ list_del_init(&info->swaplist);
mutex_unlock(&shmem_swaplist_mutex);
- put_swap_folio(folio, swap);
+ if (nr_pages > 1)
+ goto try_split;
redirty:
folio_mark_dirty(folio);
if (wbc->for_reclaim)
@@ -1584,6 +1656,7 @@ redirty:
folio_unlock(folio);
return 0;
}
+EXPORT_SYMBOL_GPL(shmem_writeout);
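Because shmem_writeout() replaces the old writepage entry point, reclaim-side callers are expected to dispatch to it directly for shmem mappings rather than going through address_space_operations. A hedged sketch of such a call site (the surrounding pageout-style logic is an assumption, not part of this hunk; wbc->for_reclaim must be set by the caller):

/* Illustrative only: how a reclaim path might push a shmem folio to swap. */
static int example_pageout(struct folio *folio, struct writeback_control *wbc)
{
	struct address_space *mapping = folio_mapping(folio);

	if (shmem_mapping(mapping))
		return shmem_writeout(folio, wbc);	/* page cache -> swap cache */

	/* Other mappings keep using their own writeback entry points. */
	return -EOPNOTSUPP;
}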
#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
@@ -1685,22 +1758,16 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
unsigned long mask = READ_ONCE(huge_shmem_orders_always);
unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
unsigned long vm_flags = vma ? vma->vm_flags : 0;
- bool global_huge;
- loff_t i_size;
- int order;
+ unsigned int global_orders;
if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
return 0;
- global_huge = shmem_huge_global_enabled(inode, index, write_end,
- shmem_huge_force, vm_flags);
- if (!vma || !vma_is_anon_shmem(vma)) {
- /*
- * For tmpfs, we now only support PMD sized THP if huge page
- * is enabled, otherwise fallback to order 0.
- */
- return global_huge ? BIT(HPAGE_PMD_ORDER) : 0;
- }
+ global_orders = shmem_huge_global_enabled(inode, index, write_end,
+ shmem_huge_force, vma, vm_flags);
+ /* Tmpfs huge pages allocation */
+ if (!vma || !vma_is_anon_shmem(vma))
+ return global_orders;
/*
* Following the 'deny' semantics of the top level, force the huge
@@ -1717,22 +1784,12 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
return READ_ONCE(huge_shmem_orders_inherit);
/* Allow mTHP that will be fully within i_size. */
- order = highest_order(within_size_orders);
- while (within_size_orders) {
- index = round_up(index + 1, order);
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
- if (i_size >> PAGE_SHIFT >= index) {
- mask |= within_size_orders;
- break;
- }
-
- order = next_order(&within_size_orders, order);
- }
+ mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0);
if (vm_flags & VM_HUGEPAGE)
mask |= READ_ONCE(huge_shmem_orders_madvise);
- if (global_huge)
+ if (global_orders > 0)
mask |= READ_ONCE(huge_shmem_orders_inherit);
return THP_ORDERS_ALL_FILE_DEFAULT & mask;
@@ -1898,6 +1955,65 @@ unlock:
return ERR_PTR(error);
}
+static struct folio *shmem_swap_alloc_folio(struct inode *inode,
+ struct vm_area_struct *vma, pgoff_t index,
+ swp_entry_t entry, int order, gfp_t gfp)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct folio *new;
+ void *shadow;
+ int nr_pages;
+
+ /*
+ * We have arrived here because our zones are constrained, so don't
+ * limit chance of success with further cpuset and node constraints.
+ */
+ gfp &= ~GFP_CONSTRAINT_MASK;
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) {
+ gfp_t huge_gfp = vma_thp_gfp_mask(vma);
+
+ gfp = limit_gfp_mask(huge_gfp, gfp);
+ }
+
+ new = shmem_alloc_folio(gfp, order, info, index);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ nr_pages = folio_nr_pages(new);
+ if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
+ gfp, entry)) {
+ folio_put(new);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /*
+ * Prevent parallel swapin from proceeding with the swap cache flag.
+ *
+ * There is one more concurrent scenario: the swap cache flag of a
+ * large folio may already have been set by swapcache_prepare() while
+ * another thread has split the large swap entry stored in the shmem
+ * mapping. In that case, shmem_add_to_page_cache() identifies the
+ * concurrent swapin and returns -EEXIST.
+ */
+ if (swapcache_prepare(entry, nr_pages)) {
+ folio_put(new);
+ return ERR_PTR(-EEXIST);
+ }
+
+ __folio_set_locked(new);
+ __folio_set_swapbacked(new);
+ new->swap = entry;
+
+ memcg1_swapin(entry, nr_pages);
+ shadow = get_shadow_from_swap_cache(entry);
+ if (shadow)
+ workingset_refault(new, shadow);
+ folio_add_lru(new);
+ swap_read_folio(new, NULL);
+ return new;
+}
+
/*
* When a page is moved from swapcache to shmem filecache (either by the
* usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
@@ -1969,10 +2085,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
}
if (!error) {
mem_cgroup_replace_folio(old, new);
- __lruvec_stat_mod_folio(new, NR_FILE_PAGES, nr_pages);
- __lruvec_stat_mod_folio(new, NR_SHMEM, nr_pages);
- __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -nr_pages);
- __lruvec_stat_mod_folio(old, NR_SHMEM, -nr_pages);
+ shmem_update_stats(new, nr_pages);
+ shmem_update_stats(old, -nr_pages);
}
xa_unlock_irq(&swap_mapping->i_pages);
@@ -2003,7 +2117,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
}
static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
- struct folio *folio, swp_entry_t swap)
+ struct folio *folio, swp_entry_t swap,
+ bool skip_swapcache)
{
struct address_space *mapping = inode->i_mapping;
swp_entry_t swapin_error;
@@ -2019,7 +2134,8 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
nr_pages = folio_nr_pages(folio);
folio_wait_writeback(folio);
- delete_from_swap_cache(folio);
+ if (!skip_swapcache)
+ delete_from_swap_cache(folio);
/*
* Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
* won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
@@ -2034,15 +2150,16 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
{
struct address_space *mapping = inode->i_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
- void *alloced_shadow = NULL;
- int alloced_order = 0, i;
+ int split_order = 0, entry_order;
+ int i;
/* Convert user data gfp flags to xarray node gfp flags */
gfp &= GFP_RECLAIM_MASK;
for (;;) {
- int order = -1, split_order = 0;
void *old = NULL;
+ int cur_order;
+ pgoff_t swap_index;
xas_lock_irq(&xas);
old = xas_load(&xas);
@@ -2051,60 +2168,56 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
goto unlock;
}
- order = xas_get_order(&xas);
+ entry_order = xas_get_order(&xas);
- /* Swap entry may have changed before we re-acquire the lock */
- if (alloced_order &&
- (old != alloced_shadow || order != alloced_order)) {
- xas_destroy(&xas);
- alloced_order = 0;
- }
+ if (!entry_order)
+ goto unlock;
/* Try to split large swap entry in pagecache */
- if (order > 0) {
- if (!alloced_order) {
- split_order = order;
+ cur_order = entry_order;
+ swap_index = round_down(index, 1 << entry_order);
+
+ split_order = xas_try_split_min_order(cur_order);
+
+ while (cur_order > 0) {
+ pgoff_t aligned_index =
+ round_down(index, 1 << cur_order);
+ pgoff_t swap_offset = aligned_index - swap_index;
+
+ xas_set_order(&xas, index, split_order);
+ xas_try_split(&xas, old, cur_order);
+ if (xas_error(&xas))
goto unlock;
- }
- xas_split(&xas, old, order);
/*
* Re-set the swap entry after splitting, and the swap
* offset of the original large entry must be continuous.
*/
- for (i = 0; i < 1 << order; i++) {
- pgoff_t aligned_index = round_down(index, 1 << order);
+ for (i = 0; i < 1 << cur_order;
+ i += (1 << split_order)) {
swp_entry_t tmp;
- tmp = swp_entry(swp_type(swap), swp_offset(swap) + i);
+ tmp = swp_entry(swp_type(swap),
+ swp_offset(swap) + swap_offset +
+ i);
__xa_store(&mapping->i_pages, aligned_index + i,
swp_to_radix_entry(tmp), 0);
}
+ cur_order = split_order;
+ split_order = xas_try_split_min_order(split_order);
}
unlock:
xas_unlock_irq(&xas);
- /* split needed, alloc here and retry. */
- if (split_order) {
- xas_split_alloc(&xas, old, split_order, gfp);
- if (xas_error(&xas))
- goto error;
- alloced_shadow = old;
- alloced_order = split_order;
- xas_reset(&xas);
- continue;
- }
-
if (!xas_nomem(&xas, gfp))
break;
}
-error:
if (xas_error(&xas))
return xas_error(&xas);
- return alloced_order;
+ return entry_order;
}
/*
@@ -2123,8 +2236,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct shmem_inode_info *info = SHMEM_I(inode);
struct swap_info_struct *si;
struct folio *folio = NULL;
+ bool skip_swapcache = false;
swp_entry_t swap;
- int error, nr_pages;
+ int error, nr_pages, order, split_order;
VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
swap = radix_to_swp_entry(*foliop);
@@ -2143,8 +2257,10 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
/* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0);
+ order = xa_get_order(&mapping->i_pages, index);
if (!folio) {
- int split_order;
+ int nr_pages = 1 << order;
+ bool fallback_order0 = false;
/* Or update major stats only when swapin succeeds?? */
if (fault_type) {
@@ -2154,6 +2270,36 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
}
/*
+ * If uffd is active for the vma, we need per-page fault
+ * fidelity to maintain the uffd semantics, so fall back to
+ * swapping in an order-0 folio; the same applies when zswap
+ * is enabled. Any existing sub folio in the swap cache also
+ * blocks mTHP swapin.
+ */
+ if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
+ !zswap_never_enabled() ||
+ non_swapcache_batch(swap, nr_pages) != nr_pages))
+ fallback_order0 = true;
+
+ /* Skip swapcache for synchronous device. */
+ if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
+ folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp);
+ if (!IS_ERR(folio)) {
+ skip_swapcache = true;
+ goto alloced;
+ }
+
+ /*
+ * Fall back to swapping in an order-0 folio unless the swap
+ * entry already exists.
+ */
+ error = PTR_ERR(folio);
+ folio = NULL;
+ if (error == -EEXIST)
+ goto failed;
+ }
+
+ /*
* Now swap device can only swap in order 0 folio, then we
* should split the large swap entry stored in the pagecache
* if necessary.
@@ -2181,13 +2327,40 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
error = -ENOMEM;
goto failed;
}
+ } else if (order != folio_order(folio)) {
+ /*
+ * Swap readahead may swap in order-0 folios into the swapcache
+ * asynchronously, while the shmem mapping can still store
+ * large swap entries. In such cases, we should split the
+ * large swap entry to prevent possible data corruption.
+ */
+ split_order = shmem_split_large_entry(inode, index, swap, gfp);
+ if (split_order < 0) {
+ folio_put(folio);
+ folio = NULL;
+ error = split_order;
+ goto failed;
+ }
+
+ /*
+ * If the large swap entry has already been split, it is
+ * necessary to recalculate the new swap entry based on
+ * the old order alignment.
+ */
+ if (split_order > 0) {
+ pgoff_t offset = index - round_down(index, 1 << split_order);
+
+ swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
+ }
}
+alloced:
/* We have to do this with folio locked to prevent races */
folio_lock(folio);
- if (!folio_test_swapcache(folio) ||
+ if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
folio->swap.val != swap.val ||
- !shmem_confirm_swap(mapping, index, swap)) {
+ !shmem_confirm_swap(mapping, index, swap) ||
+ xa_get_order(&mapping->i_pages, index) != folio_order(folio)) {
error = -EEXIST;
goto unlock;
}
@@ -2221,7 +2394,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
- delete_from_swap_cache(folio);
+ if (skip_swapcache) {
+ folio->swap.val = 0;
+ swapcache_clear(si, swap, nr_pages);
+ } else {
+ delete_from_swap_cache(folio);
+ }
folio_mark_dirty(folio);
swap_free_nr(swap, nr_pages);
put_swap_device(si);
@@ -2232,8 +2410,11 @@ failed:
if (!shmem_confirm_swap(mapping, index, swap))
error = -EEXIST;
if (error == -EIO)
- shmem_set_folio_swapin_error(inode, index, folio, swap);
+ shmem_set_folio_swapin_error(inode, index, folio, swap,
+ skip_swapcache);
unlock:
+ if (skip_swapcache)
+ swapcache_clear(si, swap, folio_nr_pages(folio));
if (folio) {
folio_unlock(folio);
folio_put(folio);
@@ -2749,12 +2930,6 @@ out_nomem:
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
- struct shmem_inode_info *info = SHMEM_I(inode);
- int ret;
-
- ret = seal_check_write(info->seals, vma);
- if (ret)
- return ret;
file_accessed(file);
/* This is anonymous shared memory if it is unlinked at the time of mmap */
@@ -3118,8 +3293,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
if (ret)
return ret;
- if (folio_test_hwpoison(folio) ||
- (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
+ if (folio_contain_hwpoisoned_page(folio)) {
folio_unlock(folio);
folio_put(folio);
return -EIO;
@@ -3326,7 +3500,7 @@ static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
size = min_t(size_t, size, PAGE_SIZE - offset);
- if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ if (!pipe_is_full(pipe)) {
struct pipe_buffer *buf = pipe_head_buf(pipe);
*buf = (struct pipe_buffer) {
@@ -3353,7 +3527,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
int error = 0;
/* Work out how much data we can actually add into the pipe */
- used = pipe_occupancy(pipe->head, pipe->tail);
+ used = pipe_buf_usage(pipe);
npages = max_t(ssize_t, pipe->max_usage - used, 0);
len = min_t(size_t, len, npages * PAGE_SIZE);
@@ -3440,7 +3614,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
total_spliced += n;
*ppos += n;
in->f_ra.prev_pos = *ppos;
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (pipe_is_full(pipe))
break;
cond_resched();
@@ -3597,7 +3771,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
index--;
/*
- * Inform shmem_writepage() how far we have reached.
+ * Inform shmem_writeout() how far we have reached.
* No need for lock or barrier: we have the page lock.
*/
if (!folio_test_uptodate(folio))
@@ -3728,16 +3902,16 @@ out_iput:
return error;
}
-static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int error;
error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
if (error)
- return error;
+ return ERR_PTR(error);
inc_nlink(dir);
- return 0;
+ return NULL;
}
static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
@@ -3818,7 +3992,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
{
- if (!simple_offset_empty(dentry))
+ if (!simple_empty(dentry))
return -ENOTEMPTY;
drop_nlink(d_inode(dentry));
@@ -3875,7 +4049,7 @@ static int shmem_rename2(struct mnt_idmap *idmap,
return simple_offset_rename_exchange(old_dir, old_dentry,
new_dir, new_dentry);
- if (!simple_offset_empty(new_dentry))
+ if (!simple_empty(new_dentry))
return -ENOTEMPTY;
if (flags & RENAME_WHITEOUT) {
@@ -3914,6 +4088,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
int len;
struct inode *inode;
struct folio *folio;
+ char *link;
len = strlen(symname) + 1;
if (len > PAGE_SIZE)
@@ -3935,12 +4110,13 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode->i_size = len-1;
if (len <= SHORT_SYMLINK_LEN) {
- inode->i_link = kmemdup(symname, len, GFP_KERNEL);
- if (!inode->i_link) {
+ link = kmemdup(symname, len, GFP_KERNEL);
+ if (!link) {
error = -ENOMEM;
goto out_remove_offset;
}
inode->i_op = &shmem_short_symlink_operations;
+ inode_set_cached_link(inode, link, len - 1);
} else {
inode_nohighmem(inode);
inode->i_mapping->a_ops = &shmem_aops;
@@ -4365,7 +4541,7 @@ static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *
bool latest_version)
{
struct shmem_options *ctx = fc->fs_private;
- unsigned int version = UTF8_LATEST;
+ int version = UTF8_LATEST;
struct unicode_map *encoding;
char *version_str = param->string + 5;
@@ -4580,48 +4756,37 @@ bad_value:
return invalfc(fc, "Bad value for '%s'", param->key);
}
-static int shmem_parse_options(struct fs_context *fc, void *data)
+static char *shmem_next_opt(char **s)
{
- char *options = data;
+ char *sbegin = *s;
+ char *p;
- if (options) {
- int err = security_sb_eat_lsm_opts(options, &fc->security);
- if (err)
- return err;
- }
+ if (sbegin == NULL)
+ return NULL;
- while (options != NULL) {
- char *this_char = options;
- for (;;) {
- /*
- * NUL-terminate this option: unfortunately,
- * mount options form a comma-separated list,
- * but mpol's nodelist may also contain commas.
- */
- options = strchr(options, ',');
- if (options == NULL)
- break;
- options++;
- if (!isdigit(*options)) {
- options[-1] = '\0';
- break;
- }
- }
- if (*this_char) {
- char *value = strchr(this_char, '=');
- size_t len = 0;
- int err;
-
- if (value) {
- *value++ = '\0';
- len = strlen(value);
- }
- err = vfs_parse_fs_string(fc, this_char, value, len);
- if (err < 0)
- return err;
+ /*
+ * NUL-terminate this option: unfortunately,
+ * mount options form a comma-separated list,
+ * but mpol's nodelist may also contain commas.
+ */
+ for (;;) {
+ p = strchr(*s, ',');
+ if (p == NULL)
+ break;
+ *s = p + 1;
+ if (!isdigit(*(p+1))) {
+ *p = '\0';
+ return sbegin;
}
}
- return 0;
+
+ *s = NULL;
+ return sbegin;
+}
+
+static int shmem_parse_monolithic(struct fs_context *fc, void *data)
+{
+ return vfs_parse_monolithic_sep(fc, data, shmem_next_opt);
}
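
To illustrate the separator rule the helper implements (a comma only ends an option when the next character is not a digit, so an mpol nodelist such as 0-1,3 stays in one piece), here is a small stand-alone sketch of the same splitting logic; the option string is only an example:

    #include <stdio.h>
    #include <string.h>
    #include <ctype.h>

    /* same rule as shmem_next_opt(): a ',' followed by a digit is part of the value */
    static char *next_opt(char **s)
    {
            char *sbegin = *s, *p;

            if (!sbegin)
                    return NULL;
            for (p = strchr(sbegin, ','); p; p = strchr(p + 1, ',')) {
                    if (!isdigit((unsigned char)p[1])) {
                            *p = '\0';
                            *s = p + 1;
                            return sbegin;
                    }
            }
            *s = NULL;
            return sbegin;
    }

    int main(void)
    {
            char buf[] = "size=1g,mpol=bind:0-1,3,huge=within_size";
            char *s = buf, *opt;

            while ((opt = next_opt(&s)))
                    printf("option: %s\n", opt);  /* size=1g / mpol=bind:0-1,3 / huge=within_size */
            return 0;
    }
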
/*
@@ -4888,7 +5053,12 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sbinfo->gid = ctx->gid;
sbinfo->full_inums = ctx->full_inums;
sbinfo->mode = ctx->mode;
- sbinfo->huge = ctx->huge;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (ctx->seen & SHMEM_SEEN_HUGE)
+ sbinfo->huge = ctx->huge;
+ else
+ sbinfo->huge = tmpfs_huge;
+#endif
sbinfo->mpol = ctx->mpol;
ctx->mpol = NULL;
@@ -4966,7 +5136,7 @@ static const struct fs_context_operations shmem_fs_context_ops = {
.free = shmem_free_fc,
.get_tree = shmem_get_tree,
#ifdef CONFIG_TMPFS
- .parse_monolithic = shmem_parse_options,
+ .parse_monolithic = shmem_parse_monolithic,
.parse_param = shmem_parse_one,
.reconfigure = shmem_reconfigure,
#endif
@@ -5024,7 +5194,6 @@ static int shmem_error_remove_folio(struct address_space *mapping,
}
static const struct address_space_operations shmem_aops = {
- .writepage = shmem_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_TMPFS
.write_begin = shmem_write_begin,
@@ -5439,6 +5608,21 @@ static int __init setup_transparent_hugepage_shmem(char *str)
}
__setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem);
+static int __init setup_transparent_hugepage_tmpfs(char *str)
+{
+ int huge;
+
+ huge = shmem_parse_huge(str);
+ if (huge < 0) {
+ pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n");
+ return huge;
+ }
+
+ tmpfs_huge = huge;
+ return 1;
+}
+__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs);
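
For reference, the new boot parameter reuses shmem_parse_huge(), so it accepts the same keywords as the tmpfs huge= mount option; for example, a command line such as

    transparent_hugepage_tmpfs=within_size

sets the default huge policy for tmpfs mounts that do not pass an explicit huge= option (see the SHMEM_SEEN_HUGE handling in shmem_fill_super() further down in this patch).
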
+
static char str_dup[PAGE_SIZE] __initdata;
static int __init setup_thp_shmem(char *str)
{
@@ -5479,19 +5663,19 @@ static int __init setup_thp_shmem(char *str)
THP_ORDERS_ALL_FILE_DEFAULT);
}
- if (start == -EINVAL) {
+ if (start < 0) {
pr_err("invalid size %s in thp_shmem boot parameter\n",
start_size);
goto err;
}
- if (end == -EINVAL) {
+ if (end < 0) {
pr_err("invalid size %s in thp_shmem boot parameter\n",
end_size);
goto err;
}
- if (start < 0 || end < 0 || start > end)
+ if (start > end)
goto err;
nr = end - start + 1;
@@ -5628,12 +5812,12 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
if (size < 0 || size > MAX_LFS_FILESIZE)
return ERR_PTR(-EINVAL);
- if (shmem_acct_size(flags, size))
- return ERR_PTR(-ENOMEM);
-
if (is_idmapped_mnt(mnt))
return ERR_PTR(-EINVAL);
+ if (shmem_acct_size(flags, size))
+ return ERR_PTR(-ENOMEM);
+
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
S_IFREG | S_IRWXUGO, 0, flags);
if (IS_ERR(inode)) {
@@ -5658,7 +5842,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
* underlying inode. So users of this interface must do LSM checks at a
* higher layer. The users are the big_key and shm implementations. LSM
* checks are provided at the key or shm level rather than the inode.
- * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
*/
@@ -5670,7 +5854,7 @@ EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
/**
* shmem_file_setup - get an unlinked file living in tmpfs
- * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
*/
@@ -5683,7 +5867,7 @@ EXPORT_SYMBOL_GPL(shmem_file_setup);
/**
* shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
* @mnt: the tmpfs mount where the file will be created
- * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
*/
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 43afb56abbd3..0cf8bf5d832d 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -94,26 +94,20 @@ void si_meminfo_node(struct sysinfo *val, int nid)
unsigned long free_highpages = 0;
pg_data_t *pgdat = NODE_DATA(nid);
- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
- managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
- val->totalram = managed_pages;
- val->sharedram = node_page_state(pgdat, NR_SHMEM);
- val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
-#ifdef CONFIG_HIGHMEM
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
-
+ managed_pages += zone_managed_pages(zone);
if (is_highmem(zone)) {
managed_highpages += zone_managed_pages(zone);
free_highpages += zone_page_state(zone, NR_FREE_PAGES);
}
}
+
+ val->totalram = managed_pages;
+ val->sharedram = node_page_state(pgdat, NR_SHMEM);
+ val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
val->totalhigh = managed_highpages;
val->freehigh = free_highpages;
-#else
- val->totalhigh = managed_highpages;
- val->freehigh = free_highpages;
-#endif
val->mem_unit = PAGE_SIZE;
}
#endif
@@ -223,7 +217,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
global_node_page_state(NR_SHMEM),
global_node_page_state(NR_PAGETABLE),
global_node_page_state(NR_SECONDARY_PAGETABLE),
- global_zone_page_state(NR_BOUNCE),
+ 0UL,
global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
global_zone_page_state(NR_FREE_PAGES),
free_pcp,
@@ -260,6 +254,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
" pagetables:%lukB"
" sec_pagetables:%lukB"
" all_unreclaimable? %s"
+ " Balloon:%lukB"
"\n",
pgdat->node_id,
K(node_page_state(pgdat, NR_ACTIVE_ANON)),
@@ -285,7 +280,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
#endif
K(node_page_state(pgdat, NR_PAGETABLE)),
K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
- str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES));
+ str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES),
+ K(node_page_state(pgdat, NR_BALLOON_PAGES)));
}
for_each_populated_zone(zone) {
@@ -309,6 +305,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
" low:%lukB"
" high:%lukB"
" reserved_highatomic:%luKB"
+ " free_highatomic:%luKB"
" active_anon:%lukB"
" inactive_anon:%lukB"
" active_file:%lukB"
@@ -330,6 +327,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
K(zone->nr_reserved_highatomic),
+ K(zone->nr_free_highatomic),
K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
@@ -339,7 +337,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(zone->present_pages),
K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_BOUNCE)),
+ 0UL,
K(free_pcp),
K(this_cpu_read(zone->per_cpu_pageset->count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index 4a85b94d12ce..20eaee3e97f7 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -195,8 +195,6 @@ int shrinker_debugfs_add(struct shrinker *shrinker)
int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
{
- struct dentry *entry;
- char buf[128];
const char *new, *old;
va_list ap;
int ret = 0;
@@ -213,23 +211,17 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
old = shrinker->name;
shrinker->name = new;
- if (shrinker->debugfs_entry) {
- snprintf(buf, sizeof(buf), "%s-%d", shrinker->name,
- shrinker->debugfs_id);
-
- entry = debugfs_rename(shrinker_debugfs_root,
- shrinker->debugfs_entry,
- shrinker_debugfs_root, buf);
- if (IS_ERR(entry))
- ret = PTR_ERR(entry);
- else
- shrinker->debugfs_entry = entry;
- }
+ ret = debugfs_change_name(shrinker->debugfs_entry, "%s-%d",
+ shrinker->name, shrinker->debugfs_id);
+ if (ret) {
+ shrinker->name = old;
+ kfree_const(new);
+ } else {
+ kfree_const(old);
+ }
mutex_unlock(&shrinker_mutex);
- kfree_const(old);
-
return ret;
}
EXPORT_SYMBOL(shrinker_debugfs_rename);
diff --git a/mm/slab.h b/mm/slab.h
index 632fedd71fea..05a21dc796e0 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -128,7 +128,7 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)
/**
* slab_folio - The folio allocated for a slab
- * @slab: The slab.
+ * @s: The slab.
*
* Slabs are allocated as folios that contain the individual objects and are
* using some fields in the first struct page of the folio - those fields are
@@ -159,7 +159,7 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)
/**
* slab_page - The first struct page allocated for a slab
- * @slab: The slab.
+ * @s: The slab.
*
* A convenience wrapper for converting slab to the first struct page of the
* underlying folio, to communicate with code not yet converted to folio or
@@ -457,39 +457,17 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
}
-/* Legal flag mask for kmem_cache_create(), for various configurations */
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_PANIC | \
- SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )
+ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
+ SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
+ SLAB_TEMPORARY | SLAB_ACCOUNT | \
+ SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
-#ifdef CONFIG_SLUB_DEBUG
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
-#else
-#define SLAB_DEBUG_FLAGS (0)
-#endif
-#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | SLAB_ACCOUNT | \
- SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
-
-/* Common flags available with current configuration */
-#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
-
-/* Common flags permitted for kmem_cache_create */
-#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \
- SLAB_RED_ZONE | \
- SLAB_POISON | \
- SLAB_STORE_USER | \
- SLAB_TRACE | \
- SLAB_CONSISTENCY_CHECKS | \
- SLAB_NOLEAKTRACE | \
- SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | \
- SLAB_ACCOUNT | \
- SLAB_KMALLOC | \
- SLAB_NO_MERGE | \
- SLAB_NO_USER_FLAGS)
+#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS)
bool __kmem_cache_empty(struct kmem_cache *);
int __kmem_cache_shutdown(struct kmem_cache *);
@@ -604,6 +582,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects, struct slabobj_ext *obj_exts);
#endif
+void kvfree_rcu_cb(struct rcu_head *head);
+
size_t __ksize(const void *objp);
static inline size_t slab_ksize(const struct kmem_cache *s)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a29457bef626..bfe7c40eeee1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -2,7 +2,7 @@
/*
* Slab allocator functions that are independent of the allocator strategy
*
- * (C) 2012 Christoph Lameter <cl@linux.com>
+ * (C) 2012 Christoph Lameter <cl@gentwo.org>
*/
#include <linux/slab.h>
@@ -28,7 +28,9 @@
#include <asm/page.h>
#include <linux/memcontrol.h>
#include <linux/stackdepot.h>
+#include <trace/events/rcu.h>
+#include "../kernel/rcu/rcu.h"
#include "internal.h"
#include "slab.h"
@@ -296,6 +298,8 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
static_branch_enable(&slub_debug_enabled);
if (flags & SLAB_STORE_USER)
stack_depot_init();
+#else
+ flags &= ~SLAB_DEBUG_FLAGS;
#endif
mutex_lock(&slab_mutex);
@@ -305,20 +309,11 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
goto out_unlock;
}
- /* Refuse requests with allocator specific flags */
if (flags & ~SLAB_FLAGS_PERMITTED) {
err = -EINVAL;
goto out_unlock;
}
- /*
- * Some allocators will constraint the set of valid flags to a subset
- * of all flags. We expect them to define CACHE_CREATE_MASK in this
- * case, and we'll just provide them with a sanitized version of the
- * passed flags.
- */
- flags &= CACHE_CREATE_MASK;
-
/* Fail closed on bad usersize of useroffset values. */
if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
WARN_ON(!args->usersize && args->useroffset) ||
@@ -1282,3 +1277,908 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
EXPORT_TRACEPOINT_SYMBOL(kfree);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
+#ifndef CONFIG_KVFREE_RCU_BATCHED
+
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+{
+ if (head) {
+ kasan_record_aux_stack(ptr);
+ call_rcu(head, kvfree_rcu_cb);
+ return;
+ }
+
+ // kvfree_rcu(one_arg) call.
+ might_sleep();
+ synchronize_rcu();
+ kvfree(ptr);
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
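
Callers normally reach this through the kvfree_rcu() wrappers rather than calling kvfree_call_rcu() directly. A hedged usage sketch, assuming the kvfree_rcu()/kvfree_rcu_mightsleep() macros from include/linux/rcupdate.h; struct foo and its field names are placeholders:

    #include <linux/rcupdate.h>

    struct foo {
            int payload;
            struct rcu_head rcu;
    };

    static void drop_foo(struct foo *p)
    {
            /* double-argument form: usable from atomic context, batched per CPU */
            kvfree_rcu(p, rcu);
    }

    static void drop_foo_sleepable(struct foo *p)
    {
            /* single-argument form: may sleep, can fall back to synchronize_rcu() */
            kvfree_rcu_mightsleep(p);
    }
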
+
+void __init kvfree_rcu_init(void)
+{
+}
+
+#else /* CONFIG_KVFREE_RCU_BATCHED */
+
+/*
+ * This RCU parameter is runtime-read-only. It sets the
+ * minimum number of objects that can be cached per CPU;
+ * each cached object is one page in size. The value can
+ * only be changed at boot time.
+ */
+static int rcu_min_cached_objs = 5;
+module_param(rcu_min_cached_objs, int, 0444);
+
+// A page shrinker can ask for pages to be freed to make them
+// available for other parts of the system. This usually happens
+// under low memory conditions, and in that case we should also
+// defer page-cache filling for a short time period.
+//
+// The default value is 5 seconds, which is long enough to reduce
+// interference with the shrinker while it asks other systems to
+// drain their caches.
+static int rcu_delay_page_cache_fill_msec = 5000;
+module_param(rcu_delay_page_cache_fill_msec, int, 0444);
+
+static struct workqueue_struct *rcu_reclaim_wq;
+
+/* Maximum number of jiffies to wait before draining a batch. */
+#define KFREE_DRAIN_JIFFIES (5 * HZ)
+#define KFREE_N_BATCHES 2
+#define FREE_N_CHANNELS 2
+
+/**
+ * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
+ * @list: List node. All blocks are linked to one another
+ * @gp_snap: Snapshot of RCU state for objects placed into this block
+ * @nr_records: Number of active pointers in the array
+ * @records: Array of the kvfree_rcu() pointers
+ */
+struct kvfree_rcu_bulk_data {
+ struct list_head list;
+ struct rcu_gp_oldstate gp_snap;
+ unsigned long nr_records;
+ void *records[] __counted_by(nr_records);
+};
+
+/*
+ * This macro defines how many entries the "records" array
+ * will contain. It is sized so that a kvfree_rcu_bulk_data
+ * structure occupies exactly one page.
+ */
+#define KVFREE_BULK_MAX_ENTR \
+ ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
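
For a sense of scale: with a 4 KiB page, 64-bit pointers, a two-pointer list_head and a two-word rcu_gp_oldstate, the header is roughly 40 bytes, leaving about 507 record slots per block. A small user-space sketch of that arithmetic (the header size is an assumption, not the exact kernel layout):

    #include <stdio.h>

    int main(void)
    {
            unsigned long page_size = 4096;
            /* list_head (2 pointers) + rcu_gp_oldstate (2 longs) + nr_records */
            unsigned long header = 2 * 8 + 2 * 8 + 8;     /* ~40 bytes, assumed */
            unsigned long max_entries = (page_size - header) / sizeof(void *);

            printf("~%lu records per kvfree_rcu_bulk_data page\n", max_entries);
            return 0;
    }
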
+
+/**
+ * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
+ * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
+ * @head_free: List of kfree_rcu() objects waiting for a grace period
+ * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
+ * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
+ * @krcp: Pointer to @kfree_rcu_cpu structure
+ */
+
+struct kfree_rcu_cpu_work {
+ struct rcu_work rcu_work;
+ struct rcu_head *head_free;
+ struct rcu_gp_oldstate head_free_gp_snap;
+ struct list_head bulk_head_free[FREE_N_CHANNELS];
+ struct kfree_rcu_cpu *krcp;
+};
+
+/**
+ * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
+ * @head: List of kfree_rcu() objects not yet waiting for a grace period
+ * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
+ * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
+ * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
+ * @lock: Synchronize access to this structure
+ * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
+ * @initialized: The @rcu_work fields have been initialized
+ * @head_count: Number of objects in rcu_head singular list
+ * @bulk_count: Number of objects in bulk-list
+ * @bkvcache:
+ * A simple cache list that contains objects for reuse.
+ * To save some per-cpu space the list is singly linked.
+ * Even though the llist itself is lockless, access has to be
+ * protected by the per-cpu lock.
+ * @page_cache_work: A work to refill the cache when it is empty
+ * @backoff_page_cache_fill: Delay cache refills
+ * @work_in_progress: Indicates that page_cache_work is running
+ * @hrtimer: A hrtimer for scheduling a page_cache_work
+ * @nr_bkv_objs: Number of allocated objects in @bkvcache.
+ *
+ * This is a per-CPU structure. The reason that it is not included in
+ * the rcu_data structure is to permit this code to be extracted from
+ * the RCU files. Such extraction could allow further optimization of
+ * the interactions with the slab allocators.
+ */
+struct kfree_rcu_cpu {
+ // Objects queued on a linked list
+ // through their rcu_head structures.
+ struct rcu_head *head;
+ unsigned long head_gp_snap;
+ atomic_t head_count;
+
+ // Objects queued on a bulk-list.
+ struct list_head bulk_head[FREE_N_CHANNELS];
+ atomic_t bulk_count[FREE_N_CHANNELS];
+
+ struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
+ raw_spinlock_t lock;
+ struct delayed_work monitor_work;
+ bool initialized;
+
+ struct delayed_work page_cache_work;
+ atomic_t backoff_page_cache_fill;
+ atomic_t work_in_progress;
+ struct hrtimer hrtimer;
+
+ struct llist_head bkvcache;
+ int nr_bkv_objs;
+};
+
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
+};
+
+static __always_inline void
+debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
+{
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+ int i;
+
+ for (i = 0; i < bhead->nr_records; i++)
+ debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
+#endif
+}
+
+static inline struct kfree_rcu_cpu *
+krc_this_cpu_lock(unsigned long *flags)
+{
+ struct kfree_rcu_cpu *krcp;
+
+ local_irq_save(*flags); // For safely calling this_cpu_ptr().
+ krcp = this_cpu_ptr(&krc);
+ raw_spin_lock(&krcp->lock);
+
+ return krcp;
+}
+
+static inline void
+krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
+{
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+static inline struct kvfree_rcu_bulk_data *
+get_cached_bnode(struct kfree_rcu_cpu *krcp)
+{
+ if (!krcp->nr_bkv_objs)
+ return NULL;
+
+ WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
+ return (struct kvfree_rcu_bulk_data *)
+ llist_del_first(&krcp->bkvcache);
+}
+
+static inline bool
+put_cached_bnode(struct kfree_rcu_cpu *krcp,
+ struct kvfree_rcu_bulk_data *bnode)
+{
+ // Check the limit.
+ if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
+ return false;
+
+ llist_add((struct llist_node *) bnode, &krcp->bkvcache);
+ WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
+ return true;
+}
+
+static int
+drain_page_cache(struct kfree_rcu_cpu *krcp)
+{
+ unsigned long flags;
+ struct llist_node *page_list, *pos, *n;
+ int freed = 0;
+
+ if (!rcu_min_cached_objs)
+ return 0;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ page_list = llist_del_all(&krcp->bkvcache);
+ WRITE_ONCE(krcp->nr_bkv_objs, 0);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ llist_for_each_safe(pos, n, page_list) {
+ free_page((unsigned long)pos);
+ freed++;
+ }
+
+ return freed;
+}
+
+static void
+kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
+ struct kvfree_rcu_bulk_data *bnode, int idx)
+{
+ unsigned long flags;
+ int i;
+
+ if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
+ debug_rcu_bhead_unqueue(bnode);
+ rcu_lock_acquire(&rcu_callback_map);
+ if (idx == 0) { // kmalloc() / kfree().
+ trace_rcu_invoke_kfree_bulk_callback(
+ "slab", bnode->nr_records,
+ bnode->records);
+
+ kfree_bulk(bnode->nr_records, bnode->records);
+ } else { // vmalloc() / vfree().
+ for (i = 0; i < bnode->nr_records; i++) {
+ trace_rcu_invoke_kvfree_callback(
+ "slab", bnode->records[i], 0);
+
+ vfree(bnode->records[i]);
+ }
+ }
+ rcu_lock_release(&rcu_callback_map);
+ }
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ if (put_cached_bnode(krcp, bnode))
+ bnode = NULL;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (bnode)
+ free_page((unsigned long) bnode);
+
+ cond_resched_tasks_rcu_qs();
+}
+
+static void
+kvfree_rcu_list(struct rcu_head *head)
+{
+ struct rcu_head *next;
+
+ for (; head; head = next) {
+ void *ptr = (void *) head->func;
+ unsigned long offset = (void *) head - ptr;
+
+ next = head->next;
+ debug_rcu_head_unqueue((struct rcu_head *)ptr);
+ rcu_lock_acquire(&rcu_callback_map);
+ trace_rcu_invoke_kvfree_callback("slab", head, offset);
+
+ kvfree(ptr);
+
+ rcu_lock_release(&rcu_callback_map);
+ cond_resched_tasks_rcu_qs();
+ }
+}
+
+/*
+ * This function is invoked in workqueue context after a grace period.
+ * It frees all the objects queued on ->bulk_head_free or ->head_free.
+ */
+static void kfree_rcu_work(struct work_struct *work)
+{
+ unsigned long flags;
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct list_head bulk_head[FREE_N_CHANNELS];
+ struct rcu_head *head;
+ struct kfree_rcu_cpu *krcp;
+ struct kfree_rcu_cpu_work *krwp;
+ struct rcu_gp_oldstate head_gp_snap;
+ int i;
+
+ krwp = container_of(to_rcu_work(work),
+ struct kfree_rcu_cpu_work, rcu_work);
+ krcp = krwp->krcp;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ // Channels 1 and 2.
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
+
+ // Channel 3.
+ head = krwp->head_free;
+ krwp->head_free = NULL;
+ head_gp_snap = krwp->head_free_gp_snap;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ // Handle the first two channels.
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ // Start from the tail page, so a GP has likely already passed for it.
+ list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
+ }
+
+ /*
+ * This is used when the "bulk" path cannot be used for a
+ * double-argument kvfree_rcu() call. This happens when the
+ * page-cache is empty, which means that objects are instead
+ * queued on a linked list through their rcu_head structures.
+ * This list is named "Channel 3".
+ */
+ if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
+ kvfree_rcu_list(head);
+}
+
+static bool
+need_offload_krc(struct kfree_rcu_cpu *krcp)
+{
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (!list_empty(&krcp->bulk_head[i]))
+ return true;
+
+ return !!READ_ONCE(krcp->head);
+}
+
+static bool
+need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
+{
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (!list_empty(&krwp->bulk_head_free[i]))
+ return true;
+
+ return !!krwp->head_free;
+}
+
+static int krc_count(struct kfree_rcu_cpu *krcp)
+{
+ int sum = atomic_read(&krcp->head_count);
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ sum += atomic_read(&krcp->bulk_count[i]);
+
+ return sum;
+}
+
+static void
+__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+{
+ long delay, delay_left;
+
+ delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
+ if (delayed_work_pending(&krcp->monitor_work)) {
+ delay_left = krcp->monitor_work.timer.expires - jiffies;
+ if (delay < delay_left)
+ mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
+ return;
+ }
+ queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
+}
+
+static void
+schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ __schedule_delayed_monitor_work(krcp);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+static void
+kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
+{
+ struct list_head bulk_ready[FREE_N_CHANNELS];
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct rcu_head *head_ready = NULL;
+ unsigned long flags;
+ int i;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ INIT_LIST_HEAD(&bulk_ready[i]);
+
+ list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
+ if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
+ break;
+
+ atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
+ list_move(&bnode->list, &bulk_ready[i]);
+ }
+ }
+
+ if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
+ head_ready = krcp->head;
+ atomic_set(&krcp->head_count, 0);
+ WRITE_ONCE(krcp->head, NULL);
+ }
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
+ }
+
+ if (head_ready)
+ kvfree_rcu_list(head_ready);
+}
+
+/*
+ * Return: %true if a work is queued, %false otherwise.
+ */
+static bool
+kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
+{
+ unsigned long flags;
+ bool queued = false;
+ int i, j;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+
+ // Attempt to start a new batch.
+ for (i = 0; i < KFREE_N_BATCHES; i++) {
+ struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
+
+ // Try to detach bulk_head or head and attach it, only when
+ // all channels are free. If any channel is not free, there is
+ // still on-going RCU work at this krwp handling its free business.
+ if (need_wait_for_krwp_work(krwp))
+ continue;
+
+ // kvfree_rcu_drain_ready() might handle this krcp, if so give up.
+ if (need_offload_krc(krcp)) {
+ // Channel 1 corresponds to the SLAB-pointer bulk path.
+ // Channel 2 corresponds to vmalloc-pointer bulk path.
+ for (j = 0; j < FREE_N_CHANNELS; j++) {
+ if (list_empty(&krwp->bulk_head_free[j])) {
+ atomic_set(&krcp->bulk_count[j], 0);
+ list_replace_init(&krcp->bulk_head[j],
+ &krwp->bulk_head_free[j]);
+ }
+ }
+
+ // Channel 3 corresponds to both SLAB and vmalloc
+ // objects queued on the linked list.
+ if (!krwp->head_free) {
+ krwp->head_free = krcp->head;
+ get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
+ atomic_set(&krcp->head_count, 0);
+ WRITE_ONCE(krcp->head, NULL);
+ }
+
+ // One work item serves one batch, and a batch can handle
+ // three "free channels". Break the loop since we are done
+ // with this CPU; queuing the RCU work here is _always_ a
+ // success.
+ queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work);
+ WARN_ON_ONCE(!queued);
+ break;
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ return queued;
+}
+
+/*
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ */
+static void kfree_rcu_monitor(struct work_struct *work)
+{
+ struct kfree_rcu_cpu *krcp = container_of(work,
+ struct kfree_rcu_cpu, monitor_work.work);
+
+ // Drain ready for reclaim.
+ kvfree_rcu_drain_ready(krcp);
+
+ // Queue a batch for the rest.
+ kvfree_rcu_queue_batch(krcp);
+
+ // If there is nothing left to detach, our job here is done.
+ // If at least one of the channels is still busy, rearm the
+ // work to repeat the attempt, because previous batches are
+ // still in progress.
+ if (need_offload_krc(krcp))
+ schedule_delayed_monitor_work(krcp);
+}
+
+static void fill_page_cache_func(struct work_struct *work)
+{
+ struct kvfree_rcu_bulk_data *bnode;
+ struct kfree_rcu_cpu *krcp =
+ container_of(work, struct kfree_rcu_cpu,
+ page_cache_work.work);
+ unsigned long flags;
+ int nr_pages;
+ bool pushed;
+ int i;
+
+ nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
+ 1 : rcu_min_cached_objs;
+
+ for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
+ bnode = (struct kvfree_rcu_bulk_data *)
+ __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+
+ if (!bnode)
+ break;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ pushed = put_cached_bnode(krcp, bnode);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (!pushed) {
+ free_page((unsigned long) bnode);
+ break;
+ }
+ }
+
+ atomic_set(&krcp->work_in_progress, 0);
+ atomic_set(&krcp->backoff_page_cache_fill, 0);
+}
+
+// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
+// state specified by flags. If can_alloc is true, the caller must
+// be schedulable and not be holding any locks or mutexes that might be
+// acquired by the memory allocator or anything that it might invoke.
+// Returns true if ptr was successfully recorded, else the caller must
+// use a fallback.
+static inline bool
+add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
+ unsigned long *flags, void *ptr, bool can_alloc)
+{
+ struct kvfree_rcu_bulk_data *bnode;
+ int idx;
+
+ *krcp = krc_this_cpu_lock(flags);
+ if (unlikely(!(*krcp)->initialized))
+ return false;
+
+ idx = !!is_vmalloc_addr(ptr);
+ bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
+ struct kvfree_rcu_bulk_data, list);
+
+ /* Check if a new block is required. */
+ if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
+ bnode = get_cached_bnode(*krcp);
+ if (!bnode && can_alloc) {
+ krc_this_cpu_unlock(*krcp, *flags);
+
+ // __GFP_NORETRY - allows light-weight direct reclaim,
+ // which is fine since it minimizes how often the
+ // fallback path is hit. It also forbids any OOM
+ // invocation, which is beneficial since we are about
+ // to release memory soon.
+ //
+ // __GFP_NOMEMALLOC - prevents consuming all the memory
+ // reserves. Please note we have a fallback path.
+ //
+ // __GFP_NOWARN - the allocation is expected to fail
+ // under low-memory or high memory-pressure scenarios,
+ // so do not warn about it.
+ bnode = (struct kvfree_rcu_bulk_data *)
+ __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
+ }
+
+ if (!bnode)
+ return false;
+
+ // Initialize the new block and attach it.
+ bnode->nr_records = 0;
+ list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
+ }
+
+ // Finally insert and update the GP for this page.
+ bnode->nr_records++;
+ bnode->records[bnode->nr_records - 1] = ptr;
+ get_state_synchronize_rcu_full(&bnode->gp_snap);
+ atomic_inc(&(*krcp)->bulk_count[idx]);
+
+ return true;
+}
+
+static enum hrtimer_restart
+schedule_page_work_fn(struct hrtimer *t)
+{
+ struct kfree_rcu_cpu *krcp =
+ container_of(t, struct kfree_rcu_cpu, hrtimer);
+
+ queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
+ return HRTIMER_NORESTART;
+}
+
+static void
+run_page_cache_worker(struct kfree_rcu_cpu *krcp)
+{
+ // If cache disabled, bail out.
+ if (!rcu_min_cached_objs)
+ return;
+
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+ !atomic_xchg(&krcp->work_in_progress, 1)) {
+ if (atomic_read(&krcp->backoff_page_cache_fill)) {
+ queue_delayed_work(rcu_reclaim_wq,
+ &krcp->page_cache_work,
+ msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
+ } else {
+ hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
+ hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+ }
+ }
+}
+
+void __init kfree_rcu_scheduler_running(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ if (need_offload_krc(krcp))
+ schedule_delayed_monitor_work(krcp);
+ }
+}
+
+/*
+ * Queue a request for lazy invocation of the appropriate free routine
+ * after a grace period. Please note that three paths are maintained,
+ * two for the common case using arrays of pointers and a third one that
+ * is used only when the main paths cannot be used, for example, due to
+ * memory pressure.
+ *
+ * Each kvfree_call_rcu() request is added to a batch. The batch is drained
+ * every KFREE_DRAIN_JIFFIES. All the objects in a batch are freed in
+ * workqueue context. Batching requests together reduces the number of grace
+ * periods needed during heavy kfree_rcu()/kvfree_rcu() load.
+ */
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+{
+ unsigned long flags;
+ struct kfree_rcu_cpu *krcp;
+ bool success;
+
+ /*
+ * Please note there is a limitation for the head-less
+ * variant, which is why there is a clear rule for such
+ * objects: they can be used from might_sleep() context
+ * only. For other places please embed an rcu_head into
+ * your data.
+ */
+ if (!head)
+ might_sleep();
+
+ // Queue the object but don't yet schedule the batch.
+ if (debug_rcu_head_queue(ptr)) {
+ // Probable double kfree_rcu(), just leak.
+ WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
+ __func__, head);
+
+ // Mark as success and leave.
+ return;
+ }
+
+ kasan_record_aux_stack(ptr);
+ success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
+ if (!success) {
+ run_page_cache_worker(krcp);
+
+ if (head == NULL)
+ // Inline if kvfree_rcu(one_arg) call.
+ goto unlock_return;
+
+ head->func = ptr;
+ head->next = krcp->head;
+ WRITE_ONCE(krcp->head, head);
+ atomic_inc(&krcp->head_count);
+
+ // Take a snapshot for this krcp.
+ krcp->head_gp_snap = get_state_synchronize_rcu();
+ success = true;
+ }
+
+ /*
+ * The kvfree_rcu() caller considers the pointer freed at this point
+ * and likely removes any references to it. Since the actual slab
+ * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
+ * this object (no scanning or false positives reporting).
+ */
+ kmemleak_ignore(ptr);
+
+ // Set timer to drain after KFREE_DRAIN_JIFFIES.
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
+ __schedule_delayed_monitor_work(krcp);
+
+unlock_return:
+ krc_this_cpu_unlock(krcp, flags);
+
+ /*
+ * Inline kvfree() after synchronize_rcu(). We can do
+ * it from might_sleep() context only, so the current
+ * CPU can pass the QS state.
+ */
+ if (!success) {
+ debug_rcu_head_unqueue((struct rcu_head *) ptr);
+ synchronize_rcu();
+ kvfree(ptr);
+ }
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+
+/**
+ * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
+ *
+ * Note that the single-argument form of kvfree_rcu() has a slow path that
+ * triggers synchronize_rcu() followed by freeing the pointer, and this is
+ * done before the function returns. Therefore, for any single-argument
+ * call that will result in a kfree() to a cache that is to be destroyed
+ * during module exit, it is the developer's responsibility to ensure that
+ * all such calls have returned before the call to kmem_cache_destroy().
+ */
+void kvfree_rcu_barrier(void)
+{
+ struct kfree_rcu_cpu_work *krwp;
+ struct kfree_rcu_cpu *krcp;
+ bool queued;
+ int i, cpu;
+
+ /*
+ * First we detach objects and queue them in an RCU batch for
+ * all CPUs. Finally the queued works are flushed for each CPU.
+ *
+ * Please note: if there are outstanding batches for a particular
+ * CPU, those have to finish first, followed by queuing a new one.
+ */
+ for_each_possible_cpu(cpu) {
+ krcp = per_cpu_ptr(&krc, cpu);
+
+ /*
+ * Check if this CPU has any objects which have been queued for a
+ * new GP completion. If not (nothing to detach), we are done with
+ * it. If any batch is pending/running for this "krcp", the per-CPU
+ * flush_rcu_work() below waits for its completion (see the last
+ * step).
+ */
+ if (!need_offload_krc(krcp))
+ continue;
+
+ while (1) {
+ /*
+ * If we are not able to queue new RCU work it means either:
+ * - batches for this CPU are still in flight and should be
+ * flushed first, after which we repeat; or
+ * - there are no objects left to detach, because of concurrency.
+ */
+ queued = kvfree_rcu_queue_batch(krcp);
+
+ /*
+ * Bail out, if there is no need to offload this "krcp"
+ * anymore. As noted earlier it can run concurrently.
+ */
+ if (queued || !need_offload_krc(krcp))
+ break;
+
+ /* There are ongoing batches. */
+ for (i = 0; i < KFREE_N_BATCHES; i++) {
+ krwp = &(krcp->krw_arr[i]);
+ flush_rcu_work(&krwp->rcu_work);
+ }
+ }
+ }
+
+ /*
+ * Now we guarantee that all objects are flushed.
+ */
+ for_each_possible_cpu(cpu) {
+ krcp = per_cpu_ptr(&krc, cpu);
+
+ /*
+ * A monitor work can drain ready-to-reclaim objects
+ * directly. Wait for its completion if running or pending.
+ */
+ cancel_delayed_work_sync(&krcp->monitor_work);
+
+ for (i = 0; i < KFREE_N_BATCHES; i++) {
+ krwp = &(krcp->krw_arr[i]);
+ flush_rcu_work(&krwp->rcu_work);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
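
The intended caller is a module-exit path that has queued objects via the double-argument kvfree_rcu() into a cache it is about to destroy. A hedged sketch of that pattern; my_cache and the function names are placeholders:

    #include <linux/module.h>
    #include <linux/slab.h>
    #include <linux/rcupdate.h>

    static struct kmem_cache *my_cache;     /* placeholder cache */

    static void __exit my_module_exit(void)
    {
            /* all kvfree_rcu(obj, rcu) producers for my_cache have stopped by now */
            kvfree_rcu_barrier();           /* wait for queued double-argument frees */
            rcu_barrier();                  /* also wait for plain call_rcu() callbacks, if any */
            kmem_cache_destroy(my_cache);
    }
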
+
+static unsigned long
+kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu;
+ unsigned long count = 0;
+
+ /* Snapshot count of all CPUs */
+ for_each_possible_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ count += krc_count(krcp);
+ count += READ_ONCE(krcp->nr_bkv_objs);
+ atomic_set(&krcp->backoff_page_cache_fill, 1);
+ }
+
+ return count == 0 ? SHRINK_EMPTY : count;
+}
+
+static unsigned long
+kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu, freed = 0;
+
+ for_each_possible_cpu(cpu) {
+ int count;
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ count = krc_count(krcp);
+ count += drain_page_cache(krcp);
+ kfree_rcu_monitor(&krcp->monitor_work.work);
+
+ sc->nr_to_scan -= count;
+ freed += count;
+
+ if (sc->nr_to_scan <= 0)
+ break;
+ }
+
+ return freed == 0 ? SHRINK_STOP : freed;
+}
+
+void __init kvfree_rcu_init(void)
+{
+ int cpu;
+ int i, j;
+ struct shrinker *kfree_rcu_shrinker;
+
+ rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
+ WARN_ON(!rcu_reclaim_wq);
+
+ /* Clamp it to [0:100] seconds interval. */
+ if (rcu_delay_page_cache_fill_msec < 0 ||
+ rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
+
+ rcu_delay_page_cache_fill_msec =
+ clamp(rcu_delay_page_cache_fill_msec, 0,
+ (int) (100 * MSEC_PER_SEC));
+
+ pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
+ rcu_delay_page_cache_fill_msec);
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ for (i = 0; i < KFREE_N_BATCHES; i++) {
+ INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
+ krcp->krw_arr[i].krcp = krcp;
+
+ for (j = 0; j < FREE_N_CHANNELS; j++)
+ INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
+ }
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ INIT_LIST_HEAD(&krcp->bulk_head[i]);
+
+ INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+ INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
+ krcp->initialized = true;
+ }
+
+ kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu");
+ if (!kfree_rcu_shrinker) {
+ pr_err("Failed to allocate kfree_rcu() shrinker!\n");
+ return;
+ }
+
+ kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
+ kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
+
+ shrinker_register(kfree_rcu_shrinker);
+}
+
+#endif /* CONFIG_KVFREE_RCU_BATCHED */
+
diff --git a/mm/slub.c b/mm/slub.c
index 19980419b176..31e11ef256f9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -19,6 +19,7 @@
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
+#include <linux/vmalloc.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
@@ -1017,22 +1018,31 @@ void skip_orig_size_check(struct kmem_cache *s, const void *object)
set_orig_size(s, (void *)object, s->object_size);
}
-static void slab_bug(struct kmem_cache *s, char *fmt, ...)
+static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp)
{
struct va_format vaf;
va_list args;
- va_start(args, fmt);
+ va_copy(args, argsp);
vaf.fmt = fmt;
vaf.va = &args;
pr_err("=============================================================================\n");
- pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
+ pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf);
pr_err("-----------------------------------------------------------------------------\n\n");
va_end(args);
}
+static void slab_bug(struct kmem_cache *s, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ __slab_bug(s, fmt, args);
+ va_end(args);
+}
+
__printf(2, 3)
-static void slab_fix(struct kmem_cache *s, char *fmt, ...)
+static void slab_fix(struct kmem_cache *s, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -1085,19 +1095,19 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
/* Beginning of the filler is the free pointer */
print_section(KERN_ERR, "Padding ", p + off,
size_from_object(s) - off);
-
- dump_stack();
}
static void object_err(struct kmem_cache *s, struct slab *slab,
- u8 *object, char *reason)
+ u8 *object, const char *reason)
{
if (slab_add_kunit_errors())
return;
- slab_bug(s, "%s", reason);
+ slab_bug(s, reason);
print_trailer(s, slab, object);
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ WARN_ON(1);
}
static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
@@ -1114,22 +1124,30 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
return false;
}
+static void __slab_err(struct slab *slab)
+{
+ if (slab_in_kunit_test())
+ return;
+
+ print_slab_info(slab);
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ WARN_ON(1);
+}
+
static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
const char *fmt, ...)
{
va_list args;
- char buf[100];
if (slab_add_kunit_errors())
return;
va_start(args, fmt);
- vsnprintf(buf, sizeof(buf), fmt, args);
+ __slab_bug(s, fmt, args);
va_end(args);
- slab_bug(s, "%s", buf);
- print_slab_info(slab);
- dump_stack();
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ __slab_err(slab);
}
static void init_object(struct kmem_cache *s, void *object, u8 val)
@@ -1166,7 +1184,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
s->inuse - poison_size);
}
-static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
+static void restore_bytes(struct kmem_cache *s, const char *message, u8 data,
void *from, void *to)
{
slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
@@ -1181,8 +1199,8 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
static pad_check_attributes int
check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
- u8 *object, char *what,
- u8 *start, unsigned int value, unsigned int bytes)
+ u8 *object, const char *what, u8 *start, unsigned int value,
+ unsigned int bytes, bool slab_obj_print)
{
u8 *fault;
u8 *end;
@@ -1201,10 +1219,11 @@ check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
if (slab_add_kunit_errors())
goto skip_bug_print;
- slab_bug(s, "%s overwritten", what);
- pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
- fault, end - 1, fault - addr,
- fault[0], value);
+ pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
+ what, fault, end - 1, fault - addr, fault[0], value);
+
+ if (slab_obj_print)
+ object_err(s, slab, object, "Object corrupt");
skip_bug_print:
restore_bytes(s, what, value, fault, end);
@@ -1268,7 +1287,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
return 1;
return check_bytes_and_report(s, slab, p, "Object padding",
- p + off, POISON_INUSE, size_from_object(s) - off);
+ p + off, POISON_INUSE, size_from_object(s) - off, true);
}
/* Check the pad bytes at the end of a slab page */
@@ -1301,9 +1320,10 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab)
while (end > fault && end[-1] == POISON_INUSE)
end--;
- slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu",
- fault, end - 1, fault - start);
+ slab_bug(s, "Padding overwritten. 0x%p-0x%p @offset=%tu",
+ fault, end - 1, fault - start);
print_section(KERN_ERR, "Padding ", pad, remainder);
+ __slab_err(slab);
restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
}
@@ -1318,11 +1338,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (s->flags & SLAB_RED_ZONE) {
if (!check_bytes_and_report(s, slab, object, "Left Redzone",
- object - s->red_left_pad, val, s->red_left_pad))
+ object - s->red_left_pad, val, s->red_left_pad, ret))
ret = 0;
if (!check_bytes_and_report(s, slab, object, "Right Redzone",
- endobject, val, s->inuse - s->object_size))
+ endobject, val, s->inuse - s->object_size, ret))
ret = 0;
if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
@@ -1331,7 +1351,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (s->object_size > orig_size &&
!check_bytes_and_report(s, slab, object,
"kmalloc Redzone", p + orig_size,
- val, s->object_size - orig_size)) {
+ val, s->object_size - orig_size, ret)) {
ret = 0;
}
}
@@ -1339,7 +1359,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
if (!check_bytes_and_report(s, slab, p, "Alignment padding",
endobject, POISON_INUSE,
- s->inuse - s->object_size))
+ s->inuse - s->object_size, ret))
ret = 0;
}
}
@@ -1355,11 +1375,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (kasan_meta_size < s->object_size - 1 &&
!check_bytes_and_report(s, slab, p, "Poison",
p + kasan_meta_size, POISON_FREE,
- s->object_size - kasan_meta_size - 1))
+ s->object_size - kasan_meta_size - 1, ret))
ret = 0;
if (kasan_meta_size < s->object_size &&
!check_bytes_and_report(s, slab, p, "End Poison",
- p + s->object_size - 1, POISON_END, 1))
+ p + s->object_size - 1, POISON_END, 1, ret))
ret = 0;
}
/*
@@ -1385,11 +1405,6 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
ret = 0;
}
- if (!ret && !slab_in_kunit_test()) {
- print_trailer(s, slab, object);
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
- }
-
return ret;
}
@@ -1427,7 +1442,7 @@ static int check_slab(struct kmem_cache *s, struct slab *slab)
* Determine if a certain object in a slab is on the freelist. Must hold the
* slab lock to guarantee that the chains are in a consistent state.
*/
-static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
+static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
{
int nr = 0;
void *fp;
@@ -1437,26 +1452,34 @@ static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
fp = slab->freelist;
while (fp && nr <= slab->objects) {
if (fp == search)
- return 1;
+ return true;
if (!check_valid_pointer(s, slab, fp)) {
if (object) {
object_err(s, slab, object,
"Freechain corrupt");
set_freepointer(s, object, NULL);
+ break;
} else {
slab_err(s, slab, "Freepointer corrupt");
slab->freelist = NULL;
slab->inuse = slab->objects;
slab_fix(s, "Freelist cleared");
- return 0;
+ return false;
}
- break;
}
object = fp;
fp = get_freepointer(s, object);
nr++;
}
+ if (nr > slab->objects) {
+ slab_err(s, slab, "Freelist cycle detected");
+ slab->freelist = NULL;
+ slab->inuse = slab->objects;
+ slab_fix(s, "Freelist cleared");
+ return false;
+ }
+
max_objects = order_objects(slab_order(slab), s->size);
if (max_objects > MAX_OBJS_PER_PAGE)
max_objects = MAX_OBJS_PER_PAGE;
@@ -1624,12 +1647,12 @@ static inline int free_consistency_checks(struct kmem_cache *s,
slab_err(s, slab, "Attempt to free object(0x%p) outside of slab",
object);
} else if (!slab->slab_cache) {
- pr_err("SLUB <none>: no slab for object 0x%p.\n",
- object);
- dump_stack();
- } else
+ slab_err(NULL, slab, "No slab cache for object 0x%p",
+ object);
+ } else {
object_err(s, slab, object,
- "page slab pointer corrupt.");
+ "page slab pointer corrupt.");
+ }
return 0;
}
return 1;
@@ -1950,6 +1973,11 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \
__GFP_ACCOUNT | __GFP_NOFAIL)
+static inline void init_slab_obj_exts(struct slab *slab)
+{
+ slab->obj_exts = 0;
+}
+
int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
@@ -2020,20 +2048,12 @@ static inline void free_slab_obj_exts(struct slab *slab)
slab->obj_exts = 0;
}
-static inline bool need_slab_obj_ext(void)
-{
- if (mem_alloc_profiling_enabled())
- return true;
+#else /* CONFIG_SLAB_OBJ_EXT */
- /*
- * CONFIG_MEMCG creates vector of obj_cgroup objects conditionally
- * inside memcg_slab_post_alloc_hook. No other users for now.
- */
- return false;
+static inline void init_slab_obj_exts(struct slab *slab)
+{
}
-#else /* CONFIG_SLAB_OBJ_EXT */
-
static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
@@ -2044,11 +2064,6 @@ static inline void free_slab_obj_exts(struct slab *slab)
{
}
-static inline bool need_slab_obj_ext(void)
-{
- return false;
-}
-
#endif /* CONFIG_SLAB_OBJ_EXT */
#ifdef CONFIG_MEM_ALLOC_PROFILING
@@ -2069,41 +2084,46 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
slab = virt_to_slab(p);
if (!slab_obj_exts(slab) &&
- WARN(alloc_slab_obj_exts(slab, s, flags, false),
- "%s, %s: Failed to create slab extension vector!\n",
- __func__, s->name))
+ alloc_slab_obj_exts(slab, s, flags, false)) {
+ pr_warn_once("%s, %s: Failed to create slab extension vector!\n",
+ __func__, s->name);
return NULL;
+ }
return slab_obj_exts(slab) + obj_to_index(s, slab, p);
}
-static inline void
-alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline void
+__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
{
- if (need_slab_obj_ext()) {
- struct slabobj_ext *obj_exts;
+ struct slabobj_ext *obj_exts;
- obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
- /*
- * Currently obj_exts is used only for allocation profiling.
- * If other users appear then mem_alloc_profiling_enabled()
- * check should be added before alloc_tag_add().
- */
- if (likely(obj_exts))
- alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
- }
+ obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
+ /*
+ * Currently obj_exts is used only for allocation profiling.
+ * If other users appear then mem_alloc_profiling_enabled()
+ * check should be added before alloc_tag_add().
+ */
+ if (likely(obj_exts))
+ alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
}
static inline void
-alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
- int objects)
+alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
+{
+ if (mem_alloc_profiling_enabled())
+ __alloc_tagging_slab_alloc_hook(s, object, flags);
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline void
+__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
+ int objects)
{
struct slabobj_ext *obj_exts;
int i;
- if (!mem_alloc_profiling_enabled())
- return;
-
/* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
return;
@@ -2119,6 +2139,14 @@ alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
}
}
+static inline void
+alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
+ int objects)
+{
+ if (mem_alloc_profiling_enabled())
+ __alloc_tagging_slab_free_hook(s, slab, p, objects);
+}
+
#else /* CONFIG_MEM_ALLOC_PROFILING */
static inline void
@@ -2189,9 +2217,24 @@ bool memcg_slab_post_charge(void *p, gfp_t flags)
folio = virt_to_folio(p);
if (!folio_test_slab(folio)) {
- return folio_memcg_kmem(folio) ||
- (__memcg_kmem_charge_page(folio_page(folio, 0), flags,
- folio_order(folio)) == 0);
+ int size;
+
+ if (folio_memcg_kmem(folio))
+ return true;
+
+ if (__memcg_kmem_charge_page(folio_page(folio, 0), flags,
+ folio_order(folio)))
+ return false;
+
+ /*
+ * This folio has already been accounted in the global stats but
+ * not in the memcg stats. So, subtract from the global and use
+ * the interface which adds to both global and memcg stats.
+ */
+ size = folio_size(folio);
+ node_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -size);
+ lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, size);
+ return true;
}
slab = folio_slab(folio);
@@ -2296,7 +2339,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init,
* We have to do this manually because the rcu_head is
* not located inside the object.
*/
- kasan_record_aux_stack_noalloc(x);
+ kasan_record_aux_stack(x);
delayed_free->object = x;
call_rcu(&delayed_free->head, slab_free_after_rcu_debug);
@@ -2405,17 +2448,15 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node,
unsigned int order = oo_order(oo);
if (node == NUMA_NO_NODE)
- folio = (struct folio *)alloc_pages(flags, order);
+ folio = (struct folio *)alloc_frozen_pages(flags, order);
else
- folio = (struct folio *)__alloc_pages_node(node, flags, order);
+ folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL);
if (!folio)
return NULL;
slab = folio_slab(folio);
__folio_set_slab(folio);
- /* Make the flag visible before any changes to folio->mapping */
- smp_wmb();
if (folio_is_pfmemalloc(folio))
slab_set_pfmemalloc(slab);
@@ -2543,8 +2584,12 @@ static __always_inline void account_slab(struct slab *slab, int order,
static __always_inline void unaccount_slab(struct slab *slab, int order,
struct kmem_cache *s)
{
- if (memcg_kmem_online() || need_slab_obj_ext())
- free_slab_obj_exts(slab);
+ /*
+	 * The slab object extensions should now be freed regardless of
+	 * whether mem_alloc_profiling_enabled() is true, because profiling
+	 * might have been disabled after slab->obj_exts was allocated.
+ */
+ free_slab_obj_exts(slab);
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
-(PAGE_SIZE << order));
@@ -2588,6 +2633,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
slab->objects = oo_objects(oo);
slab->inuse = 0;
slab->frozen = 0;
+ init_slab_obj_exts(slab);
account_slab(slab, oo_order(oo), s, flags);
@@ -2636,12 +2682,10 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab)
__slab_clear_pfmemalloc(slab);
folio->mapping = NULL;
- /* Make the mapping reset visible before clearing the flag */
- smp_wmb();
__folio_clear_slab(folio);
mm_account_reclaimed_pages(pages);
unaccount_slab(slab, order, s);
- __free_pages(&folio->page, order);
+ free_frozen_pages(&folio->page, order);
}
static void rcu_free_slab(struct rcu_head *h)
@@ -4230,6 +4274,7 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
ptr = folio_address(folio);
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
PAGE_SIZE << order);
+ __folio_set_large_kmalloc(folio);
}
ptr = kasan_kmalloc_large(ptr, size, flags);
@@ -4705,6 +4750,11 @@ static void free_large_kmalloc(struct folio *folio, void *object)
{
unsigned int order = folio_order(folio);
+ if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) {
+ dump_page(&folio->page, "Not a kmalloc allocation");
+ return;
+ }
+
if (WARN_ON_ONCE(order == 0))
pr_warn_once("object pointer: 0x%p\n", object);
@@ -4714,9 +4764,55 @@ static void free_large_kmalloc(struct folio *folio, void *object)
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
-(PAGE_SIZE << order));
+ __folio_clear_large_kmalloc(folio);
folio_put(folio);
}
+/*
+ * Given an rcu_head embedded within an object obtained from kvmalloc at an
+ * offset < 4k, free the object in question.
+ */
+void kvfree_rcu_cb(struct rcu_head *head)
+{
+ void *obj = head;
+ struct folio *folio;
+ struct slab *slab;
+ struct kmem_cache *s;
+ void *slab_addr;
+
+ if (is_vmalloc_addr(obj)) {
+ obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
+ vfree(obj);
+ return;
+ }
+
+ folio = virt_to_folio(obj);
+ if (!folio_test_slab(folio)) {
+ /*
+ * rcu_head offset can be only less than page size so no need to
+ * consider folio order
+ */
+ obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
+ free_large_kmalloc(folio, obj);
+ return;
+ }
+
+ slab = folio_slab(folio);
+ s = slab->slab_cache;
+ slab_addr = folio_address(folio);
+
+ if (is_kfence_address(obj)) {
+ obj = kfence_object_start(obj);
+ } else {
+ unsigned int idx = __obj_to_index(s, slab_addr, obj);
+
+ obj = slab_addr + s->size * idx;
+ obj = fixup_red_left(s, obj);
+ }
+
+ slab_free(s, slab, obj, _RET_IP_);
+}
+
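kvfree_rcu_cb() above leans on the fact that an rcu_head handed to kvfree_rcu() sits less than a page into its enclosing object, so for the vmalloc and large-kmalloc cases rounding the pointer down to a page boundary recovers the start of the allocation. A minimal stand-alone sketch of that arithmetic (plain user-space C with 4 KiB pages assumed, not kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE		4096UL
#define PAGE_ALIGN_DOWN(x)	((x) & ~(PAGE_SIZE - 1))

int main(void)
{
	/* Pretend the allocation starts at a page-aligned address. */
	uintptr_t obj_start = 0x7f0000010000UL;
	/* The rcu_head is embedded at some offset smaller than a page. */
	uintptr_t rcu_head = obj_start + 0x148;

	/* Rounding the rcu_head address down recovers the object start. */
	assert(PAGE_ALIGN_DOWN(rcu_head) == obj_start);
	printf("recovered object at %#lx\n",
	       (unsigned long)PAGE_ALIGN_DOWN(rcu_head));
	return 0;
}

The slab branch cannot use this trick because slab objects are not page aligned; it instead derives the object start from the object index within the slab, as the code above does.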
/**
* kfree - free previously allocated memory
* @object: pointer returned by kmalloc() or kmem_cache_alloc()
@@ -4867,6 +4963,170 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
}
EXPORT_SYMBOL(krealloc_noprof);
+static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
+{
+ /*
+	 * We want to attempt a large physically contiguous block first because
+	 * it is less likely to fragment multiple larger blocks, and therefore
+	 * contributes less to long-term fragmentation than the vmalloc fallback.
+	 * However, make sure that larger requests are not too disruptive - i.e.
+	 * do not trigger direct reclaim unless physically contiguous memory is
+	 * preferred (__GFP_RETRY_MAYFAIL mode). We still kick kswapd/kcompactd
+	 * so they start working in the background.
+ */
+ if (size > PAGE_SIZE) {
+ flags |= __GFP_NOWARN;
+
+ if (!(flags & __GFP_RETRY_MAYFAIL))
+ flags &= ~__GFP_DIRECT_RECLAIM;
+
+ /* nofail semantic is implemented by the vmalloc fallback */
+ flags &= ~__GFP_NOFAIL;
+ }
+
+ return flags;
+}
+
+/**
+ * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
+ * failure, fall back to non-contiguous (vmalloc) allocation.
+ * @size: size of the request.
+ * @b: which set of kmalloc buckets to allocate from.
+ * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
+ * @node: numa node to allocate from
+ *
+ * Uses kmalloc to get the memory but if the allocation fails then falls back
+ * to the vmalloc allocator. Use kvfree for freeing the memory.
+ *
+ * GFP_NOWAIT and GFP_ATOMIC are not supported, nor is the __GFP_NORETRY modifier.
+ * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
+ * preferable to the vmalloc fallback, due to visible performance drawbacks.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of failure
+ */
+void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
+{
+ void *ret;
+
+ /*
+	 * It doesn't really make sense to fall back to vmalloc for sub-page
+	 * requests
+ */
+ ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b),
+ kmalloc_gfp_adjust(flags, size),
+ node, _RET_IP_);
+ if (ret || size <= PAGE_SIZE)
+ return ret;
+
+ /* non-sleeping allocations are not supported by vmalloc */
+ if (!gfpflags_allow_blocking(flags))
+ return NULL;
+
+ /* Don't even allow crazy sizes */
+ if (unlikely(size > INT_MAX)) {
+ WARN_ON_ONCE(!(flags & __GFP_NOWARN));
+ return NULL;
+ }
+
+ /*
+ * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
+ * since the callers already cannot assume anything
+ * about the resulting pointer, and cannot play
+ * protection games.
+ */
+ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
+ flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ node, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(__kvmalloc_node_noprof);
+
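For context, a typical caller of the kvmalloc()/kvfree() pair built on top of the function above might look like the sketch below. The structure and helpers are hypothetical and only assume <linux/slab.h> and <linux/overflow.h>; the point is that the caller neither knows nor cares whether the memory ended up physically contiguous:

struct foo_table {
	size_t nr;
	unsigned long entries[];
};

static struct foo_table *foo_table_alloc(size_t nr)
{
	struct foo_table *t;

	/* Contiguous when cheap, transparently vmalloc-backed otherwise. */
	t = kvmalloc(struct_size(t, entries, nr), GFP_KERNEL);
	if (!t)
		return NULL;
	t->nr = nr;
	return t;
}

static void foo_table_free(struct foo_table *t)
{
	kvfree(t);	/* correct for both kmalloc and vmalloc backing */
}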
+/**
+ * kvfree() - Free memory.
+ * @addr: Pointer to allocated memory.
+ *
+ * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
+ * It is slightly more efficient to use kfree() or vfree() if you are certain
+ * that you know which one to use.
+ *
+ * Context: Either preemptible task context or not-NMI interrupt.
+ */
+void kvfree(const void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ vfree(addr);
+ else
+ kfree(addr);
+}
+EXPORT_SYMBOL(kvfree);
+
+/**
+ * kvfree_sensitive - Free a data object containing sensitive information.
+ * @addr: address of the data object to be freed.
+ * @len: length of the data object.
+ *
+ * Use the special memzero_explicit() function to clear the content of a
+ * kvmalloc'ed object containing sensitive data to make sure that the
+ * compiler won't optimize out the data clearing.
+ */
+void kvfree_sensitive(const void *addr, size_t len)
+{
+ if (likely(!ZERO_OR_NULL_PTR(addr))) {
+ memzero_explicit((void *)addr, len);
+ kvfree(addr);
+ }
+}
+EXPORT_SYMBOL(kvfree_sensitive);
+
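An illustrative use of kvfree_sensitive() is a temporary key or secret buffer that must be wiped before the memory is returned; the buffer and length names below are made up:

	u8 *key;

	key = kvmalloc(key_len, GFP_KERNEL);
	if (!key)
		return -ENOMEM;
	/* ... derive and use the key ... */
	kvfree_sensitive(key, key_len);	/* memzero_explicit() + kvfree() */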
+/**
+ * kvrealloc - reallocate memory; contents remain unchanged
+ * @p: object to reallocate memory for
+ * @size: the size to reallocate
+ * @flags: the flags for the page level allocator
+ *
+ * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
+ * and @p is not a %NULL pointer, the object pointed to is freed.
+ *
+ * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
+ * initial memory allocation, every subsequent call to this API for the same
+ * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
+ * __GFP_ZERO is not fully honored by this API.
+ *
+ * In any case, the contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.
+ *
+ * This function must not be called concurrently with itself or kvfree() for the
+ * same memory allocation.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
+ */
+void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
+{
+ void *n;
+
+ if (is_vmalloc_addr(p))
+ return vrealloc_noprof(p, size, flags);
+
+ n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size));
+ if (!n) {
+ /* We failed to krealloc(), fall back to kvmalloc(). */
+ n = kvmalloc_noprof(size, flags);
+ if (!n)
+ return NULL;
+
+ if (p) {
+ /* We already know that `p` is not a vmalloc address. */
+ kasan_disable_current();
+ memcpy(n, kasan_reset_tag(p), ksize(p));
+ kasan_enable_current();
+
+ kfree(p);
+ }
+ }
+
+ return n;
+}
+EXPORT_SYMBOL(kvrealloc_noprof);
+
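The net effect for callers is a realloc that first tries krealloc() with the adjusted flags and only falls back to a fresh kvmalloc() plus copy when that fails, so the old buffer stays valid on failure. A hypothetical growth helper (struct and field names invented for illustration):

static int foo_buf_grow(struct foo_buf *b, size_t new_cap)
{
	void *p;

	p = kvrealloc(b->data, new_cap, GFP_KERNEL);
	if (!p)
		return -ENOMEM;	/* b->data is still valid here */

	b->data = p;
	b->cap = new_cap;
	return 0;
}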
struct detached_freelist {
struct slab *slab;
void *tail;
@@ -5559,14 +5819,14 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
return !!oo_objects(s->oo);
}
-static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
- const char *text)
+static void list_slab_objects(struct kmem_cache *s, struct slab *slab)
{
#ifdef CONFIG_SLUB_DEBUG
void *addr = slab_address(slab);
void *p;
- slab_err(s, slab, text, s->name);
+ if (!slab_add_kunit_errors())
+ slab_bug(s, "Objects remaining on __kmem_cache_shutdown()");
spin_lock(&object_map_lock);
__fill_map(object_map, s, slab);
@@ -5581,6 +5841,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
}
}
spin_unlock(&object_map_lock);
+
+ __slab_err(slab);
#endif
}
@@ -5601,8 +5863,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
remove_partial(n, slab);
list_add(&slab->slab_list, &discard);
} else {
- list_slab_objects(s, slab,
- "Objects remaining in %s on __kmem_cache_shutdown()");
+ list_slab_objects(s, slab);
}
}
spin_unlock_irq(&n->list_lock);
@@ -7498,10 +7759,7 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep)
return -ENOMEM;
}
- if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
- alloc = TRACK_ALLOC;
- else
- alloc = TRACK_FREE;
+ alloc = debugfs_get_aux_num(filep);
if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
bitmap_free(obj_map);
@@ -7557,11 +7815,11 @@ static void debugfs_slab_add(struct kmem_cache *s)
slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
- debugfs_create_file("alloc_traces", 0400,
- slab_cache_dir, s, &slab_debugfs_fops);
+ debugfs_create_file_aux_num("alloc_traces", 0400, slab_cache_dir, s,
+ TRACK_ALLOC, &slab_debugfs_fops);
- debugfs_create_file("free_traces", 0400,
- slab_cache_dir, s, &slab_debugfs_fops);
+ debugfs_create_file_aux_num("free_traces", 0400, slab_cache_dir, s,
+ TRACK_FREE, &slab_debugfs_fops);
}
void debugfs_slab_release(struct kmem_cache *s)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index cec67c5f37d8..fd2ab5118e13 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -30,6 +30,17 @@
#include <asm/dma.h>
#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+#include "hugetlb_vmemmap.h"
+
+/*
+ * Flags for vmemmap_populate_range and friends.
+ */
+/* Get a ref on the head page struct page, for ZONE_DEVICE compound pages */
+#define VMEMMAP_POPULATE_PAGEREF 0x0001
+
+#include "internal.h"
/*
* Allocate a block of memory to be used to back the virtual memory map
@@ -42,8 +53,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node,
unsigned long align,
unsigned long goal)
{
- return memblock_alloc_try_nid_raw(size, align, goal,
- MEMBLOCK_ALLOC_ACCESSIBLE, node);
+ return memmap_alloc(size, align, goal, node, false);
}
void * __meminit vmemmap_alloc_block(unsigned long size, int node)
@@ -143,17 +153,18 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
struct vmem_altmap *altmap,
- struct page *reuse)
+ unsigned long ptpfn, unsigned long flags)
{
pte_t *pte = pte_offset_kernel(pmd, addr);
if (pte_none(ptep_get(pte))) {
pte_t entry;
void *p;
- if (!reuse) {
+ if (ptpfn == (unsigned long)-1) {
p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
if (!p)
return NULL;
+ ptpfn = PHYS_PFN(__pa(p));
} else {
/*
* When a PTE/PMD entry is freed from the init_mm
@@ -164,10 +175,10 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
* and through vmemmap_populate_compound_pages() when
* slab is available.
*/
- get_page(reuse);
- p = page_to_virt(reuse);
+ if (flags & VMEMMAP_POPULATE_PAGEREF)
+ get_page(pfn_to_page(ptpfn));
}
- entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ entry = pfn_pte(ptpfn, PAGE_KERNEL);
set_pte_at(&init_mm, addr, pte, entry);
}
return pte;
@@ -237,7 +248,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
struct vmem_altmap *altmap,
- struct page *reuse)
+ unsigned long ptpfn,
+ unsigned long flags)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -257,7 +269,7 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
pmd = vmemmap_pmd_populate(pud, addr, node);
if (!pmd)
return NULL;
- pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
+ pte = vmemmap_pte_populate(pmd, addr, node, altmap, ptpfn, flags);
if (!pte)
return NULL;
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
@@ -268,13 +280,15 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
static int __meminit vmemmap_populate_range(unsigned long start,
unsigned long end, int node,
struct vmem_altmap *altmap,
- struct page *reuse)
+ unsigned long ptpfn,
+ unsigned long flags)
{
unsigned long addr = start;
pte_t *pte;
for (; addr < end; addr += PAGE_SIZE) {
- pte = vmemmap_populate_address(addr, node, altmap, reuse);
+ pte = vmemmap_populate_address(addr, node, altmap,
+ ptpfn, flags);
if (!pte)
return -ENOMEM;
}
@@ -285,7 +299,107 @@ static int __meminit vmemmap_populate_range(unsigned long start,
int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
int node, struct vmem_altmap *altmap)
{
- return vmemmap_populate_range(start, end, node, altmap, NULL);
+ return vmemmap_populate_range(start, end, node, altmap, -1, 0);
+}
+
+/*
+ * Undo vmemmap_populate_hvo(), and replace it with a normal base page mapping.
+ * Used in memory init in case an HVO mapping needs to be undone.
+ *
+ * This can happen when it is discovered that a memblock allocated
+ * hugetlb page spans multiple zones, which can only be verified
+ * after zones have been initialized.
+ *
+ * We know that:
+ * 1) The first @headsize / PAGE_SIZE vmemmap pages were individually
+ * allocated through memblock, and mapped.
+ *
+ * 2) The rest of the vmemmap pages are mirrors of the last head page.
+ */
+int __meminit vmemmap_undo_hvo(unsigned long addr, unsigned long end,
+ int node, unsigned long headsize)
+{
+ unsigned long maddr, pfn;
+ pte_t *pte;
+ int headpages;
+
+ /*
+ * Should only be called early in boot, so nothing will
+ * be accessing these page structures.
+ */
+ WARN_ON(!early_boot_irqs_disabled);
+
+ headpages = headsize >> PAGE_SHIFT;
+
+ /*
+ * Clear mirrored mappings for tail page structs.
+ */
+ for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) {
+ pte = virt_to_kpte(maddr);
+ pte_clear(&init_mm, maddr, pte);
+ }
+
+ /*
+ * Clear and free mappings for head page and first tail page
+ * structs.
+ */
+ for (maddr = addr; headpages-- > 0; maddr += PAGE_SIZE) {
+ pte = virt_to_kpte(maddr);
+ pfn = pte_pfn(ptep_get(pte));
+ pte_clear(&init_mm, maddr, pte);
+ memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE);
+ }
+
+ flush_tlb_kernel_range(addr, end);
+
+ return vmemmap_populate(addr, end, node, NULL);
+}
+
+/*
+ * Write protect the mirrored tail page structs for HVO. This will be
+ * called from the hugetlb code when gathering and initializing the
+ * memblock allocated gigantic pages. The write protect can't be
+ * done earlier, since it can't be guaranteed that the reserved
+ * page structures will not be written to during initialization,
+ * even if CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled.
+ *
+ * The PTEs are known to exist, and nothing else should be touching
+ * these pages. The caller is responsible for any TLB flushing.
+ */
+void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end,
+ int node, unsigned long headsize)
+{
+ unsigned long maddr;
+ pte_t *pte;
+
+ for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) {
+ pte = virt_to_kpte(maddr);
+ ptep_set_wrprotect(&init_mm, maddr, pte);
+ }
+}
+
+/*
+ * Populate vmemmap pages HVO-style. The first @headsize / PAGE_SIZE pages
+ * contain the head page and the needed tail page structs; the remaining
+ * vmemmap pages are mirrors of the last of those head pages.
+ */
+int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
+ int node, unsigned long headsize)
+{
+ pte_t *pte;
+ unsigned long maddr;
+
+ for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) {
+ pte = vmemmap_populate_address(maddr, node, NULL, -1, 0);
+ if (!pte)
+ return -ENOMEM;
+ }
+
+ /*
+ * Reuse the last page struct page mapped above for the rest.
+ */
+ return vmemmap_populate_range(maddr, end, node, NULL,
+ pte_pfn(ptep_get(pte)), 0);
}
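As a worked example of what the HVO population above saves (assuming 4 KiB base pages and a 64-byte struct page): a 2 MiB hugetlb folio has 512 struct pages, i.e. 512 * 64 = 32 KiB = 8 vmemmap pages. With headsize == PAGE_SIZE, the loop populates only that one head page, and the remaining 7 vmemmap pages are mapped to the same physical page (write-protected later by vmemmap_wrprotect_hvo()), so 7 of the 8 vmemmap pages per 2 MiB folio are never allocated.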
void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
@@ -408,7 +522,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
* with just tail struct pages.
*/
return vmemmap_populate_range(start, end, node, NULL,
- pte_page(ptep_get(pte)));
+ pte_pfn(ptep_get(pte)),
+ VMEMMAP_POPULATE_PAGEREF);
}
size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
@@ -416,13 +531,13 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
unsigned long next, last = addr + size;
/* Populate the head page vmemmap page */
- pte = vmemmap_populate_address(addr, node, NULL, NULL);
+ pte = vmemmap_populate_address(addr, node, NULL, -1, 0);
if (!pte)
return -ENOMEM;
/* Populate the tail pages vmemmap page */
next = addr + PAGE_SIZE;
- pte = vmemmap_populate_address(next, node, NULL, NULL);
+ pte = vmemmap_populate_address(next, node, NULL, -1, 0);
if (!pte)
return -ENOMEM;
@@ -432,7 +547,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
*/
next += PAGE_SIZE;
rc = vmemmap_populate_range(next, last, node, NULL,
- pte_page(ptep_get(pte)));
+ pte_pfn(ptep_get(pte)),
+ VMEMMAP_POPULATE_PAGEREF);
if (rc)
return -ENOMEM;
}
@@ -469,3 +585,28 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
return pfn_to_page(pfn);
}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
+/*
+ * This is called just before initializing sections for a NUMA node.
+ * Any special initialization that needs to be done before the
+ * generic initialization can be done from here. Sections that
+ * are initialized in hooks called from here will be skipped by
+ * the generic initialization.
+ */
+void __init sparse_vmemmap_init_nid_early(int nid)
+{
+ hugetlb_vmemmap_init_early(nid);
+}
+
+/*
+ * This is called just before the initialization of page structures
+ * through memmap_init. Zones are now initialized, so any work that
+ * needs to be done that needs zone information can be done from
+ * here.
+ */
+void __init sparse_vmemmap_init_nid_late(int nid)
+{
+ hugetlb_vmemmap_init_late(nid);
+}
+#endif
diff --git a/mm/sparse.c b/mm/sparse.c
index 13b6624d3562..3c012cf83cc2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -170,11 +170,6 @@ static void __section_mark_present(struct mem_section *ms,
ms->section_mem_map |= SECTION_MARKED_PRESENT;
}
-#define for_each_present_section_nr(start, section_nr) \
- for (section_nr = next_present_section_nr(start-1); \
- section_nr != -1; \
- section_nr = next_present_section_nr(section_nr))
-
static inline unsigned long first_present_section_nr(void)
{
return next_present_section_nr(-1);
@@ -257,10 +252,7 @@ static void __init memblocks_present(void)
size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
align = 1 << (INTERNODE_CACHE_SHIFT);
- mem_section = memblock_alloc(size, align);
- if (!mem_section)
- panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
- __func__, size, align);
+ mem_section = memblock_alloc_or_panic(size, align);
}
#endif
@@ -411,13 +403,13 @@ static void __init check_usemap_section_nr(int nid,
#endif /* CONFIG_MEMORY_HOTREMOVE */
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static unsigned long __init section_map_size(void)
+unsigned long __init section_map_size(void)
{
return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
}
#else
-static unsigned long __init section_map_size(void)
+unsigned long __init section_map_size(void)
{
return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}
@@ -498,6 +490,44 @@ void __weak __meminit vmemmap_populate_print_last(void)
{
}
+static void *sparse_usagebuf __meminitdata;
+static void *sparse_usagebuf_end __meminitdata;
+
+/*
+ * Helper function that is used for generic section initialization, and
+ * can also be used by any hooks added above.
+ */
+void __init sparse_init_early_section(int nid, struct page *map,
+ unsigned long pnum, unsigned long flags)
+{
+ BUG_ON(!sparse_usagebuf || sparse_usagebuf >= sparse_usagebuf_end);
+ check_usemap_section_nr(nid, sparse_usagebuf);
+ sparse_init_one_section(__nr_to_section(pnum), pnum, map,
+ sparse_usagebuf, SECTION_IS_EARLY | flags);
+ sparse_usagebuf = (void *)sparse_usagebuf + mem_section_usage_size();
+}
+
+static int __init sparse_usage_init(int nid, unsigned long map_count)
+{
+ unsigned long size;
+
+ size = mem_section_usage_size() * map_count;
+ sparse_usagebuf = sparse_early_usemaps_alloc_pgdat_section(
+ NODE_DATA(nid), size);
+ if (!sparse_usagebuf) {
+ sparse_usagebuf_end = NULL;
+ return -ENOMEM;
+ }
+
+ sparse_usagebuf_end = sparse_usagebuf + size;
+ return 0;
+}
+
+static void __init sparse_usage_fini(void)
+{
+ sparse_usagebuf = sparse_usagebuf_end = NULL;
+}
+
/*
* Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
* And number of present sections in this node is map_count.
@@ -506,47 +536,54 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
unsigned long pnum_end,
unsigned long map_count)
{
- struct mem_section_usage *usage;
unsigned long pnum;
struct page *map;
+ struct mem_section *ms;
- usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
- mem_section_usage_size() * map_count);
- if (!usage) {
+ if (sparse_usage_init(nid, map_count)) {
pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
goto failed;
}
+
sparse_buffer_init(map_count * section_map_size(), nid);
+
+ sparse_vmemmap_init_nid_early(nid);
+
for_each_present_section_nr(pnum_begin, pnum) {
unsigned long pfn = section_nr_to_pfn(pnum);
if (pnum >= pnum_end)
break;
- map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
- nid, NULL, NULL);
- if (!map) {
- pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
- __func__, nid);
- pnum_begin = pnum;
- sparse_buffer_fini();
- goto failed;
+ ms = __nr_to_section(pnum);
+ if (!preinited_vmemmap_section(ms)) {
+ map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
+ nid, NULL, NULL);
+ if (!map) {
+ pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
+ __func__, nid);
+ pnum_begin = pnum;
+ sparse_usage_fini();
+ sparse_buffer_fini();
+ goto failed;
+ }
+ sparse_init_early_section(nid, map, pnum, 0);
}
- check_usemap_section_nr(nid, usage);
- sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
- SECTION_IS_EARLY);
- usage = (void *) usage + mem_section_usage_size();
}
+ sparse_usage_fini();
sparse_buffer_fini();
return;
failed:
- /* We failed to allocate, mark all the following pnums as not present */
+ /*
+ * We failed to allocate, mark all the following pnums as not present,
+ * except the ones already initialized earlier.
+ */
for_each_present_section_nr(pnum_begin, pnum) {
- struct mem_section *ms;
-
if (pnum >= pnum_end)
break;
ms = __nr_to_section(pnum);
+ if (!preinited_vmemmap_section(ms))
+ ms->section_mem_map = 0;
-		ms->section_mem_map = 0;
}
}
diff --git a/mm/swap.c b/mm/swap.c
index 10decd9dffa1..4fc322f7111a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -45,7 +45,7 @@
/* How many pages do we try to swap or page in/out together? As a power of 2 */
int page_cluster;
-const int page_cluster_max = 31;
+static const int page_cluster_max = 31;
struct cpu_fbatches {
/*
@@ -109,7 +109,7 @@ void __folio_put(struct folio *folio)
page_cache_release(folio);
folio_unqueue_deferred_split(folio);
mem_cgroup_uncharge(folio);
- free_unref_page(&folio->page, folio_order(folio));
+ free_frozen_pages(&folio->page, folio_order(folio));
}
EXPORT_SYMBOL(__folio_put);
@@ -309,7 +309,7 @@ static void lru_activate(struct lruvec *lruvec, struct folio *folio)
trace_mm_lru_activate(folio);
__count_vm_events(PGACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages);
+ count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages);
}
#ifdef CONFIG_SMP
@@ -379,37 +379,58 @@ static void __lru_cache_activate_folio(struct folio *folio)
}
#ifdef CONFIG_LRU_GEN
-static void folio_inc_refs(struct folio *folio)
+
+static void lru_gen_inc_refs(struct folio *folio)
{
unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
if (folio_test_unevictable(folio))
return;
+ /* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio)) {
- folio_set_referenced(folio);
- return;
- }
-
- if (!folio_test_workingset(folio)) {
- folio_set_workingset(folio);
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
return;
}
- /* see the comment on MAX_NR_TIERS */
do {
- new_flags = old_flags & LRU_REFS_MASK;
- if (new_flags == LRU_REFS_MASK)
- break;
+ if ((old_flags & LRU_REFS_MASK) == LRU_REFS_MASK) {
+ if (!folio_test_workingset(folio))
+ folio_set_workingset(folio);
+ return;
+ }
- new_flags += BIT(LRU_REFS_PGOFF);
- new_flags |= old_flags & ~LRU_REFS_MASK;
+ new_flags = old_flags + BIT(LRU_REFS_PGOFF);
} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
}
-#else
-static void folio_inc_refs(struct folio *folio)
+
+static bool lru_gen_clear_refs(struct folio *folio)
+{
+ struct lru_gen_folio *lrugen;
+ int gen = folio_lru_gen(folio);
+ int type = folio_is_file_lru(folio);
+
+ if (gen < 0)
+ return true;
+
+ set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0);
+
+ lrugen = &folio_lruvec(folio)->lrugen;
+	/* whether this can be done without shuffling under the LRU lock */
+ return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type]));
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static void lru_gen_inc_refs(struct folio *folio)
+{
+}
+
+static bool lru_gen_clear_refs(struct folio *folio)
{
+ return false;
}
+
#endif /* CONFIG_LRU_GEN */
/**
@@ -427,8 +448,10 @@ static void folio_inc_refs(struct folio *folio)
*/
void folio_mark_accessed(struct folio *folio)
{
+ if (folio_test_dropbehind(folio))
+ return;
if (lru_gen_enabled()) {
- folio_inc_refs(folio);
+ lru_gen_inc_refs(folio);
return;
}
@@ -474,7 +497,7 @@ void folio_add_lru(struct folio *folio)
folio_test_unevictable(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- /* see the comment in lru_gen_add_folio() */
+ /* see the comment in lru_gen_folio_seq() */
if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
folio_set_active(folio);
@@ -524,7 +547,7 @@ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma)
*/
static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio)
{
- bool active = folio_test_active(folio);
+ bool active = folio_test_active(folio) || lru_gen_enabled();
long nr_pages = folio_nr_pages(folio);
if (folio_test_unevictable(folio))
@@ -558,7 +581,7 @@ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio)
if (active) {
__count_vm_events(PGDEACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
nr_pages);
}
}
@@ -576,7 +599,7 @@ static void lru_deactivate(struct lruvec *lruvec, struct folio *folio)
lruvec_add_folio(lruvec, folio);
__count_vm_events(PGDEACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
}
static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
@@ -589,7 +612,10 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
lruvec_del_folio(lruvec, folio);
folio_clear_active(folio);
- folio_clear_referenced(folio);
+ if (lru_gen_enabled())
+ lru_gen_clear_refs(folio);
+ else
+ folio_clear_referenced(folio);
/*
* Lazyfree folios are clean anonymous folios. They have
* the swapbacked flag cleared, to distinguish them from normal
@@ -599,7 +625,7 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
lruvec_add_folio(lruvec, folio);
__count_vm_events(PGLAZYFREE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages);
+ count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages);
}
/*
@@ -657,6 +683,9 @@ void deactivate_file_folio(struct folio *folio)
if (folio_test_unevictable(folio))
return;
+ if (lru_gen_enabled() && lru_gen_clear_refs(folio))
+ return;
+
folio_batch_add_and_move(folio, lru_deactivate_file, true);
}
@@ -670,7 +699,10 @@ void deactivate_file_folio(struct folio *folio)
*/
void folio_deactivate(struct folio *folio)
{
- if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled()))
+ if (folio_test_unevictable(folio))
+ return;
+
+ if (lru_gen_enabled() ? lru_gen_clear_refs(folio) : !folio_test_active(folio))
return;
folio_batch_add_and_move(folio, lru_deactivate, true);
@@ -924,8 +956,6 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
- if (put_devmap_managed_folio_refs(folio, nr_refs))
- continue;
if (folio_ref_sub_and_test(folio, nr_refs))
free_zone_device_folio(folio);
continue;
@@ -1044,6 +1074,18 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
fbatch->nr = j;
}
+static const struct ctl_table swap_sysctl_table[] = {
+ {
+ .procname = "page-cluster",
+ .data = &page_cluster,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&page_cluster_max,
+ }
+};
+
/*
* Perform any setup for the swap system
*/
@@ -1060,4 +1102,6 @@ void __init swap_setup(void)
* Right now other parts of the system means that we
* _really_ don't want to cluster much more
*/
+
+ register_sysctl_init("vm", swap_sysctl_table);
}
diff --git a/mm/swap.h b/mm/swap.h
index ad2f121de970..9096082a915e 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -3,6 +3,7 @@
#define _MM_SWAP_H
struct mempolicy;
+extern int page_cluster;
#ifdef CONFIG_SWAP
#include <linux/swapops.h> /* for swp_offset */
@@ -19,7 +20,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
__swap_read_unplug(plug);
}
void swap_write_unplug(struct swap_iocb *sio);
-int swap_writepage(struct page *page, struct writeback_control *wbc);
+int swap_writeout(struct folio *folio, struct writeback_control *wbc);
void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
/* linux/mm/swap_state.c */
@@ -50,7 +51,6 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry)
}
void show_swap_cache_info(void);
-bool add_to_swap(struct folio *folio);
void *get_shadow_from_swap_cache(swp_entry_t entry);
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
gfp_t gfp, void **shadowp);
@@ -106,6 +106,25 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
return find_next_bit(sis->zeromap, end, start) - start;
}
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+ struct swap_info_struct *si = swp_swap_info(entry);
+ pgoff_t offset = swp_offset(entry);
+ int i;
+
+ /*
+	 * While allocating a large folio and doing mTHP swapin, we need to
+	 * ensure none of the entries is cached; otherwise the mTHP folio
+	 * will conflict with a folio already in the swap cache.
+ */
+ for (i = 0; i < max_nr; i++) {
+ if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
+ return i;
+ }
+
+ return i;
+}
+
#else /* CONFIG_SWAP */
struct swap_iocb;
static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
@@ -141,7 +160,7 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
return NULL;
}
-static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
+static inline int swap_writeout(struct folio *f, struct writeback_control *wbc)
{
return 0;
}
@@ -163,11 +182,6 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
return filemap_get_folio(mapping, index);
}
-static inline bool add_to_swap(struct folio *folio)
-{
- return false;
-}
-
static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
{
return NULL;
@@ -204,6 +218,28 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
return 0;
}
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+ return 0;
+}
#endif /* CONFIG_SWAP */
+/**
+ * folio_index - File index of a folio.
+ * @folio: The folio.
+ *
+ * For a folio which is either in the page cache or the swap cache,
+ * return its index within the address_space it belongs to. If you know
+ * the folio is definitely in the page cache, you can look at the folio's
+ * index directly.
+ *
+ * Return: The index (offset in units of pages) of a folio in its file.
+ */
+static inline pgoff_t folio_index(struct folio *folio)
+{
+ if (unlikely(folio_test_swapcache(folio)))
+ return swap_cache_index(folio->swap);
+ return folio->index;
+}
+
#endif /* _MM_SWAP_H */
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index da1278f0563b..de779fed8c21 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -6,149 +6,106 @@
#include <linux/swapops.h> /* depends on mm.h include */
static DEFINE_MUTEX(swap_cgroup_mutex);
-struct swap_cgroup_ctrl {
- struct page **map;
- unsigned long length;
- spinlock_t lock;
-};
-
-static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
+/* Pack the cgroup IDs (short) of two swap entries into one swap_cgroup (atomic_t) */
+#define ID_PER_SC (sizeof(struct swap_cgroup) / sizeof(unsigned short))
+#define ID_SHIFT (BITS_PER_TYPE(unsigned short))
+#define ID_MASK (BIT(ID_SHIFT) - 1)
struct swap_cgroup {
- unsigned short id;
+ atomic_t ids;
};
-#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
-
-/*
- * SwapCgroup implements "lookup" and "exchange" operations.
- * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
- * against SwapCache. At swap_free(), this is accessed directly from swap.
- *
- * This means,
- * - we have no race in "exchange" when we're accessed via SwapCache because
- * SwapCache(and its swp_entry) is under lock.
- * - When called via swap_free(), there is no user of this entry and no race.
- * Then, we don't need lock around "exchange".
- *
- * TODO: we can push these buffers out to HIGHMEM.
- */
-
-/*
- * allocate buffer for swap_cgroup.
- */
-static int swap_cgroup_prepare(int type)
-{
- struct page *page;
- struct swap_cgroup_ctrl *ctrl;
- unsigned long idx, max;
-
- ctrl = &swap_cgroup_ctrl[type];
-
- for (idx = 0; idx < ctrl->length; idx++) {
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page)
- goto not_enough_page;
- ctrl->map[idx] = page;
- if (!(idx % SWAP_CLUSTER_MAX))
- cond_resched();
- }
- return 0;
-not_enough_page:
- max = idx;
- for (idx = 0; idx < max; idx++)
- __free_page(ctrl->map[idx]);
+struct swap_cgroup_ctrl {
+ struct swap_cgroup *map;
+};
- return -ENOMEM;
-}
+static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
-static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl,
- pgoff_t offset)
+static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map,
+ pgoff_t offset)
{
- struct page *mappage;
- struct swap_cgroup *sc;
+ unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT;
+ unsigned int old_ids = atomic_read(&map[offset / ID_PER_SC].ids);
+
+ BUILD_BUG_ON(!is_power_of_2(ID_PER_SC));
+ BUILD_BUG_ON(sizeof(struct swap_cgroup) != sizeof(atomic_t));
- mappage = ctrl->map[offset / SC_PER_PAGE];
- sc = page_address(mappage);
- return sc + offset % SC_PER_PAGE;
+ return (old_ids >> shift) & ID_MASK;
}
-static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
- struct swap_cgroup_ctrl **ctrlp)
+static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map,
+ pgoff_t offset,
+ unsigned short new_id)
{
- pgoff_t offset = swp_offset(ent);
- struct swap_cgroup_ctrl *ctrl;
-
- ctrl = &swap_cgroup_ctrl[swp_type(ent)];
- if (ctrlp)
- *ctrlp = ctrl;
- return __lookup_swap_cgroup(ctrl, offset);
+ unsigned short old_id;
+ struct swap_cgroup *sc = &map[offset / ID_PER_SC];
+ unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT;
+ unsigned int new_ids, old_ids = atomic_read(&sc->ids);
+
+ do {
+ old_id = (old_ids >> shift) & ID_MASK;
+ new_ids = (old_ids & ~(ID_MASK << shift));
+ new_ids |= ((unsigned int)new_id) << shift;
+ } while (!atomic_try_cmpxchg(&sc->ids, &old_ids, new_ids));
+
+ return old_id;
}
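The replacement packs the 16-bit cgroup IDs of two adjacent swap entries into a single 32-bit atomic word, so a lookup is a shift-and-mask and an update is a cmpxchg loop, with no per-type spinlock or page array. A stand-alone user-space model of that packing (C11 atomics rather than the kernel's atomic_t, purely illustrative):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define ID_SHIFT	16u
#define ID_MASK		((1u << ID_SHIFT) - 1)

/* One word holds the IDs for swap offsets 2n and 2n + 1. */
static uint16_t id_xchg(_Atomic uint32_t *word, unsigned long offset,
			uint16_t new_id)
{
	unsigned int shift = (offset & 1) * ID_SHIFT;
	uint32_t old = atomic_load(word), next;

	do {
		next = (old & ~(ID_MASK << shift)) |
		       ((uint32_t)new_id << shift);
	} while (!atomic_compare_exchange_weak(word, &old, next));

	return (old >> shift) & ID_MASK;
}

int main(void)
{
	_Atomic uint32_t word = 0;

	id_xchg(&word, 0, 42);	/* record ID 42 for the even entry */
	id_xchg(&word, 1, 7);	/* record ID 7 for the odd entry */
	printf("packed word: %#x\n", (unsigned int)atomic_load(&word));
	return 0;
}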
/**
- * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
- * @ent: swap entry to be cmpxchged
- * @old: old id
- * @new: new id
+ * swap_cgroup_record - record mem_cgroup for a set of swap entries.
+ * These entries must belong to a single folio, that folio must be in
+ * the process of being charged for swap space (swap out), and the
+ * entries must not have been charged yet.
*
- * Returns old id at success, 0 at failure.
- * (There is no mem_cgroup using 0 as its id)
+ * @folio: the folio that the swap entry belongs to
+ * @id: mem_cgroup ID to be recorded
+ * @ent: the first swap entry to be recorded
*/
-unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
- unsigned short old, unsigned short new)
+void swap_cgroup_record(struct folio *folio, unsigned short id,
+ swp_entry_t ent)
{
- struct swap_cgroup_ctrl *ctrl;
- struct swap_cgroup *sc;
- unsigned long flags;
- unsigned short retval;
-
- sc = lookup_swap_cgroup(ent, &ctrl);
-
- spin_lock_irqsave(&ctrl->lock, flags);
- retval = sc->id;
- if (retval == old)
- sc->id = new;
- else
- retval = 0;
- spin_unlock_irqrestore(&ctrl->lock, flags);
- return retval;
+ unsigned int nr_ents = folio_nr_pages(folio);
+ struct swap_cgroup *map;
+ pgoff_t offset, end;
+ unsigned short old;
+
+ offset = swp_offset(ent);
+ end = offset + nr_ents;
+ map = swap_cgroup_ctrl[swp_type(ent)].map;
+
+ do {
+ old = __swap_cgroup_id_xchg(map, offset, id);
+ VM_BUG_ON(old);
+ } while (++offset != end);
}
/**
- * swap_cgroup_record - record mem_cgroup for a set of swap entries
+ * swap_cgroup_clear - clear mem_cgroup for a set of swap entries.
+ * These entries must be being uncharged from swap. They either
+ * belong to a single folio in the swap cache (swap-in for
+ * cgroup v1), or no longer have any users (slot freeing).
+ *
+ * @ent: the first swap entry to be cleared
- * @id: mem_cgroup to be recorded
+ * @nr_ents: number of swap entries to be cleared
*
- * Returns old value at success, 0 at failure.
- * (Of course, old value can be 0.)
+ * Returns the existing old value.
*/
-unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
- unsigned int nr_ents)
+unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents)
{
- struct swap_cgroup_ctrl *ctrl;
- struct swap_cgroup *sc;
- unsigned short old;
- unsigned long flags;
- pgoff_t offset = swp_offset(ent);
- pgoff_t end = offset + nr_ents;
-
- sc = lookup_swap_cgroup(ent, &ctrl);
-
- spin_lock_irqsave(&ctrl->lock, flags);
- old = sc->id;
- for (;;) {
- VM_BUG_ON(sc->id != old);
- sc->id = id;
- offset++;
- if (offset == end)
- break;
- if (offset % SC_PER_PAGE)
- sc++;
- else
- sc = __lookup_swap_cgroup(ctrl, offset);
- }
- spin_unlock_irqrestore(&ctrl->lock, flags);
+ pgoff_t offset, end;
+ struct swap_cgroup *map;
+ unsigned short old, iter = 0;
+
+ offset = swp_offset(ent);
+ end = offset + nr_ents;
+ map = swap_cgroup_ctrl[swp_type(ent)].map;
+
+ do {
+ old = __swap_cgroup_id_xchg(map, offset, 0);
+ if (!iter)
+ iter = old;
+ VM_BUG_ON(iter != old);
+ } while (++offset != end);
return old;
}
@@ -161,39 +118,33 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
*/
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
+ struct swap_cgroup_ctrl *ctrl;
+
if (mem_cgroup_disabled())
return 0;
- return lookup_swap_cgroup(ent, NULL)->id;
+
+ ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+ return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent));
}
int swap_cgroup_swapon(int type, unsigned long max_pages)
{
- void *array;
- unsigned long length;
+ struct swap_cgroup *map;
struct swap_cgroup_ctrl *ctrl;
if (mem_cgroup_disabled())
return 0;
- length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
-
- array = vcalloc(length, sizeof(void *));
- if (!array)
+ BUILD_BUG_ON(sizeof(unsigned short) * ID_PER_SC !=
+ sizeof(struct swap_cgroup));
+ map = vzalloc(DIV_ROUND_UP(max_pages, ID_PER_SC) *
+ sizeof(struct swap_cgroup));
+ if (!map)
goto nomem;
ctrl = &swap_cgroup_ctrl[type];
mutex_lock(&swap_cgroup_mutex);
- ctrl->length = length;
- ctrl->map = array;
- spin_lock_init(&ctrl->lock);
- if (swap_cgroup_prepare(type)) {
- /* memory shortage */
- ctrl->map = NULL;
- ctrl->length = 0;
- mutex_unlock(&swap_cgroup_mutex);
- vfree(array);
- goto nomem;
- }
+ ctrl->map = map;
mutex_unlock(&swap_cgroup_mutex);
return 0;
@@ -205,8 +156,7 @@ nomem:
void swap_cgroup_swapoff(int type)
{
- struct page **map;
- unsigned long i, length;
+ struct swap_cgroup *map;
struct swap_cgroup_ctrl *ctrl;
if (mem_cgroup_disabled())
@@ -215,19 +165,8 @@ void swap_cgroup_swapoff(int type)
mutex_lock(&swap_cgroup_mutex);
ctrl = &swap_cgroup_ctrl[type];
map = ctrl->map;
- length = ctrl->length;
ctrl->map = NULL;
- ctrl->length = 0;
mutex_unlock(&swap_cgroup_mutex);
- if (map) {
- for (i = 0; i < length; i++) {
- struct page *page = map[i];
- if (page)
- __free_page(page);
- if (!(i % SWAP_CLUSTER_MAX))
- cond_resched();
- }
- vfree(map);
- }
+ vfree(map);
}
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
deleted file mode 100644
index 13ab3b771409..000000000000
--- a/mm/swap_slots.c
+++ /dev/null
@@ -1,353 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Manage cache of swap slots to be used for and returned from
- * swap.
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Author: Tim Chen <tim.c.chen@linux.intel.com>
- *
- * We allocate the swap slots from the global pool and put
- * it into local per cpu caches. This has the advantage
- * of no needing to acquire the swap_info lock every time
- * we need a new slot.
- *
- * There is also opportunity to simply return the slot
- * to local caches without needing to acquire swap_info
- * lock. We do not reuse the returned slots directly but
- * move them back to the global pool in a batch. This
- * allows the slots to coalesce and reduce fragmentation.
- *
- * The swap entry allocated is marked with SWAP_HAS_CACHE
- * flag in map_count that prevents it from being allocated
- * again from the global pool.
- *
- * The swap slots cache is protected by a mutex instead of
- * a spin lock as when we search for slots with scan_swap_map,
- * we can possibly sleep.
- */
-
-#include <linux/swap_slots.h>
-#include <linux/cpu.h>
-#include <linux/cpumask.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/mutex.h>
-#include <linux/mm.h>
-
-static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
-static bool swap_slot_cache_active;
-bool swap_slot_cache_enabled;
-static bool swap_slot_cache_initialized;
-static DEFINE_MUTEX(swap_slots_cache_mutex);
-/* Serialize swap slots cache enable/disable operations */
-static DEFINE_MUTEX(swap_slots_cache_enable_mutex);
-
-static void __drain_swap_slots_cache(unsigned int type);
-
-#define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled)
-#define SLOTS_CACHE 0x1
-#define SLOTS_CACHE_RET 0x2
-
-static void deactivate_swap_slots_cache(void)
-{
- mutex_lock(&swap_slots_cache_mutex);
- swap_slot_cache_active = false;
- __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
- mutex_unlock(&swap_slots_cache_mutex);
-}
-
-static void reactivate_swap_slots_cache(void)
-{
- mutex_lock(&swap_slots_cache_mutex);
- swap_slot_cache_active = true;
- mutex_unlock(&swap_slots_cache_mutex);
-}
-
-/* Must not be called with cpu hot plug lock */
-void disable_swap_slots_cache_lock(void)
-{
- mutex_lock(&swap_slots_cache_enable_mutex);
- swap_slot_cache_enabled = false;
- if (swap_slot_cache_initialized) {
- /* serialize with cpu hotplug operations */
- cpus_read_lock();
- __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
- cpus_read_unlock();
- }
-}
-
-static void __reenable_swap_slots_cache(void)
-{
- swap_slot_cache_enabled = has_usable_swap();
-}
-
-void reenable_swap_slots_cache_unlock(void)
-{
- __reenable_swap_slots_cache();
- mutex_unlock(&swap_slots_cache_enable_mutex);
-}
-
-static bool check_cache_active(void)
-{
- long pages;
-
- if (!swap_slot_cache_enabled)
- return false;
-
- pages = get_nr_swap_pages();
- if (!swap_slot_cache_active) {
- if (pages > num_online_cpus() *
- THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE)
- reactivate_swap_slots_cache();
- goto out;
- }
-
- /* if global pool of slot caches too low, deactivate cache */
- if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE)
- deactivate_swap_slots_cache();
-out:
- return swap_slot_cache_active;
-}
-
-static int alloc_swap_slot_cache(unsigned int cpu)
-{
- struct swap_slots_cache *cache;
- swp_entry_t *slots, *slots_ret;
-
- /*
- * Do allocation outside swap_slots_cache_mutex
- * as kvzalloc could trigger reclaim and folio_alloc_swap,
- * which can lock swap_slots_cache_mutex.
- */
- slots = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
- GFP_KERNEL);
- if (!slots)
- return -ENOMEM;
-
- slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
- GFP_KERNEL);
- if (!slots_ret) {
- kvfree(slots);
- return -ENOMEM;
- }
-
- mutex_lock(&swap_slots_cache_mutex);
- cache = &per_cpu(swp_slots, cpu);
- if (cache->slots || cache->slots_ret) {
- /* cache already allocated */
- mutex_unlock(&swap_slots_cache_mutex);
-
- kvfree(slots);
- kvfree(slots_ret);
-
- return 0;
- }
-
- if (!cache->lock_initialized) {
- mutex_init(&cache->alloc_lock);
- spin_lock_init(&cache->free_lock);
- cache->lock_initialized = true;
- }
- cache->nr = 0;
- cache->cur = 0;
- cache->n_ret = 0;
- /*
- * We initialized alloc_lock and free_lock earlier. We use
- * !cache->slots or !cache->slots_ret to know if it is safe to acquire
- * the corresponding lock and use the cache. Memory barrier below
- * ensures the assumption.
- */
- mb();
- cache->slots = slots;
- cache->slots_ret = slots_ret;
- mutex_unlock(&swap_slots_cache_mutex);
- return 0;
-}
-
-static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
- bool free_slots)
-{
- struct swap_slots_cache *cache;
- swp_entry_t *slots = NULL;
-
- cache = &per_cpu(swp_slots, cpu);
- if ((type & SLOTS_CACHE) && cache->slots) {
- mutex_lock(&cache->alloc_lock);
- swapcache_free_entries(cache->slots + cache->cur, cache->nr);
- cache->cur = 0;
- cache->nr = 0;
- if (free_slots && cache->slots) {
- kvfree(cache->slots);
- cache->slots = NULL;
- }
- mutex_unlock(&cache->alloc_lock);
- }
- if ((type & SLOTS_CACHE_RET) && cache->slots_ret) {
- spin_lock_irq(&cache->free_lock);
- swapcache_free_entries(cache->slots_ret, cache->n_ret);
- cache->n_ret = 0;
- if (free_slots && cache->slots_ret) {
- slots = cache->slots_ret;
- cache->slots_ret = NULL;
- }
- spin_unlock_irq(&cache->free_lock);
- kvfree(slots);
- }
-}
-
-static void __drain_swap_slots_cache(unsigned int type)
-{
- unsigned int cpu;
-
- /*
- * This function is called during
- * 1) swapoff, when we have to make sure no
- * left over slots are in cache when we remove
- * a swap device;
- * 2) disabling of swap slot cache, when we run low
- * on swap slots when allocating memory and need
- * to return swap slots to global pool.
- *
- * We cannot acquire cpu hot plug lock here as
- * this function can be invoked in the cpu
- * hot plug path:
- * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback
- * -> memory allocation -> direct reclaim -> folio_alloc_swap
- * -> drain_swap_slots_cache
- *
- * Hence the loop over current online cpu below could miss cpu that
- * is being brought online but not yet marked as online.
- * That is okay as we do not schedule and run anything on a
- * cpu before it has been marked online. Hence, we will not
- * fill any swap slots in slots cache of such cpu.
- * There are no slots on such cpu that need to be drained.
- */
- for_each_online_cpu(cpu)
- drain_slots_cache_cpu(cpu, type, false);
-}
-
-static int free_slot_cache(unsigned int cpu)
-{
- mutex_lock(&swap_slots_cache_mutex);
- drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true);
- mutex_unlock(&swap_slots_cache_mutex);
- return 0;
-}
-
-void enable_swap_slots_cache(void)
-{
- mutex_lock(&swap_slots_cache_enable_mutex);
- if (!swap_slot_cache_initialized) {
- int ret;
-
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
- alloc_swap_slot_cache, free_slot_cache);
- if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
- "without swap slots cache.\n", __func__))
- goto out_unlock;
-
- swap_slot_cache_initialized = true;
- }
-
- __reenable_swap_slots_cache();
-out_unlock:
- mutex_unlock(&swap_slots_cache_enable_mutex);
-}
-
-/* called with swap slot cache's alloc lock held */
-static int refill_swap_slots_cache(struct swap_slots_cache *cache)
-{
- if (!use_swap_slot_cache)
- return 0;
-
- cache->cur = 0;
- if (swap_slot_cache_active)
- cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
- cache->slots, 0);
-
- return cache->nr;
-}
-
-void free_swap_slot(swp_entry_t entry)
-{
- struct swap_slots_cache *cache;
-
- /* Large folio swap slot is not covered. */
- zswap_invalidate(entry);
-
- cache = raw_cpu_ptr(&swp_slots);
- if (likely(use_swap_slot_cache && cache->slots_ret)) {
- spin_lock_irq(&cache->free_lock);
- /* Swap slots cache may be deactivated before acquiring lock */
- if (!use_swap_slot_cache || !cache->slots_ret) {
- spin_unlock_irq(&cache->free_lock);
- goto direct_free;
- }
- if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) {
- /*
- * Return slots to global pool.
- * The current swap_map value is SWAP_HAS_CACHE.
- * Set it to 0 to indicate it is available for
- * allocation in global pool
- */
- swapcache_free_entries(cache->slots_ret, cache->n_ret);
- cache->n_ret = 0;
- }
- cache->slots_ret[cache->n_ret++] = entry;
- spin_unlock_irq(&cache->free_lock);
- } else {
-direct_free:
- swapcache_free_entries(&entry, 1);
- }
-}
-
-swp_entry_t folio_alloc_swap(struct folio *folio)
-{
- swp_entry_t entry;
- struct swap_slots_cache *cache;
-
- entry.val = 0;
-
- if (folio_test_large(folio)) {
- if (IS_ENABLED(CONFIG_THP_SWAP))
- get_swap_pages(1, &entry, folio_order(folio));
- goto out;
- }
-
- /*
- * Preemption is allowed here, because we may sleep
- * in refill_swap_slots_cache(). But it is safe, because
- * accesses to the per-CPU data structure are protected by the
- * mutex cache->alloc_lock.
- *
- * The alloc path here does not touch cache->slots_ret
- * so cache->free_lock is not taken.
- */
- cache = raw_cpu_ptr(&swp_slots);
-
- if (likely(check_cache_active() && cache->slots)) {
- mutex_lock(&cache->alloc_lock);
- if (cache->slots) {
-repeat:
- if (cache->nr) {
- entry = cache->slots[cache->cur];
- cache->slots[cache->cur++].val = 0;
- cache->nr--;
- } else if (refill_swap_slots_cache(cache)) {
- goto repeat;
- }
- }
- mutex_unlock(&cache->alloc_lock);
- if (entry.val)
- goto out;
- }
-
- get_swap_pages(1, &entry, 0);
-out:
- if (mem_cgroup_try_charge_swap(folio, entry)) {
- put_swap_folio(folio, entry);
- entry.val = 0;
- }
- return entry;
-}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e0c0321b8ff7..c354435a0923 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -20,7 +20,6 @@
#include <linux/blkdev.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
-#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
@@ -31,7 +30,6 @@
* vmscan's shrink_folio_list.
*/
static const struct address_space_operations swap_aops = {
- .writepage = swap_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_MIGRATION
.migrate_folio = migrate_folio,
@@ -85,7 +83,7 @@ void *get_shadow_from_swap_cache(swp_entry_t entry)
/*
* add_to_swap_cache resembles filemap_add_folio on swapper_space,
- * but sets SwapCache flag and private instead of mapping and index.
+ * but sets SwapCache flag and 'swap' instead of mapping and index.
*/
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
gfp_t gfp, void **shadowp)
@@ -167,67 +165,6 @@ void __delete_from_swap_cache(struct folio *folio,
__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
}
-/**
- * add_to_swap - allocate swap space for a folio
- * @folio: folio we want to move to swap
- *
- * Allocate swap space for the folio and add the folio to the
- * swap cache.
- *
- * Context: Caller needs to hold the folio lock.
- * Return: Whether the folio was added to the swap cache.
- */
-bool add_to_swap(struct folio *folio)
-{
- swp_entry_t entry;
- int err;
-
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
-
- entry = folio_alloc_swap(folio);
- if (!entry.val)
- return false;
-
- /*
- * XArray node allocations from PF_MEMALLOC contexts could
- * completely exhaust the page allocator. __GFP_NOMEMALLOC
- * stops emergency reserves from being allocated.
- *
- * TODO: this could cause a theoretical memory reclaim
- * deadlock in the swap out path.
- */
- /*
- * Add it to the swap cache.
- */
- err = add_to_swap_cache(folio, entry,
- __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
- if (err)
- /*
- * add_to_swap_cache() doesn't return -EEXIST, so we can safely
- * clear SWAP_HAS_CACHE flag.
- */
- goto fail;
- /*
- * Normally the folio will be dirtied in unmap because its
- * pte should be dirty. A special case is MADV_FREE page. The
- * page's pte could have dirty bit cleared but the folio's
- * SwapBacked flag is still set because clearing the dirty bit
- * and SwapBacked flag has no lock protected. For such folio,
- * unmap will not set dirty bit for it, so folio reclaim will
- * not write the folio out. This can cause data corruption when
- * the folio is swapped in later. Always setting the dirty flag
- * for the folio solves the problem.
- */
- folio_mark_dirty(folio);
-
- return true;
-
-fail:
- put_swap_folio(folio, entry);
- return false;
-}
-
/*
* This must be called only on folios that have
* been verified to be in the swap cache and locked.
@@ -270,9 +207,7 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
xa_unlock_irq(&address_space->i_pages);
/* search the next swapcache until we meet end */
- curr >>= SWAP_ADDRESS_SPACE_SHIFT;
- curr++;
- curr <<= SWAP_ADDRESS_SPACE_SHIFT;
+ curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES);
if (curr > end)
break;
}
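For reference, the ALIGN() above rounds (curr + 1) up to the start of the next swap-address-space-sized block, which matches the shift sequence it replaces. A standalone sketch of the same arithmetic, with an illustrative shift value rather than the kernel's constant:

/*
 * Standalone sketch: round an offset up to the next address-space-sized
 * boundary, as ALIGN(curr + 1, ...) does above. Constants and the helper
 * name here are illustrative only.
 */
#include <stdio.h>

#define ADDRESS_SPACE_SHIFT	14
#define ADDRESS_SPACE_PAGES	(1UL << ADDRESS_SPACE_SHIFT)
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long curr = 5;

	/* Old form: shift down, increment, shift back up. */
	unsigned long old_next = ((curr >> ADDRESS_SPACE_SHIFT) + 1) << ADDRESS_SPACE_SHIFT;
	/* New form: round curr + 1 up to the next boundary. */
	unsigned long new_next = ALIGN_UP(curr + 1, ADDRESS_SPACE_PAGES);

	printf("old=%lu new=%lu\n", old_next, new_next);	/* both print 16384 */
	return 0;
}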
@@ -296,13 +231,11 @@ void free_swap_cache(struct folio *folio)
}
/*
- * Perform a free_page(), also freeing any swap cache associated with
- * this page if it is the last user of the page.
+ * Freeing a folio and also freeing any swap cache associated with
+ * this folio if it is the last user.
*/
-void free_page_and_swap_cache(struct page *page)
+void free_folio_and_swap_cache(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
free_swap_cache(folio);
if (!is_huge_zero_folio(folio))
folio_put(folio);
@@ -317,7 +250,6 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
struct folio_batch folios;
unsigned int refs[PAGEVEC_SIZE];
- lru_add_drain();
folio_batch_init(&folios);
for (int i = 0; i < nr; i++) {
struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
@@ -433,17 +365,13 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
bool skip_if_exists)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si = swp_swap_info(entry);
struct folio *folio;
struct folio *new_folio = NULL;
struct folio *result = NULL;
void *shadow = NULL;
*new_page_allocated = false;
- si = get_swap_device(entry);
- if (!si)
- return NULL;
-
for (;;) {
int err;
/*
@@ -458,13 +386,8 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/*
* Just skip read ahead for unused swap slot.
- * During swap_off when swap_slot_cache is disabled,
- * we have to handle the race between putting
- * swap entry in swap cache and marking swap slot
- * as SWAP_HAS_CACHE. That's done in later part of code or
- * else swap_off will be aborted if we return NULL.
*/
- if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
+ if (!swap_entry_swapped(si, entry))
goto put_and_return;
/*
@@ -522,7 +445,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
goto fail_unlock;
- mem_cgroup_swapin_uncharge_swap(entry, 1);
+ memcg1_swapin(entry, 1);
if (shadow)
workingset_refault(new_folio, shadow);
@@ -539,7 +462,6 @@ fail_unlock:
put_swap_folio(new_folio, entry);
folio_unlock(new_folio);
put_and_return:
- put_swap_device(si);
if (!(*new_page_allocated) && new_folio)
folio_put(new_folio);
return result;
@@ -559,11 +481,16 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
struct swap_iocb **plug)
{
+ struct swap_info_struct *si;
bool page_allocated;
struct mempolicy *mpol;
pgoff_t ilx;
struct folio *folio;
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
+
mpol = get_vma_policy(vma, addr, 0, &ilx);
folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
&page_allocated, false);
@@ -571,6 +498,8 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
if (page_allocated)
swap_read_folio(folio, plug);
+
+ put_swap_device(si);
return folio;
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b0a9071cfe1d..68ce283e84be 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -37,7 +37,6 @@
#include <linux/oom.h>
#include <linux/swapfile.h>
#include <linux/export.h>
-#include <linux/swap_slots.h>
#include <linux/sort.h>
#include <linux/completion.h>
#include <linux/suspend.h>
@@ -53,15 +52,15 @@
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
-static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
- unsigned int nr_pages);
-static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
+static void swap_entries_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr_pages);
+static void swap_range_alloc(struct swap_info_struct *si,
unsigned int nr_entries);
static bool folio_swapcache_freeable(struct folio *folio);
-static struct swap_cluster_info *lock_cluster_or_swap_info(
- struct swap_info_struct *si, unsigned long offset);
-static void unlock_cluster_or_swap_info(struct swap_info_struct *si,
- struct swap_cluster_info *ci);
+static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
+ unsigned long offset);
+static inline void unlock_cluster(struct swap_cluster_info *ci);
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
@@ -116,6 +115,18 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
atomic_t nr_rotate_swap = ATOMIC_INIT(0);
+struct percpu_swap_cluster {
+ struct swap_info_struct *si[SWAP_NR_ORDERS];
+ unsigned long offset[SWAP_NR_ORDERS];
+ local_lock_t lock;
+};
+
+static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
+ .si = { NULL },
+ .offset = { SWAP_ENTRY_INVALID },
+ .lock = INIT_LOCAL_LOCK(),
+};
+
static struct swap_info_struct *swap_type_to_swap_info(int type)
{
if (type >= MAX_SWAPFILES)
@@ -129,6 +140,26 @@ static inline unsigned char swap_count(unsigned char ent)
return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
}
+/*
+ * Use the second highest bit of inuse_pages counter as the indicator
+ * if one swap device is on the available plist, so the atomic can
+ * still be updated arithmetically while having special data embedded.
+ *
+ * inuse_pages counter is the only thing indicating if a device should
+ * be on avail_lists or not (except swapon / swapoff). By embedding the
+ * off-list bit in the atomic counter, updates no longer need any lock
+ * to check the list status.
+ *
+ * This bit will be set if the device is not on the plist and not
+ * usable, will be cleared if the device is on the plist.
+ */
+#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
+#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)
+static long swap_usage_in_pages(struct swap_info_struct *si)
+{
+ return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
+}
+
/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY 0x1
/*
@@ -138,10 +169,8 @@ static inline unsigned char swap_count(unsigned char ent)
#define TTRS_UNMAPPED 0x2
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL 0x4
-/* Reclaim directly, bypass the slot cache and don't touch device lock */
-#define TTRS_DIRECT 0x8
-static bool swap_is_has_cache(struct swap_info_struct *si,
+static bool swap_only_has_cache(struct swap_info_struct *si,
unsigned long offset, int nr_pages)
{
unsigned char *map = si->swap_map + offset;
@@ -163,7 +192,7 @@ static bool swap_is_last_map(struct swap_info_struct *si,
unsigned char *map_end = map + nr_pages;
unsigned char count = *map;
- if (swap_count(count) != 1)
+ if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM)
return false;
while (++map < map_end) {
@@ -190,6 +219,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
int ret, nr_pages;
bool need_reclaim;
+again:
folio = filemap_get_folio(address_space, swap_cache_index(entry));
if (IS_ERR(folio))
return 0;
@@ -207,8 +237,16 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
if (!folio_trylock(folio))
goto out;
- /* offset could point to the middle of a large folio */
+ /*
+ * Offset could point to the middle of a large folio, or folio
+ * may no longer point to the expected offset before it's locked.
+ */
entry = folio->swap;
+ if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto again;
+ }
offset = swp_offset(entry);
need_reclaim = ((flags & TTRS_ANYWAY) ||
@@ -222,32 +260,14 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
* swap_map is HAS_CACHE only, which means the slots have no page table
* reference or pending writeback, and can't be allocated to others.
*/
- ci = lock_cluster_or_swap_info(si, offset);
- need_reclaim = swap_is_has_cache(si, offset, nr_pages);
- unlock_cluster_or_swap_info(si, ci);
+ ci = lock_cluster(si, offset);
+ need_reclaim = swap_only_has_cache(si, offset, nr_pages);
+ unlock_cluster(ci);
if (!need_reclaim)
goto out_unlock;
- if (!(flags & TTRS_DIRECT)) {
- /* Free through slot cache */
- delete_from_swap_cache(folio);
- folio_set_dirty(folio);
- ret = nr_pages;
- goto out_unlock;
- }
-
- xa_lock_irq(&address_space->i_pages);
- __delete_from_swap_cache(folio, entry, NULL);
- xa_unlock_irq(&address_space->i_pages);
- folio_ref_sub(folio, nr_pages);
+ delete_from_swap_cache(folio);
folio_set_dirty(folio);
-
- spin_lock(&si->lock);
- /* Only sinple page folio can be backed by zswap */
- if (nr_pages == 1)
- zswap_invalidate(entry);
- swap_entry_range_free(si, entry, nr_pages);
- spin_unlock(&si->lock);
ret = nr_pages;
out_unlock:
folio_unlock(folio);
@@ -382,9 +402,23 @@ static void discard_swap_cluster(struct swap_info_struct *si,
#endif
#define LATENCY_LIMIT 256
-static inline bool cluster_is_free(struct swap_cluster_info *info)
+static inline bool cluster_is_empty(struct swap_cluster_info *info)
+{
+ return info->count == 0;
+}
+
+static inline bool cluster_is_discard(struct swap_cluster_info *info)
+{
+ return info->flags == CLUSTER_FLAG_DISCARD;
+}
+
+static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
{
- return info->flags & CLUSTER_FLAG_FREE;
+ if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
+ return false;
+ if (!order)
+ return true;
+ return cluster_is_empty(ci) || order == ci->order;
}
static inline unsigned int cluster_index(struct swap_info_struct *si,
@@ -393,6 +427,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si,
return ci - si->cluster_info;
}
+static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si,
+ unsigned long offset)
+{
+ return &si->cluster_info[offset / SWAPFILE_CLUSTER];
+}
+
static inline unsigned int cluster_offset(struct swap_info_struct *si,
struct swap_cluster_info *ci)
{
@@ -404,105 +444,134 @@ static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si
{
struct swap_cluster_info *ci;
- ci = si->cluster_info;
- if (ci) {
- ci += offset / SWAPFILE_CLUSTER;
- spin_lock(&ci->lock);
- }
+ ci = offset_to_cluster(si, offset);
+ spin_lock(&ci->lock);
+
return ci;
}
static inline void unlock_cluster(struct swap_cluster_info *ci)
{
- if (ci)
- spin_unlock(&ci->lock);
+ spin_unlock(&ci->lock);
}
-/*
- * Determine the locking method in use for this device. Return
- * swap_cluster_info if SSD-style cluster-based locking is in place.
- */
-static inline struct swap_cluster_info *lock_cluster_or_swap_info(
- struct swap_info_struct *si, unsigned long offset)
+static void move_cluster(struct swap_info_struct *si,
+ struct swap_cluster_info *ci, struct list_head *list,
+ enum swap_cluster_flags new_flags)
{
- struct swap_cluster_info *ci;
+ VM_WARN_ON(ci->flags == new_flags);
- /* Try to use fine-grained SSD-style locking if available: */
- ci = lock_cluster(si, offset);
- /* Otherwise, fall back to traditional, coarse locking: */
- if (!ci)
- spin_lock(&si->lock);
-
- return ci;
-}
+ BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX);
+ lockdep_assert_held(&ci->lock);
-static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
- struct swap_cluster_info *ci)
-{
- if (ci)
- unlock_cluster(ci);
+ spin_lock(&si->lock);
+ if (ci->flags == CLUSTER_FLAG_NONE)
+ list_add_tail(&ci->list, list);
else
- spin_unlock(&si->lock);
+ list_move_tail(&ci->list, list);
+ spin_unlock(&si->lock);
+
+ if (ci->flags == CLUSTER_FLAG_FRAG)
+ atomic_long_dec(&si->frag_cluster_nr[ci->order]);
+ else if (new_flags == CLUSTER_FLAG_FRAG)
+ atomic_long_inc(&si->frag_cluster_nr[ci->order]);
+ ci->flags = new_flags;
}
/* Add a cluster to discard list and schedule it to do discard */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
struct swap_cluster_info *ci)
{
- unsigned int idx = cluster_index(si, ci);
- /*
- * If scan_swap_map_slots() can't find a free cluster, it will check
- * si->swap_map directly. To make sure the discarding cluster isn't
- * taken by scan_swap_map_slots(), mark the swap entries bad (occupied).
- * It will be cleared after discard
- */
- memset(si->swap_map + idx * SWAPFILE_CLUSTER,
- SWAP_MAP_BAD, SWAPFILE_CLUSTER);
-
- VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
- list_move_tail(&ci->list, &si->discard_clusters);
- ci->flags = 0;
+ VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
+ move_cluster(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD);
schedule_work(&si->discard_work);
}
static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
- lockdep_assert_held(&si->lock);
lockdep_assert_held(&ci->lock);
-
- if (ci->flags)
- list_move_tail(&ci->list, &si->free_clusters);
- else
- list_add_tail(&ci->list, &si->free_clusters);
- ci->flags = CLUSTER_FLAG_FREE;
+ move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
ci->order = 0;
}
/*
+ * Isolate and lock the first cluster that is not contended on a list,
+ * clearing its flag before taking it off-list. The cluster flag must be
+ * kept in sync with the list status, so cluster updaters can always know
+ * the cluster's list status without touching the si lock.
+ *
+ * Note it's possible that all clusters on a list are contended, so this
+ * may return NULL for a non-empty list.
+ */
+static struct swap_cluster_info *isolate_lock_cluster(
+ struct swap_info_struct *si, struct list_head *list)
+{
+ struct swap_cluster_info *ci, *ret = NULL;
+
+ spin_lock(&si->lock);
+
+ if (unlikely(!(si->flags & SWP_WRITEOK)))
+ goto out;
+
+ list_for_each_entry(ci, list, list) {
+ if (!spin_trylock(&ci->lock))
+ continue;
+
+		/* We may only isolate and clear flags of the following lists */
+ VM_BUG_ON(!ci->flags);
+ VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE &&
+ ci->flags != CLUSTER_FLAG_FULL);
+
+ list_del(&ci->list);
+ ci->flags = CLUSTER_FLAG_NONE;
+ ret = ci;
+ break;
+ }
+out:
+ spin_unlock(&si->lock);
+
+ return ret;
+}
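The helper above uses an isolate-under-trylock pattern: hold the short si->lock only for the list walk, skip clusters whose own lock is contended, and hand back the first uncontended one already locked and unlinked. A userspace sketch of the same pattern, using pthread primitives and illustrative names rather than the kernel's list and lock types:

/*
 * Userspace sketch of the isolate-under-trylock pattern: walk the list
 * under its (short) list lock, trylock each entry's own lock, and unlink
 * the first uncontended entry, returning it locked.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct node {
	pthread_mutex_t lock;
	struct node *next;
	bool on_list;			/* kept in sync with list membership */
};

struct lockable_list {
	pthread_mutex_t lock;
	struct node *head;
};

/*
 * Returns a node removed from the list with node->lock held, or NULL if
 * every node was contended (possible even for a non-empty list).
 */
struct node *isolate_lock_first(struct lockable_list *l)
{
	struct node **pp, *n, *ret = NULL;

	pthread_mutex_lock(&l->lock);
	for (pp = &l->head; (n = *pp) != NULL; pp = &n->next) {
		if (pthread_mutex_trylock(&n->lock) != 0)
			continue;		/* contended: skip it */
		*pp = n->next;			/* unlink under both locks */
		n->next = NULL;
		n->on_list = false;
		ret = n;
		break;
	}
	pthread_mutex_unlock(&l->lock);
	return ret;				/* caller unlocks ret->lock */
}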
+
+/*
* Doing discard actually. After a cluster discard is finished, the cluster
- * will be added to free cluster list. caller should hold si->lock.
-*/
-static void swap_do_scheduled_discard(struct swap_info_struct *si)
+ * will be added to the free cluster list. Discard clusters are a bit
+ * special as they don't participate in allocation or reclaim, so clusters
+ * marked as CLUSTER_FLAG_DISCARD must remain off-list or on the discard list.
+ */
+static bool swap_do_scheduled_discard(struct swap_info_struct *si)
{
struct swap_cluster_info *ci;
+ bool ret = false;
unsigned int idx;
+ spin_lock(&si->lock);
while (!list_empty(&si->discard_clusters)) {
ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
+ /*
+		 * Delete the cluster from the list to prepare for discard, but
+		 * keep the CLUSTER_FLAG_DISCARD flag: percpu_swap_cluster could
+		 * still be pointing to it, or relocate_cluster could run into it.
+ */
list_del(&ci->list);
idx = cluster_index(si, ci);
spin_unlock(&si->lock);
-
discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
SWAPFILE_CLUSTER);
- spin_lock(&si->lock);
spin_lock(&ci->lock);
+ /*
+ * Discard is done, clear its flags as it's off-list, then
+ * return the cluster to allocation list.
+ */
+ ci->flags = CLUSTER_FLAG_NONE;
__free_cluster(si, ci);
- memset(si->swap_map + idx * SWAPFILE_CLUSTER,
- 0, SWAPFILE_CLUSTER);
spin_unlock(&ci->lock);
+ ret = true;
+ spin_lock(&si->lock);
}
+ spin_unlock(&si->lock);
+ return ret;
}
static void swap_discard_work(struct work_struct *work)
@@ -511,9 +580,7 @@ static void swap_discard_work(struct work_struct *work)
si = container_of(work, struct swap_info_struct, discard_work);
- spin_lock(&si->lock);
swap_do_scheduled_discard(si);
- spin_unlock(&si->lock);
}
static void swap_users_ref_free(struct percpu_ref *ref)
@@ -524,15 +591,16 @@ static void swap_users_ref_free(struct percpu_ref *ref)
complete(&si->comp);
}
+/*
+ * Must be called after freeing if ci->count == 0; moves the cluster to the
+ * free or discard list.
+ */
static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
VM_BUG_ON(ci->count != 0);
- lockdep_assert_held(&si->lock);
+ VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
lockdep_assert_held(&ci->lock);
- if (ci->flags & CLUSTER_FLAG_FRAG)
- si->frag_cluster_nr[ci->order]--;
-
/*
* If the swap is discardable, prepare discard the cluster
* instead of free it immediately. The cluster will be freed
@@ -548,6 +616,49 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *
}
/*
+ * Must be called after freeing if ci->count != 0; moves the cluster to the
+ * nonfull list.
+ */
+static void partial_free_cluster(struct swap_info_struct *si,
+ struct swap_cluster_info *ci)
+{
+ VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER);
+ lockdep_assert_held(&ci->lock);
+
+ if (ci->flags != CLUSTER_FLAG_NONFULL)
+ move_cluster(si, ci, &si->nonfull_clusters[ci->order],
+ CLUSTER_FLAG_NONFULL);
+}
+
+/*
+ * Must be called after allocation; moves the cluster to the full or frag list.
+ * Note: allocation doesn't acquire the si lock, and may drop the ci lock for
+ * reclaim, so the cluster could be anywhere when called.
+ */
+static void relocate_cluster(struct swap_info_struct *si,
+ struct swap_cluster_info *ci)
+{
+ lockdep_assert_held(&ci->lock);
+
+ /* Discard cluster must remain off-list or on discard list */
+ if (cluster_is_discard(ci))
+ return;
+
+ if (!ci->count) {
+ if (ci->flags != CLUSTER_FLAG_FREE)
+ free_cluster(si, ci);
+ } else if (ci->count != SWAPFILE_CLUSTER) {
+ if (ci->flags != CLUSTER_FLAG_FRAG)
+ move_cluster(si, ci, &si->frag_clusters[ci->order],
+ CLUSTER_FLAG_FRAG);
+ } else {
+ if (ci->flags != CLUSTER_FLAG_FULL)
+ move_cluster(si, ci, &si->full_clusters,
+ CLUSTER_FLAG_FULL);
+ }
+}
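Taken together, free_cluster(), partial_free_cluster() and relocate_cluster() form a small state machine keyed only on the cluster's usage count. A compact sketch of the target-list decision, with illustrative names and the discard case omitted:

/*
 * Sketch of the cluster list state machine implied by the helpers above:
 * after a free or an allocation, the cluster's usage count alone decides
 * which list it belongs on.
 */
enum target_list { LIST_FREE, LIST_NONFULL, LIST_FRAG, LIST_FULL };

/* Free path: free_cluster() vs. partial_free_cluster(). */
enum target_list list_after_free(unsigned int count)
{
	return count ? LIST_NONFULL : LIST_FREE;
}

/* Allocation path: relocate_cluster(), called with ci->lock held. */
enum target_list list_after_alloc(unsigned int count, unsigned int cluster_size)
{
	if (count == 0)
		return LIST_FREE;	/* emptied while the lock was dropped */
	if (count == cluster_size)
		return LIST_FULL;
	return LIST_FRAG;		/* partially used: frag list */
}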
+
+/*
* The cluster corresponding to page_nr will be used. The cluster will not be
* added to free cluster list and its usage counter will be increased by 1.
* Only used for initialization.
@@ -558,9 +669,6 @@ static void inc_cluster_info_page(struct swap_info_struct *si,
unsigned long idx = page_nr / SWAPFILE_CLUSTER;
struct swap_cluster_info *ci;
- if (!cluster_info)
- return;
-
ci = cluster_info + idx;
ci->count++;
@@ -568,63 +676,33 @@ static void inc_cluster_info_page(struct swap_info_struct *si,
VM_BUG_ON(ci->flags);
}
-/*
- * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0,
- * which means no page in the cluster is in use, we can optionally discard
- * the cluster and add it to free cluster list.
- */
-static void dec_cluster_info_page(struct swap_info_struct *si,
- struct swap_cluster_info *ci, int nr_pages)
-{
- if (!si->cluster_info)
- return;
-
- VM_BUG_ON(ci->count < nr_pages);
- VM_BUG_ON(cluster_is_free(ci));
- lockdep_assert_held(&si->lock);
- lockdep_assert_held(&ci->lock);
- ci->count -= nr_pages;
-
- if (!ci->count) {
- free_cluster(si, ci);
- return;
- }
-
- if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
- VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
- if (ci->flags & CLUSTER_FLAG_FRAG)
- si->frag_cluster_nr[ci->order]--;
- list_move_tail(&ci->list, &si->nonfull_clusters[ci->order]);
- ci->flags = CLUSTER_FLAG_NONFULL;
- }
-}
-
static bool cluster_reclaim_range(struct swap_info_struct *si,
struct swap_cluster_info *ci,
unsigned long start, unsigned long end)
{
unsigned char *map = si->swap_map;
- unsigned long offset;
+ unsigned long offset = start;
+ int nr_reclaim;
spin_unlock(&ci->lock);
- spin_unlock(&si->lock);
-
- for (offset = start; offset < end; offset++) {
+ do {
switch (READ_ONCE(map[offset])) {
case 0:
- continue;
+ offset++;
+ break;
case SWAP_HAS_CACHE:
- if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0)
- continue;
- goto out;
+ nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
+ if (nr_reclaim > 0)
+ offset += nr_reclaim;
+ else
+ goto out;
+ break;
default:
goto out;
}
- }
+ } while (offset < end);
out:
- spin_lock(&si->lock);
spin_lock(&ci->lock);
-
/*
* Recheck the range no matter reclaim succeeded or not, the slot
	 * could have been freed while we are not holding the lock.
@@ -638,11 +716,14 @@ out:
static bool cluster_scan_range(struct swap_info_struct *si,
struct swap_cluster_info *ci,
- unsigned long start, unsigned int nr_pages)
+ unsigned long start, unsigned int nr_pages,
+ bool *need_reclaim)
{
unsigned long offset, end = start + nr_pages;
unsigned char *map = si->swap_map;
- bool need_reclaim = false;
+
+ if (cluster_is_empty(ci))
+ return true;
for (offset = start; offset < end; offset++) {
switch (READ_ONCE(map[offset])) {
@@ -651,16 +732,13 @@ static bool cluster_scan_range(struct swap_info_struct *si,
case SWAP_HAS_CACHE:
if (!vm_swap_full())
return false;
- need_reclaim = true;
+ *need_reclaim = true;
continue;
default:
return false;
}
}
- if (need_reclaim)
- return cluster_reclaim_range(si, ci, start, end);
-
return true;
}
@@ -670,76 +748,83 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
{
unsigned int nr_pages = 1 << order;
+ lockdep_assert_held(&ci->lock);
+
if (!(si->flags & SWP_WRITEOK))
return false;
- if (cluster_is_free(ci)) {
- if (nr_pages < SWAPFILE_CLUSTER) {
- list_move_tail(&ci->list, &si->nonfull_clusters[order]);
- ci->flags = CLUSTER_FLAG_NONFULL;
- }
+ /*
+ * The first allocation in a cluster makes the
+ * cluster exclusive to this order
+ */
+ if (cluster_is_empty(ci))
ci->order = order;
- }
memset(si->swap_map + start, usage, nr_pages);
- swap_range_alloc(si, start, nr_pages);
+ swap_range_alloc(si, nr_pages);
ci->count += nr_pages;
- if (ci->count == SWAPFILE_CLUSTER) {
- VM_BUG_ON(!(ci->flags &
- (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG)));
- if (ci->flags & CLUSTER_FLAG_FRAG)
- si->frag_cluster_nr[ci->order]--;
- list_move_tail(&ci->list, &si->full_clusters);
- ci->flags = CLUSTER_FLAG_FULL;
- }
-
return true;
}
-static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset,
- unsigned int *foundp, unsigned int order,
+/* Try to use a new cluster for the current CPU and allocate from it. */
+static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ unsigned long offset,
+ unsigned int order,
unsigned char usage)
{
- unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1);
+ unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
+ unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER);
unsigned long end = min(start + SWAPFILE_CLUSTER, si->max);
unsigned int nr_pages = 1 << order;
- struct swap_cluster_info *ci;
+ bool need_reclaim, ret;
- if (end < nr_pages)
- return SWAP_NEXT_INVALID;
- end -= nr_pages;
+ lockdep_assert_held(&ci->lock);
- ci = lock_cluster(si, offset);
- if (ci->count + nr_pages > SWAPFILE_CLUSTER) {
- offset = SWAP_NEXT_INVALID;
- goto done;
- }
+ if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER)
+ goto out;
- while (offset <= end) {
- if (cluster_scan_range(si, ci, offset, nr_pages)) {
- if (!cluster_alloc_range(si, ci, offset, usage, order)) {
- offset = SWAP_NEXT_INVALID;
- goto done;
- }
- *foundp = offset;
- if (ci->count == SWAPFILE_CLUSTER) {
- offset = SWAP_NEXT_INVALID;
- goto done;
- }
- offset += nr_pages;
- break;
+ for (end -= nr_pages; offset <= end; offset += nr_pages) {
+ need_reclaim = false;
+ if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim))
+ continue;
+ if (need_reclaim) {
+ ret = cluster_reclaim_range(si, ci, offset, offset + nr_pages);
+ /*
+			 * Reclaim drops ci->lock, so the cluster could have been
+			 * used by another order. Don't check the flag here: an
+			 * off-list cluster has no flag set, and a change of list
+			 * won't cause fragmentation.
+ */
+ if (!cluster_is_usable(ci, order))
+ goto out;
+ if (cluster_is_empty(ci))
+ offset = start;
+ /* Reclaim failed but cluster is usable, try next */
+ if (!ret)
+ continue;
}
+ if (!cluster_alloc_range(si, ci, offset, usage, order))
+ break;
+ found = offset;
offset += nr_pages;
+ if (ci->count < SWAPFILE_CLUSTER && offset <= end)
+ next = offset;
+ break;
}
- if (offset > end)
- offset = SWAP_NEXT_INVALID;
-done:
+out:
+ relocate_cluster(si, ci);
unlock_cluster(ci);
- return offset;
+ if (si->flags & SWP_SOLIDSTATE) {
+ this_cpu_write(percpu_swap_cluster.offset[order], next);
+ this_cpu_write(percpu_swap_cluster.si[order], si);
+ } else {
+ si->global_cluster->next[order] = next;
+ }
+ return found;
}
-/* Return true if reclaimed a whole cluster */
static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
{
long to_scan = 1;
@@ -749,20 +834,19 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
int nr_reclaim;
if (force)
- to_scan = si->inuse_pages / SWAPFILE_CLUSTER;
+ to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
- while (!list_empty(&si->full_clusters)) {
- ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list);
- list_move_tail(&ci->list, &si->full_clusters);
+ while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
offset = cluster_offset(si, ci);
end = min(si->max, offset + SWAPFILE_CLUSTER);
to_scan--;
- spin_unlock(&si->lock);
while (offset < end) {
if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) {
+ spin_unlock(&ci->lock);
nr_reclaim = __try_to_reclaim_swap(si, offset,
- TTRS_ANYWAY | TTRS_DIRECT);
+ TTRS_ANYWAY);
+ spin_lock(&ci->lock);
if (nr_reclaim) {
offset += abs(nr_reclaim);
continue;
@@ -770,8 +854,12 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
}
offset++;
}
- spin_lock(&si->lock);
+ /* in case no swap cache is reclaimed */
+ if (ci->flags == CLUSTER_FLAG_NONE)
+ relocate_cluster(si, ci);
+
+ unlock_cluster(ci);
if (to_scan <= 0)
break;
}
@@ -783,42 +871,54 @@ static void swap_reclaim_work(struct work_struct *work)
si = container_of(work, struct swap_info_struct, reclaim_work);
- spin_lock(&si->lock);
swap_reclaim_full_clusters(si, true);
- spin_unlock(&si->lock);
}
/*
- * Try to get swap entries with specified order from current cpu's swap entry
- * pool (a cluster). This might involve allocating a new cluster for current CPU
- * too.
+ * Try to allocate swap entries with the specified order and try to set a new
+ * cluster for the current CPU too.
*/
static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
unsigned char usage)
{
- struct percpu_cluster *cluster;
struct swap_cluster_info *ci;
- unsigned int offset, found = 0;
+ unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
-new_cluster:
- lockdep_assert_held(&si->lock);
- cluster = this_cpu_ptr(si->percpu_cluster);
- offset = cluster->next[order];
- if (offset) {
- offset = alloc_swap_scan_cluster(si, offset, &found, order, usage);
+ /*
+	 * The swapfile is not a block device, so large
+	 * entries cannot be allocated.
+ */
+ if (order && !(si->flags & SWP_BLKDEV))
+ return 0;
+
+ if (!(si->flags & SWP_SOLIDSTATE)) {
+ /* Serialize HDD SWAP allocation for each device. */
+ spin_lock(&si->global_cluster_lock);
+ offset = si->global_cluster->next[order];
+ if (offset == SWAP_ENTRY_INVALID)
+ goto new_cluster;
+
+ ci = lock_cluster(si, offset);
+ /* Cluster could have been used by another order */
+ if (cluster_is_usable(ci, order)) {
+ if (cluster_is_empty(ci))
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset,
+ order, usage);
+ } else {
+ unlock_cluster(ci);
+ }
if (found)
goto done;
}
- if (!list_empty(&si->free_clusters)) {
- ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage);
- /*
- * Either we didn't touch the cluster due to swapoff,
- * or the allocation must success.
- */
- VM_BUG_ON((si->flags & SWP_WRITEOK) && !found);
- goto done;
+new_cluster:
+ ci = isolate_lock_cluster(si, &si->free_clusters);
+ if (ci) {
+ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
+ order, usage);
+ if (found)
+ goto done;
}
/* Try reclaim from full clusters if free clusters list is drained */
@@ -826,56 +926,42 @@ new_cluster:
swap_reclaim_full_clusters(si, false);
if (order < PMD_ORDER) {
- unsigned int frags = 0;
-
- while (!list_empty(&si->nonfull_clusters[order])) {
- ci = list_first_entry(&si->nonfull_clusters[order],
- struct swap_cluster_info, list);
- list_move_tail(&ci->list, &si->frag_clusters[order]);
- ci->flags = CLUSTER_FLAG_FRAG;
- si->frag_cluster_nr[order]++;
- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
- &found, order, usage);
- frags++;
+ unsigned int frags = 0, frags_existing;
+
+ while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) {
+ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
+ order, usage);
if (found)
- break;
+ goto done;
+			/* Clusters that failed to allocate are moved to frag_clusters */
+ frags++;
}
- if (!found) {
+ frags_existing = atomic_long_read(&si->frag_cluster_nr[order]);
+ while (frags < frags_existing &&
+ (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) {
+ atomic_long_dec(&si->frag_cluster_nr[order]);
/*
- * Nonfull clusters are moved to frag tail if we reached
- * here, count them too, don't over scan the frag list.
+			 * Rotate the frag list to iterate: these clusters all
+			 * failed high order allocation or were moved here due to
+			 * per-CPU usage, but they could still contain newly
+			 * released reclaimable (e.g. lazy-freed swap cache) slots.
*/
- while (frags < si->frag_cluster_nr[order]) {
- ci = list_first_entry(&si->frag_clusters[order],
- struct swap_cluster_info, list);
- /*
- * Rotate the frag list to iterate, they were all failing
- * high order allocation or moved here due to per-CPU usage,
- * this help keeping usable cluster ahead.
- */
- list_move_tail(&ci->list, &si->frag_clusters[order]);
- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
- &found, order, usage);
- frags++;
- if (found)
- break;
- }
+ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
+ order, usage);
+ if (found)
+ goto done;
+ frags++;
}
}
- if (found)
- goto done;
-
- if (!list_empty(&si->discard_clusters)) {
- /*
- * we don't have free cluster but have some clusters in
- * discarding, do discard now and reclaim them, then
- * reread cluster_next_cpu since we dropped si->lock
- */
- swap_do_scheduled_discard(si);
+ /*
+	 * We don't have a free cluster but have some clusters being
+	 * discarded: do the discard now, reclaim them, then retry
+	 * scanning from a new cluster.
+ */
+ if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si))
goto new_cluster;
- }
if (order)
goto done;
@@ -886,74 +972,149 @@ new_cluster:
	 * Clusters here have at least one usable slot and can't fail order 0
* allocation, but reclaim may drop si->lock and race with another user.
*/
- while (!list_empty(&si->frag_clusters[o])) {
- ci = list_first_entry(&si->frag_clusters[o],
- struct swap_cluster_info, list);
- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
- &found, 0, usage);
+ while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) {
+ atomic_long_dec(&si->frag_cluster_nr[o]);
+ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
+ 0, usage);
if (found)
goto done;
}
- while (!list_empty(&si->nonfull_clusters[o])) {
- ci = list_first_entry(&si->nonfull_clusters[o],
- struct swap_cluster_info, list);
- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
- &found, 0, usage);
+ while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) {
+ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
+ 0, usage);
if (found)
goto done;
}
}
-
done:
- cluster->next[order] = offset;
+ if (!(si->flags & SWP_SOLIDSTATE))
+ spin_unlock(&si->global_cluster_lock);
return found;
}
-static void __del_from_avail_list(struct swap_info_struct *si)
+/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
+static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
{
int nid;
+ unsigned long pages;
+
+ spin_lock(&swap_avail_lock);
+
+ if (swapoff) {
+ /*
+		 * Forcefully remove it. Clear the SWP_WRITEOK flag for
+ * swapoff here so it's synchronized by both si->lock and
+ * swap_avail_lock, to ensure the result can be seen by
+ * add_to_avail_list.
+ */
+ lockdep_assert_held(&si->lock);
+ si->flags &= ~SWP_WRITEOK;
+ atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
+ } else {
+ /*
+ * If not called by swapoff, take it off-list only if it's
+ * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly
+		 * si->inuse_pages == pages); any concurrent slot freeing, or
+		 * the device having already been removed from the plist by
+		 * someone else, will make the cmpxchg below fail.
+ */
+ pages = si->pages;
+ if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
+ pages | SWAP_USAGE_OFFLIST_BIT))
+ goto skip;
+ }
- assert_spin_locked(&si->lock);
for_each_node(nid)
plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
+
+skip:
+ spin_unlock(&swap_avail_lock);
}
-static void del_from_avail_list(struct swap_info_struct *si)
+/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
+static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
{
+ int nid;
+ long val;
+ unsigned long pages;
+
spin_lock(&swap_avail_lock);
- __del_from_avail_list(si);
+
+ /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
+ if (swapon) {
+ lockdep_assert_held(&si->lock);
+ si->flags |= SWP_WRITEOK;
+ } else {
+ if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
+ goto skip;
+ }
+
+ if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
+ goto skip;
+
+ val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
+
+ /*
+	 * When the device is full and on the plist, only one updater will
+	 * see (inuse_pages == si->pages) and call del_from_avail_list. If
+	 * that updater happens to be here, just skip adding.
+ */
+ pages = si->pages;
+ if (val == pages) {
+ /* Just like the cmpxchg in del_from_avail_list */
+ if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
+ pages | SWAP_USAGE_OFFLIST_BIT))
+ goto skip;
+ }
+
+ for_each_node(nid)
+ plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
+
+skip:
spin_unlock(&swap_avail_lock);
}
-static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
- unsigned int nr_entries)
+/*
+ * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock
+ * within each cluster, so the total contribution to the global counter should
+ * always be positive and cannot exceed the total number of usable slots.
+ */
+static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
{
- unsigned int end = offset + nr_entries - 1;
-
- if (offset == si->lowest_bit)
- si->lowest_bit += nr_entries;
- if (end == si->highest_bit)
- WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
- WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
- if (si->inuse_pages == si->pages) {
- si->lowest_bit = si->max;
- si->highest_bit = 0;
- del_from_avail_list(si);
+ long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);
- if (si->cluster_info && vm_swap_full())
- schedule_work(&si->reclaim_work);
+ /*
+ * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set,
+ * remove it from the plist.
+ */
+ if (unlikely(val == si->pages)) {
+ del_from_avail_list(si, false);
+ return true;
}
+
+ return false;
}
-static void add_to_avail_list(struct swap_info_struct *si)
+static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
{
- int nid;
+ long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);
- spin_lock(&swap_avail_lock);
- for_each_node(nid)
- plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
- spin_unlock(&swap_avail_lock);
+ /*
+ * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set,
+ * add it to the plist.
+ */
+ if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
+ add_to_avail_list(si, false);
+}
+
+static void swap_range_alloc(struct swap_info_struct *si,
+ unsigned int nr_entries)
+{
+ if (swap_usage_add(si, nr_entries)) {
+ if (vm_swap_full())
+ schedule_work(&si->reclaim_work);
+ }
}
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
@@ -968,18 +1129,11 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
* Use atomic clear_bit operations only on zeromap instead of non-atomic
* bitmap_clear to prevent adjacent bits corruption due to simultaneous writes.
*/
- for (i = 0; i < nr_entries; i++)
+ for (i = 0; i < nr_entries; i++) {
clear_bit(offset + i, si->zeromap);
-
- if (offset < si->lowest_bit)
- si->lowest_bit = offset;
- if (end > si->highest_bit) {
- bool was_full = !si->highest_bit;
-
- WRITE_ONCE(si->highest_bit, end);
- if (was_full && (si->flags & SWP_WRITEOK))
- add_to_avail_list(si);
+ zswap_invalidate(swp_entry(si->type, offset + i));
}
+
if (si->flags & SWP_BLKDEV)
swap_slot_free_notify =
si->bdev->bd_disk->fops->swap_slot_free_notify;
@@ -999,324 +1153,87 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
*/
smp_wmb();
atomic_long_add(nr_entries, &nr_swap_pages);
- WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
+ swap_usage_sub(si, nr_entries);
}
-static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
+static bool get_swap_device_info(struct swap_info_struct *si)
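The same flag-in-counter idea can be exercised in isolation. Below is a userspace analogue using C11 atomics, with an illustrative capacity and names: arithmetic updates touch only the low bits, and a compare-and-swap ensures exactly one path wins the full-device transition.

/*
 * Userspace sketch of embedding an off-list flag in an atomic usage
 * counter (illustrative names; not the kernel implementation).
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define OFFLIST_BIT	(1L << 30)	/* second highest bit of a 32-bit counter */
#define COUNTER_MASK	(~OFFLIST_BIT)

static atomic_long inuse;		/* low bits: pages in use; bit 30: off-list */
static const long capacity = 1024;	/* total usable pages on the device */

static long usage(void)
{
	return atomic_load(&inuse) & COUNTER_MASK;
}

/* Mirrors the "full -> off-list" transition: only one caller wins the cmpxchg. */
static bool usage_add(long nr)
{
	long val = atomic_fetch_add(&inuse, nr) + nr;

	if (val == capacity) {
		long expected = capacity;
		/* Set the flag only if the counter is still exactly "full". */
		return atomic_compare_exchange_strong(&inuse, &expected,
						      capacity | OFFLIST_BIT);
	}
	return false;
}

/*
 * Freeing while the flag is set means the device must go back on the list;
 * a real implementation would clear OFFLIST_BIT there under the list lock.
 */
static bool usage_sub(long nr)
{
	long val = atomic_fetch_sub(&inuse, nr) - nr;

	return val & OFFLIST_BIT;
}

int main(void)
{
	if (usage_add(capacity))
		printf("device full, taken off-list, usage=%ld\n", usage());
	if (usage_sub(1))
		printf("slot freed, put back on list, usage=%ld\n", usage());
	return 0;
}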
{
- unsigned long prev;
-
- if (!(si->flags & SWP_SOLIDSTATE)) {
- si->cluster_next = next;
- return;
- }
-
- prev = this_cpu_read(*si->cluster_next_cpu);
+ if (!percpu_ref_tryget_live(&si->users))
+ return false;
/*
- * Cross the swap address space size aligned trunk, choose
- * another trunk randomly to avoid lock contention on swap
- * address space if possible.
+	 * Guarantee that si->users is checked before accessing other
+	 * fields of swap_info_struct, and that si->flags (SWP_WRITEOK)
+	 * is up to date.
+ *
+ * Paired with the spin_unlock() after setup_swap_info() in
+ * enable_swap_info(), and smp_wmb() in swapoff.
*/
- if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
- (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
- /* No free swap slots available */
- if (si->highest_bit <= si->lowest_bit)
- return;
- next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit);
- next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
- next = max_t(unsigned int, next, si->lowest_bit);
- }
- this_cpu_write(*si->cluster_next_cpu, next);
-}
-
-static bool swap_offset_available_and_locked(struct swap_info_struct *si,
- unsigned long offset)
-{
- if (data_race(!si->swap_map[offset])) {
- spin_lock(&si->lock);
- return true;
- }
-
- if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
- spin_lock(&si->lock);
- return true;
- }
-
- return false;
-}
-
-static int cluster_alloc_swap(struct swap_info_struct *si,
- unsigned char usage, int nr,
- swp_entry_t slots[], int order)
-{
- int n_ret = 0;
-
- VM_BUG_ON(!si->cluster_info);
-
- si->flags += SWP_SCANNING;
-
- while (n_ret < nr) {
- unsigned long offset = cluster_alloc_swap_entry(si, order, usage);
-
- if (!offset)
- break;
- slots[n_ret++] = swp_entry(si->type, offset);
- }
-
- si->flags -= SWP_SCANNING;
-
- return n_ret;
+ smp_rmb();
+ return true;
}
-static int scan_swap_map_slots(struct swap_info_struct *si,
- unsigned char usage, int nr,
- swp_entry_t slots[], int order)
+/*
+ * Fast path: try to get swap entries with the specified order from the
+ * current CPU's swap entry pool (a cluster).
+ */
+static bool swap_alloc_fast(swp_entry_t *entry,
+ int order)
{
- unsigned long offset;
- unsigned long scan_base;
- unsigned long last_in_cluster = 0;
- int latency_ration = LATENCY_LIMIT;
- unsigned int nr_pages = 1 << order;
- int n_ret = 0;
- bool scanned_many = false;
-
- /*
- * We try to cluster swap pages by allocating them sequentially
- * in swap. Once we've allocated SWAPFILE_CLUSTER pages this
- * way, however, we resort to first-free allocation, starting
- * a new cluster. This prevents us from scattering swap pages
- * all over the entire swap partition, so that we reduce
- * overall disk seek times between swap pages. -- sct
- * But we do now try to find an empty cluster. -Andrea
- * And we let swap pages go all over an SSD partition. Hugh
- */
-
- if (order > 0) {
- /*
- * Should not even be attempting large allocations when huge
- * page swap is disabled. Warn and fail the allocation.
- */
- if (!IS_ENABLED(CONFIG_THP_SWAP) ||
- nr_pages > SWAPFILE_CLUSTER) {
- VM_WARN_ON_ONCE(1);
- return 0;
- }
-
- /*
- * Swapfile is not block device or not using clusters so unable
- * to allocate large entries.
- */
- if (!(si->flags & SWP_BLKDEV) || !si->cluster_info)
- return 0;
- }
-
- if (si->cluster_info)
- return cluster_alloc_swap(si, usage, nr, slots, order);
-
- si->flags += SWP_SCANNING;
-
- /* For HDD, sequential access is more important. */
- scan_base = si->cluster_next;
- offset = scan_base;
-
- if (unlikely(!si->cluster_nr--)) {
- if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
- si->cluster_nr = SWAPFILE_CLUSTER - 1;
- goto checks;
- }
-
- spin_unlock(&si->lock);
-
- /*
- * If seek is expensive, start searching for new cluster from
- * start of partition, to minimize the span of allocated swap.
- */
- scan_base = offset = si->lowest_bit;
- last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
-
- /* Locate the first empty (unaligned) cluster */
- for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) {
- if (si->swap_map[offset])
- last_in_cluster = offset + SWAPFILE_CLUSTER;
- else if (offset == last_in_cluster) {
- spin_lock(&si->lock);
- offset -= SWAPFILE_CLUSTER - 1;
- si->cluster_next = offset;
- si->cluster_nr = SWAPFILE_CLUSTER - 1;
- goto checks;
- }
- if (unlikely(--latency_ration < 0)) {
- cond_resched();
- latency_ration = LATENCY_LIMIT;
- }
- }
-
- offset = scan_base;
- spin_lock(&si->lock);
- si->cluster_nr = SWAPFILE_CLUSTER - 1;
- }
-
-checks:
- if (!(si->flags & SWP_WRITEOK))
- goto no_page;
- if (!si->highest_bit)
- goto no_page;
- if (offset > si->highest_bit)
- scan_base = offset = si->lowest_bit;
-
- /* reuse swap entry of cache-only swap if not busy. */
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
- int swap_was_freed;
- spin_unlock(&si->lock);
- swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT);
- spin_lock(&si->lock);
- /* entry was freed successfully, try to use this again */
- if (swap_was_freed > 0)
- goto checks;
- goto scan; /* check next one */
- }
-
- if (si->swap_map[offset]) {
- if (!n_ret)
- goto scan;
- else
- goto done;
- }
- memset(si->swap_map + offset, usage, nr_pages);
-
- swap_range_alloc(si, offset, nr_pages);
- slots[n_ret++] = swp_entry(si->type, offset);
-
- /* got enough slots or reach max slots? */
- if ((n_ret == nr) || (offset >= si->highest_bit))
- goto done;
-
- /* search for next available slot */
-
- /* time to take a break? */
- if (unlikely(--latency_ration < 0)) {
- if (n_ret)
- goto done;
- spin_unlock(&si->lock);
- cond_resched();
- spin_lock(&si->lock);
- latency_ration = LATENCY_LIMIT;
- }
-
- if (si->cluster_nr && !si->swap_map[++offset]) {
- /* non-ssd case, still more slots in cluster? */
- --si->cluster_nr;
- goto checks;
- }
+ struct swap_cluster_info *ci;
+ struct swap_info_struct *si;
+ unsigned int offset, found = SWAP_ENTRY_INVALID;
/*
- * Even if there's no free clusters available (fragmented),
- * try to scan a little more quickly with lock held unless we
- * have scanned too many slots already.
+	 * Once allocated, a swap_info_struct will never be completely freed,
+	 * so checking its liveness with get_swap_device_info() is enough.
*/
- if (!scanned_many) {
- unsigned long scan_limit;
-
- if (offset < scan_base)
- scan_limit = scan_base;
- else
- scan_limit = si->highest_bit;
- for (; offset <= scan_limit && --latency_ration > 0;
- offset++) {
- if (!si->swap_map[offset])
- goto checks;
- }
- }
-
-done:
- if (order == 0)
- set_cluster_next(si, offset + 1);
- si->flags -= SWP_SCANNING;
- return n_ret;
+ si = this_cpu_read(percpu_swap_cluster.si[order]);
+ offset = this_cpu_read(percpu_swap_cluster.offset[order]);
+ if (!si || !offset || !get_swap_device_info(si))
+ return false;
-scan:
- VM_WARN_ON(order > 0);
- spin_unlock(&si->lock);
- while (++offset <= READ_ONCE(si->highest_bit)) {
- if (unlikely(--latency_ration < 0)) {
- cond_resched();
- latency_ration = LATENCY_LIMIT;
- scanned_many = true;
- }
- if (swap_offset_available_and_locked(si, offset))
- goto checks;
- }
- offset = si->lowest_bit;
- while (offset < scan_base) {
- if (unlikely(--latency_ration < 0)) {
- cond_resched();
- latency_ration = LATENCY_LIMIT;
- scanned_many = true;
- }
- if (swap_offset_available_and_locked(si, offset))
- goto checks;
- offset++;
+ ci = lock_cluster(si, offset);
+ if (cluster_is_usable(ci, order)) {
+ if (cluster_is_empty(ci))
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset, order, SWAP_HAS_CACHE);
+ if (found)
+ *entry = swp_entry(si->type, found);
+ } else {
+ unlock_cluster(ci);
}
- spin_lock(&si->lock);
-no_page:
- si->flags -= SWP_SCANNING;
- return n_ret;
+ put_swap_device(si);
+ return !!found;
}
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
+/* Rotate the device and switch to a new cluster */
+static bool swap_alloc_slow(swp_entry_t *entry,
+ int order)
{
- int order = swap_entry_order(entry_order);
- unsigned long size = 1 << order;
- struct swap_info_struct *si, *next;
- long avail_pgs;
- int n_ret = 0;
int node;
+ unsigned long offset;
+ struct swap_info_struct *si, *next;
+ node = numa_node_id();
spin_lock(&swap_avail_lock);
-
- avail_pgs = atomic_long_read(&nr_swap_pages) / size;
- if (avail_pgs <= 0) {
- spin_unlock(&swap_avail_lock);
- goto noswap;
- }
-
- n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
-
- atomic_long_sub(n_goal * size, &nr_swap_pages);
-
start_over:
- node = numa_node_id();
plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
- /* requeue si to after same-priority siblings */
+ /* Rotate the device and switch to a new cluster */
plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock);
- spin_lock(&si->lock);
- if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
- spin_lock(&swap_avail_lock);
- if (plist_node_empty(&si->avail_lists[node])) {
- spin_unlock(&si->lock);
- goto nextsi;
+ if (get_swap_device_info(si)) {
+ offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
+ put_swap_device(si);
+ if (offset) {
+ *entry = swp_entry(si->type, offset);
+ return true;
}
- WARN(!si->highest_bit,
- "swap_info %d in list but !highest_bit\n",
- si->type);
- WARN(!(si->flags & SWP_WRITEOK),
- "swap_info %d in list but !SWP_WRITEOK\n",
- si->type);
- __del_from_avail_list(si);
- spin_unlock(&si->lock);
- goto nextsi;
+ if (order)
+ return false;
}
- n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
- n_goal, swp_entries, order);
- spin_unlock(&si->lock);
- if (n_ret || size > 1)
- goto check_out;
- cond_resched();
spin_lock(&swap_avail_lock);
-nextsi:
/*
* if we got here, it's likely that si was almost full before,
* and since scan_swap_map_slots() can drop the si->lock,
@@ -1331,15 +1248,77 @@ nextsi:
if (plist_node_empty(&next->avail_lists[node]))
goto start_over;
}
-
spin_unlock(&swap_avail_lock);
+ return false;
+}
+
+/**
+ * folio_alloc_swap - allocate swap space for a folio
+ * @folio: folio we want to move to swap
+ * @gfp: gfp mask for shadow nodes
+ *
+ * Allocate swap space for the folio and add the folio to the
+ * swap cache.
+ *
+ * Context: Caller needs to hold the folio lock.
+ * Return: 0 on success, or a negative error code if the folio could not be
+ * added to the swap cache.
+ */
+int folio_alloc_swap(struct folio *folio, gfp_t gfp)
+{
+ unsigned int order = folio_order(folio);
+ unsigned int size = 1 << order;
+ swp_entry_t entry = {};
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
+
+ if (order) {
+ /*
+		 * Reject large allocations when THP_SWAP is disabled;
+		 * the caller should split the folio and try again.
+ */
+ if (!IS_ENABLED(CONFIG_THP_SWAP))
+ return -EAGAIN;
+
+ /*
+ * Allocation size should never exceed cluster size
+ * (HPAGE_PMD_SIZE).
+ */
+ if (size > SWAPFILE_CLUSTER) {
+ VM_WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+ }
+
+ local_lock(&percpu_swap_cluster.lock);
+ if (!swap_alloc_fast(&entry, order))
+ swap_alloc_slow(&entry, order);
+ local_unlock(&percpu_swap_cluster.lock);
+
+ /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
+ if (mem_cgroup_try_charge_swap(folio, entry))
+ goto out_free;
-check_out:
- if (n_ret < n_goal)
- atomic_long_add((long)(n_goal - n_ret) * size,
- &nr_swap_pages);
-noswap:
- return n_ret;
+ if (!entry.val)
+ return -ENOMEM;
+
+ /*
+ * XArray node allocations from PF_MEMALLOC contexts could
+ * completely exhaust the page allocator. __GFP_NOMEMALLOC
+ * stops emergency reserves from being allocated.
+ *
+ * TODO: this could cause a theoretical memory reclaim
+ * deadlock in the swap out path.
+ */
+ if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL))
+ goto out_free;
+
+ atomic_long_sub(size, &nr_swap_pages);
+ return 0;
+
+out_free:
+ put_swap_folio(folio, entry);
+ return -ENOMEM;
}
static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
@@ -1376,26 +1355,12 @@ out:
return NULL;
}
-static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
- struct swap_info_struct *q)
-{
- struct swap_info_struct *p;
-
- p = _swap_info_get(entry);
-
- if (p != q) {
- if (q != NULL)
- spin_unlock(&q->lock);
- if (p != NULL)
- spin_lock(&p->lock);
- }
- return p;
-}
-
-static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
- unsigned long offset,
- unsigned char usage)
+static unsigned char swap_entry_put_locked(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry,
+ unsigned char usage)
{
+ unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
@@ -1427,7 +1392,7 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
if (usage)
WRITE_ONCE(si->swap_map[offset], usage);
else
- WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE);
+ swap_entries_free(si, ci, entry, 1);
return usage;
}
@@ -1481,16 +1446,8 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
si = swp_swap_info(entry);
if (!si)
goto bad_nofile;
- if (!percpu_ref_tryget_live(&si->users))
+ if (!get_swap_device_info(si))
goto out;
- /*
- * Guarantee the si->users are checked before accessing other
- * fields of swap_info_struct.
- *
- * Paired with the spin_unlock() after setup_swap_info() in
- * enable_swap_info().
- */
- smp_rmb();
offset = swp_offset(entry);
if (offset >= si->max)
goto put_out;
@@ -1506,121 +1463,127 @@ put_out:
return NULL;
}
-static unsigned char __swap_entry_free(struct swap_info_struct *si,
- swp_entry_t entry)
+static void swap_entries_put_cache(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
{
- struct swap_cluster_info *ci;
unsigned long offset = swp_offset(entry);
- unsigned char usage;
-
- ci = lock_cluster_or_swap_info(si, offset);
- usage = __swap_entry_free_locked(si, offset, 1);
- unlock_cluster_or_swap_info(si, ci);
- if (!usage)
- free_swap_slot(entry);
+ struct swap_cluster_info *ci;
- return usage;
+ ci = lock_cluster(si, offset);
+ if (swap_only_has_cache(si, offset, nr))
+ swap_entries_free(si, ci, entry, nr);
+ else {
+ for (int i = 0; i < nr; i++, entry.val++)
+ swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
+ }
+ unlock_cluster(ci);
}
-static bool __swap_entries_free(struct swap_info_struct *si,
- swp_entry_t entry, int nr)
+static bool swap_entries_put_map(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
{
unsigned long offset = swp_offset(entry);
- unsigned int type = swp_type(entry);
struct swap_cluster_info *ci;
bool has_cache = false;
unsigned char count;
int i;
- if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1)
+ if (nr <= 1)
goto fallback;
- /* cross into another cluster */
- if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
+ count = swap_count(data_race(si->swap_map[offset]));
+ if (count != 1 && count != SWAP_MAP_SHMEM)
goto fallback;
- ci = lock_cluster_or_swap_info(si, offset);
+ ci = lock_cluster(si, offset);
if (!swap_is_last_map(si, offset, nr, &has_cache)) {
- unlock_cluster_or_swap_info(si, ci);
- goto fallback;
+ goto locked_fallback;
}
- for (i = 0; i < nr; i++)
- WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
- unlock_cluster_or_swap_info(si, ci);
-
- if (!has_cache) {
+ if (!has_cache)
+ swap_entries_free(si, ci, entry, nr);
+ else
for (i = 0; i < nr; i++)
- zswap_invalidate(swp_entry(si->type, offset + i));
- spin_lock(&si->lock);
- swap_entry_range_free(si, entry, nr);
- spin_unlock(&si->lock);
- }
+ WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
+ unlock_cluster(ci);
+
return has_cache;
fallback:
- for (i = 0; i < nr; i++) {
- if (data_race(si->swap_map[offset + i])) {
- count = __swap_entry_free(si, swp_entry(type, offset + i));
- if (count == SWAP_HAS_CACHE)
- has_cache = true;
- } else {
- WARN_ON_ONCE(1);
- }
+ ci = lock_cluster(si, offset);
+locked_fallback:
+ for (i = 0; i < nr; i++, entry.val++) {
+ count = swap_entry_put_locked(si, ci, entry, 1);
+ if (count == SWAP_HAS_CACHE)
+ has_cache = true;
+ }
+ unlock_cluster(ci);
+ return has_cache;
+
+}
+
+/*
+ * Only functions with an "_nr" suffix are able to free entries spanning
+ * multiple clusters, so ensure the range is within a single cluster when
+ * freeing entries with functions that lack the "_nr" suffix.
+ */
+static bool swap_entries_put_map_nr(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
+{
+ int cluster_nr, cluster_rest;
+ unsigned long offset = swp_offset(entry);
+ bool has_cache = false;
+
+ cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER;
+ while (nr) {
+ cluster_nr = min(nr, cluster_rest);
+ has_cache |= swap_entries_put_map(si, entry, cluster_nr);
+ cluster_rest = SWAPFILE_CLUSTER;
+ nr -= cluster_nr;
+ entry.val += cluster_nr;
}
+
return has_cache;
}
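swap_entries_put_map_nr() above walks the range in cluster-sized steps so that swap_entries_put_map() never sees a span crossing a cluster boundary. A standalone sketch of that chunking, with an illustrative cluster size:

/*
 * Standalone sketch of the chunking above: split [offset, offset + nr) so
 * that no chunk crosses a cluster boundary.
 */
#include <stdio.h>

#define CLUSTER_SIZE	512UL

static void split_by_cluster(unsigned long offset, unsigned long nr)
{
	unsigned long rest = CLUSTER_SIZE - offset % CLUSTER_SIZE;

	while (nr) {
		unsigned long chunk = nr < rest ? nr : rest;

		printf("free [%lu, %lu)\n", offset, offset + chunk);
		offset += chunk;
		nr -= chunk;
		rest = CLUSTER_SIZE;	/* later chunks start cluster-aligned */
	}
}

int main(void)
{
	/* offset 510, nr 1030 -> [510,512) [512,1024) [1024,1536) [1536,1540) */
	split_by_cluster(510, 1030);
	return 0;
}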
/*
- * Drop the last HAS_CACHE flag of swap entries, caller have to
- * ensure all entries belong to the same cgroup.
+ * Check if this is the last ref of a swap entry in the freeing path.
+ * Qualified values are 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM.
+ */
+static inline bool __maybe_unused swap_is_last_ref(unsigned char count)
+{
+ return (count == SWAP_HAS_CACHE) || (count == 1) ||
+ (count == SWAP_MAP_SHMEM);
+}
+
+/*
+ * Drop the last ref of swap entries; the caller has to ensure all entries
+ * belong to the same cgroup and cluster.
*/
-static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
- unsigned int nr_pages)
+static void swap_entries_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr_pages)
{
unsigned long offset = swp_offset(entry);
unsigned char *map = si->swap_map + offset;
unsigned char *map_end = map + nr_pages;
- struct swap_cluster_info *ci;
- ci = lock_cluster(si, offset);
+ /* It should never free entries across different clusters */
+ VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1));
+ VM_BUG_ON(cluster_is_empty(ci));
+ VM_BUG_ON(ci->count < nr_pages);
+
+ ci->count -= nr_pages;
do {
- VM_BUG_ON(*map != SWAP_HAS_CACHE);
+ VM_BUG_ON(!swap_is_last_ref(*map));
*map = 0;
} while (++map < map_end);
- dec_cluster_info_page(si, ci, nr_pages);
- unlock_cluster(ci);
mem_cgroup_uncharge_swap(entry, nr_pages);
swap_range_free(si, offset, nr_pages);
-}
-
-static void cluster_swap_free_nr(struct swap_info_struct *si,
- unsigned long offset, int nr_pages,
- unsigned char usage)
-{
- struct swap_cluster_info *ci;
- DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 };
- int i, nr;
- ci = lock_cluster_or_swap_info(si, offset);
- while (nr_pages) {
- nr = min(BITS_PER_LONG, nr_pages);
- for (i = 0; i < nr; i++) {
- if (!__swap_entry_free_locked(si, offset + i, usage))
- bitmap_set(to_free, i, 1);
- }
- if (!bitmap_empty(to_free, BITS_PER_LONG)) {
- unlock_cluster_or_swap_info(si, ci);
- for_each_set_bit(i, to_free, BITS_PER_LONG)
- free_swap_slot(swp_entry(si->type, offset + i));
- if (nr == nr_pages)
- return;
- bitmap_clear(to_free, 0, BITS_PER_LONG);
- ci = lock_cluster_or_swap_info(si, offset);
- }
- offset += nr;
- nr_pages -= nr;
- }
- unlock_cluster_or_swap_info(si, ci);
+ if (!ci->count)
+ free_cluster(si, ci);
+ else
+ partial_free_cluster(si, ci);
}
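A simplified model of the bookkeeping swap_entries_free() performs once its caller holds the cluster lock: drop the per-cluster usage count, clear the map bytes for the range, then either free the cluster or keep it on a partially-free list. The struct layout and the printf placeholders below are invented for illustration only.

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define CLUSTER_SIZE 512	/* illustrative cluster size */

struct cluster {
	unsigned char map[CLUSTER_SIZE];	/* per-entry usage byte */
	unsigned int count;			/* entries still in use */
};

/* Rough analogue of swap_entries_free(): the caller validated the range. */
static void cluster_free_entries(struct cluster *ci, unsigned int off,
				 unsigned int nr)
{
	assert(off + nr <= CLUSTER_SIZE);
	assert(ci->count >= nr);

	ci->count -= nr;
	memset(ci->map + off, 0, nr);

	if (!ci->count)
		printf("cluster empty: move it to the free list\n");
	else
		printf("cluster partial: %u entries left, keep it on the partial list\n",
		       ci->count);
}

int main(void)
{
	struct cluster c = { .count = 3 };

	memset(c.map, 1, 3);
	cluster_free_entries(&c, 0, 2);	/* still partially used */
	cluster_free_entries(&c, 2, 1);	/* now empty */
	return 0;
}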
/*
@@ -1639,7 +1602,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
while (nr_pages) {
nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
- cluster_swap_free_nr(sis, offset, nr, 1);
+ swap_entries_put_map(sis, swp_entry(sis->type, offset), nr);
offset += nr;
nr_pages -= nr;
}
@@ -1650,8 +1613,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
*/
void put_swap_folio(struct folio *folio, swp_entry_t entry)
{
- unsigned long offset = swp_offset(entry);
- struct swap_cluster_info *ci;
struct swap_info_struct *si;
int size = 1 << swap_entry_order(folio_order(folio));
@@ -1659,59 +1620,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
if (!si)
return;
- ci = lock_cluster_or_swap_info(si, offset);
- if (size > 1 && swap_is_has_cache(si, offset, size)) {
- unlock_cluster_or_swap_info(si, ci);
- spin_lock(&si->lock);
- swap_entry_range_free(si, entry, size);
- spin_unlock(&si->lock);
- return;
- }
- for (int i = 0; i < size; i++, entry.val++) {
- if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
- unlock_cluster_or_swap_info(si, ci);
- free_swap_slot(entry);
- if (i == size - 1)
- return;
- lock_cluster_or_swap_info(si, offset);
- }
- }
- unlock_cluster_or_swap_info(si, ci);
-}
-
-static int swp_entry_cmp(const void *ent1, const void *ent2)
-{
- const swp_entry_t *e1 = ent1, *e2 = ent2;
-
- return (int)swp_type(*e1) - (int)swp_type(*e2);
-}
-
-void swapcache_free_entries(swp_entry_t *entries, int n)
-{
- struct swap_info_struct *p, *prev;
- int i;
-
- if (n <= 0)
- return;
-
- prev = NULL;
- p = NULL;
-
- /*
- * Sort swap entries by swap device, so each lock is only taken once.
- * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
- * so low that it isn't necessary to optimize further.
- */
- if (nr_swapfiles > 1)
- sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
- for (i = 0; i < n; ++i) {
- p = swap_info_get_cont(entries[i], prev);
- if (p)
- swap_entry_range_free(p, entries[i], 1);
- prev = p;
- }
- if (p)
- spin_unlock(&p->lock);
+ swap_entries_put_cache(si, entry, size);
}
int __swap_count(swp_entry_t entry)
@@ -1727,16 +1636,16 @@ int __swap_count(swp_entry_t entry)
* This does not give an exact answer when swap count is continued,
* but does include the high COUNT_CONTINUED flag to allow for that.
*/
-int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
+bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
{
pgoff_t offset = swp_offset(entry);
struct swap_cluster_info *ci;
int count;
- ci = lock_cluster_or_swap_info(si, offset);
+ ci = lock_cluster(si, offset);
count = swap_count(si->swap_map[offset]);
- unlock_cluster_or_swap_info(si, ci);
- return count;
+ unlock_cluster(ci);
+ return !!count;
}
/*
@@ -1758,7 +1667,7 @@ int swp_swapcount(swp_entry_t entry)
offset = swp_offset(entry);
- ci = lock_cluster_or_swap_info(si, offset);
+ ci = lock_cluster(si, offset);
count = swap_count(si->swap_map[offset]);
if (!(count & COUNT_CONTINUED))
@@ -1781,7 +1690,7 @@ int swp_swapcount(swp_entry_t entry)
n *= (SWAP_CONT_MAX + 1);
} while (tmp_count & COUNT_CONTINUED);
out:
- unlock_cluster_or_swap_info(si, ci);
+ unlock_cluster(ci);
return count;
}
@@ -1796,8 +1705,8 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
int i;
bool ret = false;
- ci = lock_cluster_or_swap_info(si, offset);
- if (!ci || nr_pages == 1) {
+ ci = lock_cluster(si, offset);
+ if (nr_pages == 1) {
if (swap_count(map[roffset]))
ret = true;
goto unlock_out;
@@ -1809,7 +1718,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
}
}
unlock_out:
- unlock_cluster_or_swap_info(si, ci);
+ unlock_cluster(ci);
return ret;
}
@@ -1822,7 +1731,7 @@ static bool folio_swapped(struct folio *folio)
return false;
if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
- return swap_swapcount(si, entry) != 0;
+ return swap_entry_swapped(si, entry);
return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
}
@@ -1896,9 +1805,6 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
bool any_only_cache = false;
unsigned long offset;
- if (non_swap_entry(entry))
- return;
-
si = get_swap_device(entry);
if (!si)
return;
@@ -1909,7 +1815,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
/*
* First free all entries in the range.
*/
- any_only_cache = __swap_entries_free(si, entry, nr);
+ any_only_cache = swap_entries_put_map_nr(si, entry, nr);
/*
* Short-circuit the below loop if none of the entries had their
@@ -1919,13 +1825,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
goto out;
/*
- * Now go back over the range trying to reclaim the swap cache. This is
- * more efficient for large folios because we will only try to reclaim
- * the swap once per folio in the common case. If we do
- * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
- * latter will get a reference and lock the folio for every individual
- * page but will only succeed once the swap slot for every subpage is
- * zero.
+ * Now go back over the range trying to reclaim the swap cache.
*/
for (offset = start_offset; offset < end_offset; offset += nr) {
nr = 1;
@@ -1957,16 +1857,23 @@ out:
swp_entry_t get_swap_page_of_type(int type)
{
struct swap_info_struct *si = swap_type_to_swap_info(type);
+ unsigned long offset;
swp_entry_t entry = {0};
if (!si)
goto fail;
/* This is called for allocating swap entry, not cache */
- spin_lock(&si->lock);
- if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
- atomic_long_dec(&nr_swap_pages);
- spin_unlock(&si->lock);
+ if (get_swap_device_info(si)) {
+ if (si->flags & SWP_WRITEOK) {
+ offset = cluster_alloc_swap_entry(si, 0, 1);
+ if (offset) {
+ entry = swp_entry(si->type, offset);
+ atomic_long_dec(&nr_swap_pages);
+ }
+ }
+ put_swap_device(si);
+ }
fail:
return entry;
}
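get_swap_page_of_type() now follows the same get/put pattern as the other allocation paths: take a reference on the device, check SWP_WRITEOK, allocate, and drop the reference, rather than holding si->lock across the scan. The sketch below shows only the generic "increment-unless-zero" reference guard such a get/put scheme relies on; the struct, field names and placeholder allocation are assumptions for the example, not the kernel's helpers.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Invented stand-in for a swap device with a liveness refcount. */
struct swapdev {
	atomic_uint users;	/* 0 means the device is going away */
	bool writeok;
};

/* Take a reference only if the device is still live (users != 0). */
static bool dev_tryget(struct swapdev *d)
{
	unsigned int old = atomic_load(&d->users);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&d->users, &old, old + 1))
			return true;
	}
	return false;
}

static void dev_put(struct swapdev *d)
{
	atomic_fetch_sub(&d->users, 1);
}

static long alloc_slot(struct swapdev *d)
{
	long slot = -1;

	if (!dev_tryget(d))
		return -1;	/* device already offline */
	if (d->writeok)
		slot = 42;	/* placeholder for the real cluster allocation */
	dev_put(d);
	return slot;
}

int main(void)
{
	struct swapdev d = { .users = 1, .writeok = true };

	printf("slot = %ld\n", alloc_slot(&d));
	return 0;
}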
@@ -2057,7 +1964,7 @@ unsigned int count_swap_pages(int type, int free)
if (sis->flags & SWP_WRITEOK) {
n = sis->pages;
if (free)
- n -= sis->inuse_pages;
+ n -= swap_usage_in_pages(sis);
}
spin_unlock(&sis->lock);
}
@@ -2392,7 +2299,7 @@ static int try_to_unuse(unsigned int type)
swp_entry_t entry;
unsigned int i;
- if (!READ_ONCE(si->inuse_pages))
+ if (!swap_usage_in_pages(si))
goto success;
retry:
@@ -2405,7 +2312,7 @@ retry:
spin_lock(&mmlist_lock);
p = &init_mm.mmlist;
- while (READ_ONCE(si->inuse_pages) &&
+ while (swap_usage_in_pages(si) &&
!signal_pending(current) &&
(p = p->next) != &init_mm.mmlist) {
@@ -2433,7 +2340,7 @@ retry:
mmput(prev_mm);
i = 0;
- while (READ_ONCE(si->inuse_pages) &&
+ while (swap_usage_in_pages(si) &&
!signal_pending(current) &&
(i = find_next_to_unuse(si, i)) != 0) {
@@ -2464,11 +2371,11 @@ retry:
* Limit the number of retries? No: when mmget_not_zero()
* above fails, that mm is likely to be freeing swap from
* exit_mmap(), which proceeds at its own independent pace;
- * and even shmem_writepage() could have been preempted after
+ * and even shmem_writeout() could have been preempted after
* folio_alloc_swap(), temporarily hiding that swap. It's easy
* and robust (though cpu-intensive) just to keep retrying.
*/
- if (READ_ONCE(si->inuse_pages)) {
+ if (swap_usage_in_pages(si)) {
if (!signal_pending(current))
goto retry;
return -EINTR;
@@ -2495,7 +2402,7 @@ static void drain_mmlist(void)
unsigned int type;
for (type = 0; type < nr_swapfiles; type++)
- if (swap_info[type]->inuse_pages)
+ if (swap_usage_in_pages(swap_info[type]))
return;
spin_lock(&mmlist_lock);
list_for_each_safe(p, next, &init_mm.mmlist)
@@ -2674,7 +2581,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio,
static void _enable_swap_info(struct swap_info_struct *si)
{
- si->flags |= SWP_WRITEOK;
atomic_long_add(si->pages, &nr_swap_pages);
total_swap_pages += si->pages;
@@ -2691,9 +2597,8 @@ static void _enable_swap_info(struct swap_info_struct *si)
*/
plist_add(&si->list, &swap_active_head);
- /* add to available list iff swap device is not full */
- if (si->highest_bit)
- add_to_avail_list(si);
+ /* Add back to available list */
+ add_to_avail_list(si, true);
}
static void enable_swap_info(struct swap_info_struct *si, int prio,
@@ -2727,21 +2632,46 @@ static void reinsert_swap_info(struct swap_info_struct *si)
spin_unlock(&swap_lock);
}
-static bool __has_usable_swap(void)
+/*
+ * Called after clearing SWP_WRITEOK; this ensures cluster_alloc_range()
+ * sees the updated flags, so there will be no more allocations.
+ */
+static void wait_for_allocation(struct swap_info_struct *si)
{
- return !plist_head_empty(&swap_active_head);
+ unsigned long offset;
+ unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER);
+ struct swap_cluster_info *ci;
+
+ BUG_ON(si->flags & SWP_WRITEOK);
+
+ for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
+ ci = lock_cluster(si, offset);
+ unlock_cluster(ci);
+ }
}
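The idea behind wait_for_allocation() can be modelled in userspace: once the flag has been cleared, taking and dropping every per-bucket lock guarantees that any allocator which observed the stale flag (and therefore held one of those locks) has finished, while later allocators re-check the flag under the lock and back off. All names, the bucket count and the flag-checking allocator below are illustrative assumptions rather than the kernel's code.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_BUCKETS 8	/* illustrative; stands in for the cluster array */

static pthread_mutex_t bucket_lock[NR_BUCKETS];
static atomic_bool write_ok = true;

/* Allocator side: the flag is only acted upon while holding a bucket lock. */
static bool try_alloc(int bucket)
{
	bool ok;

	pthread_mutex_lock(&bucket_lock[bucket]);
	ok = atomic_load(&write_ok);
	/* ... allocate from this bucket only if ok ... */
	pthread_mutex_unlock(&bucket_lock[bucket]);
	return ok;
}

/*
 * Teardown side: with the flag already cleared, take and drop every bucket
 * lock. Anyone who saw the old flag value was inside one of these critical
 * sections, so after the loop no allocation based on stale state remains
 * in flight.
 */
static void wait_for_allocation(void)
{
	for (int i = 0; i < NR_BUCKETS; i++) {
		pthread_mutex_lock(&bucket_lock[i]);
		pthread_mutex_unlock(&bucket_lock[i]);
	}
}

int main(void)
{
	for (int i = 0; i < NR_BUCKETS; i++)
		pthread_mutex_init(&bucket_lock[i], NULL);

	printf("alloc while enabled: %d\n", try_alloc(0));
	atomic_store(&write_ok, false);		/* analogue of clearing SWP_WRITEOK */
	wait_for_allocation();
	printf("alloc after teardown: %d\n", try_alloc(0));
	return 0;
}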
-bool has_usable_swap(void)
+/*
+ * Called after the swap device's reference count is dead, so
+ * neither scan nor allocation will use it.
+ */
+static void flush_percpu_swap_cluster(struct swap_info_struct *si)
{
- bool ret;
+ int cpu, i;
+ struct swap_info_struct **pcp_si;
- spin_lock(&swap_lock);
- ret = __has_usable_swap();
- spin_unlock(&swap_lock);
- return ret;
+ for_each_possible_cpu(cpu) {
+ pcp_si = per_cpu_ptr(percpu_swap_cluster.si, cpu);
+ /*
+ * Invalidate the percpu swap cluster cache; si->users
+ * is dead, so no new user will point to it. Just flush
+ * any existing users.
+ */
+ for (i = 0; i < SWAP_NR_ORDERS; i++)
+ cmpxchg(&pcp_si[i], si, NULL);
+ }
}
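flush_percpu_swap_cluster() clears each CPU's cached pointer with cmpxchg() so that only slots still pointing at the dying device are touched, never a slot another CPU has since repointed elsewhere. The same compare-and-swap idiom in portable C11, with a plain array standing in for the real per-CPU data and invented names throughout:

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

#define NR_CPUS 4

struct swapdev { int id; };

/* Stand-in for the per-CPU cached device pointers. */
static _Atomic(struct swapdev *) cached[NR_CPUS];

/* Clear every slot that still caches @dying; leave other slots alone. */
static void flush_cached_dev(struct swapdev *dying)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		struct swapdev *expected = dying;

		/* Equivalent of cmpxchg(&slot, dying, NULL). */
		atomic_compare_exchange_strong(&cached[cpu], &expected, NULL);
	}
}

int main(void)
{
	struct swapdev a = { .id = 1 }, b = { .id = 2 };

	atomic_store(&cached[0], &a);
	atomic_store(&cached[1], &b);
	flush_cached_dev(&a);

	printf("cpu0=%p cpu1=%p\n",
	       (void *)atomic_load(&cached[0]),
	       (void *)atomic_load(&cached[1]));
	return 0;
}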
+
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
@@ -2791,7 +2721,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
goto out_dput;
}
spin_lock(&p->lock);
- del_from_avail_list(p);
+ del_from_avail_list(p, true);
if (p->prio < 0) {
struct swap_info_struct *si = p;
int nid;
@@ -2809,11 +2739,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
plist_del(&p->list, &swap_active_head);
atomic_long_sub(p->pages, &nr_swap_pages);
total_swap_pages -= p->pages;
- p->flags &= ~SWP_WRITEOK;
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
- disable_swap_slots_cache_lock();
+ wait_for_allocation(p);
set_current_oom_origin();
err = try_to_unuse(p->type);
@@ -2822,12 +2751,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (err) {
/* re-insert swap space back into swap_list */
reinsert_swap_info(p);
- reenable_swap_slots_cache_unlock();
goto out_dput;
}
- reenable_swap_slots_cache_unlock();
-
/*
* Wait for swap operations protected by get/put_swap_device()
* to complete. Because of synchronize_rcu() here, all swap
@@ -2842,6 +2768,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
flush_work(&p->discard_work);
flush_work(&p->reclaim_work);
+ flush_percpu_swap_cluster(p);
destroy_swap_extents(p);
if (p->flags & SWP_CONTINUED)
@@ -2855,16 +2782,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_lock(&p->lock);
drain_mmlist();
- /* wait for anyone still in scan_swap_map_slots */
- p->highest_bit = 0; /* cuts scans short */
- while (p->flags >= SWP_SCANNING) {
- spin_unlock(&p->lock);
- spin_unlock(&swap_lock);
- schedule_timeout_uninterruptible(1);
- spin_lock(&swap_lock);
- spin_lock(&p->lock);
- }
-
swap_file = p->swap_file;
p->swap_file = NULL;
p->max = 0;
@@ -2879,10 +2796,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
arch_swap_invalidate_area(p->type);
zswap_swapoff(p->type);
mutex_unlock(&swapon_mutex);
- free_percpu(p->percpu_cluster);
- p->percpu_cluster = NULL;
- free_percpu(p->cluster_next_cpu);
- p->cluster_next_cpu = NULL;
+ kfree(p->global_cluster);
+ p->global_cluster = NULL;
vfree(swap_map);
kvfree(zeromap);
kvfree(cluster_info);
@@ -2992,7 +2907,7 @@ static int swap_show(struct seq_file *swap, void *v)
}
bytes = K(si->pages);
- inuse = K(READ_ONCE(si->inuse_pages));
+ inuse = K(swap_usage_in_pages(si));
file = si->swap_file;
len = seq_file_path(swap, file, " \t\n\\");
@@ -3109,6 +3024,7 @@ static struct swap_info_struct *alloc_swap_info(void)
}
spin_lock_init(&p->lock);
spin_lock_init(&p->cont_lock);
+ atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT);
init_completion(&p->comp);
return p;
@@ -3193,10 +3109,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
return 0;
}
- si->lowest_bit = 1;
- si->cluster_next = 1;
- si->cluster_nr = 0;
-
maxpages = swapfile_maximum_size;
last_page = swap_header->info.last_page;
if (!last_page) {
@@ -3213,7 +3125,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
if ((unsigned int)maxpages == 0)
maxpages = UINT_MAX;
}
- si->highest_bit = maxpages - 1;
if (!maxpages)
return 0;
@@ -3230,13 +3141,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
return maxpages;
}
-#define SWAP_CLUSTER_INFO_COLS \
- DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
-#define SWAP_CLUSTER_SPACE_COLS \
- DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
-#define SWAP_CLUSTER_COLS \
- max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
-
static int setup_swap_map_and_extents(struct swap_info_struct *si,
union swap_header *swap_header,
unsigned char *swap_map,
@@ -3276,15 +3180,21 @@ static int setup_swap_map_and_extents(struct swap_info_struct *si,
return nr_extents;
}
+#define SWAP_CLUSTER_INFO_COLS \
+ DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
+#define SWAP_CLUSTER_SPACE_COLS \
+ DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
+#define SWAP_CLUSTER_COLS \
+ max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
+
static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
union swap_header *swap_header,
unsigned long maxpages)
{
unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
- unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
struct swap_cluster_info *cluster_info;
- unsigned long i, j, k, idx;
- int cpu, err = -ENOMEM;
+ unsigned long i, j, idx;
+ int err = -ENOMEM;
cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
if (!cluster_info)
@@ -3293,25 +3203,14 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < nr_clusters; i++)
spin_lock_init(&cluster_info[i].lock);
- si->cluster_next_cpu = alloc_percpu(unsigned int);
- if (!si->cluster_next_cpu)
- goto err_free;
-
- /* Random start position to help with wear leveling */
- for_each_possible_cpu(cpu)
- per_cpu(*si->cluster_next_cpu, cpu) =
- get_random_u32_inclusive(1, si->highest_bit);
-
- si->percpu_cluster = alloc_percpu(struct percpu_cluster);
- if (!si->percpu_cluster)
- goto err_free;
-
- for_each_possible_cpu(cpu) {
- struct percpu_cluster *cluster;
-
- cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+ if (!(si->flags & SWP_SOLIDSTATE)) {
+ si->global_cluster = kmalloc(sizeof(*si->global_cluster),
+ GFP_KERNEL);
+ if (!si->global_cluster)
+ goto err_free;
for (i = 0; i < SWAP_NR_ORDERS; i++)
- cluster->next[i] = SWAP_NEXT_INVALID;
+ si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
+ spin_lock_init(&si->global_cluster_lock);
}
/*
@@ -3335,15 +3234,14 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < SWAP_NR_ORDERS; i++) {
INIT_LIST_HEAD(&si->nonfull_clusters[i]);
INIT_LIST_HEAD(&si->frag_clusters[i]);
- si->frag_cluster_nr[i] = 0;
+ atomic_long_set(&si->frag_cluster_nr[i], 0);
}
/*
* Reduce false cache line sharing between cluster_info and
* sharing same address space.
*/
- for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
- j = (k + col) % SWAP_CLUSTER_COLS;
+ for (j = 0; j < SWAP_CLUSTER_COLS; j++) {
for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
struct swap_cluster_info *ci;
idx = i * SWAP_CLUSTER_COLS + j;
@@ -3437,6 +3335,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
/*
+ * The swap subsystem needs a major overhaul to support this.
+ * It doesn't work yet so just disable it for now.
+ */
+ if (mapping_min_folio_order(mapping) > 0) {
+ error = -EINVAL;
+ goto bad_swap_unlock_inode;
+ }
+
+ /*
* Read the swap header.
*/
if (!mapping->a_ops->read_folio) {
@@ -3493,18 +3400,18 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (si->bdev && bdev_nonrot(si->bdev)) {
si->flags |= SWP_SOLIDSTATE;
-
- cluster_info = setup_clusters(si, swap_header, maxpages);
- if (IS_ERR(cluster_info)) {
- error = PTR_ERR(cluster_info);
- cluster_info = NULL;
- goto bad_swap_unlock_inode;
- }
} else {
atomic_inc(&nr_rotate_swap);
inced_nr_rotate_swap = true;
}
+ cluster_info = setup_clusters(si, swap_header, maxpages);
+ if (IS_ERR(cluster_info)) {
+ error = PTR_ERR(cluster_info);
+ cluster_info = NULL;
+ goto bad_swap_unlock_inode;
+ }
+
if ((swap_flags & SWAP_FLAG_DISCARD) &&
si->bdev && bdev_max_discard_sectors(si->bdev)) {
/*
@@ -3558,8 +3465,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
mutex_lock(&swapon_mutex);
prio = -1;
if (swap_flags & SWAP_FLAG_PREFER)
- prio =
- (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
+ prio = swap_flags & SWAP_FLAG_PRIO_MASK;
enable_swap_info(si, prio, swap_map, cluster_info, zeromap);
pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n",
@@ -3583,10 +3489,8 @@ free_swap_address_space:
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
- free_percpu(si->percpu_cluster);
- si->percpu_cluster = NULL;
- free_percpu(si->cluster_next_cpu);
- si->cluster_next_cpu = NULL;
+ kfree(si->global_cluster);
+ si->global_cluster = NULL;
inode = NULL;
destroy_swap_extents(si);
swap_cgroup_swapoff(si->type);
@@ -3608,8 +3512,6 @@ out:
putname(name);
if (inode)
inode_unlock(inode);
- if (!error)
- enable_swap_slots_cache();
return error;
}
@@ -3623,7 +3525,7 @@ void si_swapinfo(struct sysinfo *val)
struct swap_info_struct *si = swap_info[type];
if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
- nr_to_be_unused += READ_ONCE(si->inuse_pages);
+ nr_to_be_unused += swap_usage_in_pages(si);
}
val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
val->totalswap = total_swap_pages + nr_to_be_unused;
@@ -3636,7 +3538,6 @@ void si_swapinfo(struct sysinfo *val)
* Returns error code in following case.
* - success -> 0
* - swp_entry is invalid -> EINVAL
- * - swp_entry is migration entry -> EINVAL
* - swap-cache reference is requested but there is already one. -> EEXIST
* - swap-cache reference is requested but the entry is not used. -> ENOENT
* - swap-mapped reference requested but needs continued swap count. -> ENOMEM
@@ -3651,11 +3552,15 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
int err, i;
si = swp_swap_info(entry);
+ if (WARN_ON_ONCE(!si)) {
+ pr_err("%s%08lx\n", Bad_file, entry.val);
+ return -EINVAL;
+ }
offset = swp_offset(entry);
VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
VM_WARN_ON(usage == 1 && nr > 1);
- ci = lock_cluster_or_swap_info(si, offset);
+ ci = lock_cluster(si, offset);
err = 0;
for (i = 0; i < nr; i++) {
@@ -3710,7 +3615,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
}
unlock_out:
- unlock_cluster_or_swap_info(si, ci);
+ unlock_cluster(ci);
return err;
}
@@ -3752,11 +3657,13 @@ int swapcache_prepare(swp_entry_t entry, int nr)
return __swap_duplicate(entry, SWAP_HAS_CACHE, nr);
}
+/*
+ * Callers should ensure the entries belong to the same folio so
+ * the entries won't cross a cluster boundary.
+ */
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
{
- unsigned long offset = swp_offset(entry);
-
- cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE);
+ swap_entries_put_cache(si, entry, nr);
}
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
@@ -3765,21 +3672,6 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry)
}
/*
- * out-of-line methods to avoid include hell.
- */
-struct address_space *swapcache_mapping(struct folio *folio)
-{
- return swp_swap_info(folio->swap)->swap_file->f_mapping;
-}
-EXPORT_SYMBOL_GPL(swapcache_mapping);
-
-pgoff_t __folio_swap_cache_index(struct folio *folio)
-{
- return swap_cache_index(folio->swap);
-}
-EXPORT_SYMBOL_GPL(__folio_swap_cache_index);
-
-/*
* add_swap_count_continuation - called when a swap count is duplicated
* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
* page of the original vmalloc'ed swap_map, to hold the continuation count
@@ -3819,7 +3711,6 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
*/
goto outer;
}
- spin_lock(&si->lock);
offset = swp_offset(entry);
@@ -3884,7 +3775,6 @@ out_unlock_cont:
spin_unlock(&si->cont_lock);
out:
unlock_cluster(ci);
- spin_unlock(&si->lock);
put_swap_device(si);
outer:
if (page)
@@ -3898,8 +3788,8 @@ outer:
* into, carry if so, or else fail until a new continuation page is allocated;
* when the original swap_map count is decremented from 0 with continuation,
* borrow from the continuation and report whether it still holds more.
- * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
- * lock.
+ * Called while __swap_duplicate() or caller of swap_entry_put_locked()
+ * holds cluster lock.
*/
static bool swap_count_continued(struct swap_info_struct *si,
pgoff_t offset, unsigned char count)
@@ -4004,6 +3894,11 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+static bool __has_usable_swap(void)
+{
+ return !plist_head_empty(&swap_active_head);
+}
+
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
struct swap_info_struct *si, *next;
diff --git a/mm/truncate.c b/mm/truncate.c
index 7c304d2f0052..91eb92a5ce4f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -78,8 +78,22 @@ static void truncate_folio_batch_exceptionals(struct address_space *mapping,
if (dax_mapping(mapping)) {
for (i = j; i < nr; i++) {
- if (xa_is_value(fbatch->folios[i]))
+ if (xa_is_value(fbatch->folios[i])) {
+ /*
+ * File systems should already have called
+ * dax_break_layout_entry() to remove all DAX
+ * entries while holding a lock to prevent
+ * establishing new entries. Therefore we
+ * shouldn't find any here.
+ */
+ WARN_ON_ONCE(1);
+
+ /*
+ * Delete the mapping so truncate_pagecache()
+ * doesn't loop forever.
+ */
dax_delete_mapping_entry(mapping, indices[i]);
+ }
}
goto out;
}
@@ -177,20 +191,21 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
{
loff_t pos = folio_pos(folio);
+ size_t size = folio_size(folio);
unsigned int offset, length;
+ struct page *split_at, *split_at2;
if (pos < start)
offset = start - pos;
else
offset = 0;
- length = folio_size(folio);
- if (pos + length <= (u64)end)
- length = length - offset;
+ if (pos + size <= (u64)end)
+ length = size - offset;
else
length = end + 1 - pos - offset;
folio_wait_writeback(folio);
- if (length == folio_size(folio)) {
+ if (length == size) {
truncate_inode_folio(folio->mapping, folio);
return true;
}
@@ -207,8 +222,46 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
folio_invalidate(folio, offset, length);
if (!folio_test_large(folio))
return true;
- if (split_folio(folio) == 0)
+
+ split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE);
+ if (!try_folio_split(folio, split_at, NULL)) {
+ /*
+ * Try to split at offset + length to make sure folios within
+ * the range can be dropped, especially to avoid memory waste
+ * for shmem truncation.
+ */
+ struct folio *folio2;
+
+ if (offset + length == size)
+ goto no_split;
+
+ split_at2 = folio_page(folio,
+ PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
+ folio2 = page_folio(split_at2);
+
+ if (!folio_try_get(folio2))
+ goto no_split;
+
+ if (!folio_test_large(folio2))
+ goto out;
+
+ if (!folio_trylock(folio2))
+ goto out;
+
+ /*
+ * Make sure folio2 is still large and its mapping has not changed.
+ * Its split result does not matter here.
+ */
+ if (folio_test_large(folio2) &&
+ folio2->mapping == folio->mapping)
+ try_folio_split(folio2, split_at2, NULL);
+
+ folio_unlock(folio2);
+out:
+ folio_put(folio2);
+no_split:
return true;
+ }
if (folio_test_dirty(folio))
return false;
truncate_inode_folio(folio->mapping, folio);
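The two split points used above are simply the page indices containing the start and end of the punched range. A tiny sketch of that index math, assuming 4 KiB pages and an illustrative 64 KiB folio:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_ALIGN_DOWN(x) ((x) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long folio_size = 16 * PAGE_SIZE;	/* a 64 KiB folio */
	unsigned long offset = 5000;			/* start of punched range within the folio */
	unsigned long length = 20000;			/* bytes to punch */

	/* Page within the folio where the first split is attempted. */
	unsigned long split_at = PAGE_ALIGN_DOWN(offset) / PAGE_SIZE;
	/* Second split so folios past the range can survive intact. */
	unsigned long split_at2 = PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE;

	printf("split at page %lu and page %lu of %lu\n",
	       split_at, split_at2, folio_size / PAGE_SIZE);
	return 0;
}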
@@ -372,7 +425,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
- /* We rely upon deletion not changing page->index */
+ /* We rely upon deletion not changing folio->index */
if (xa_is_value(folio))
continue;
@@ -525,6 +578,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
}
EXPORT_SYMBOL(invalidate_mapping_pages);
+static int folio_launder(struct address_space *mapping, struct folio *folio)
+{
+ if (!folio_test_dirty(folio))
+ return 0;
+ if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL)
+ return 0;
+ return mapping->a_ops->launder_folio(folio);
+}
+
/*
* This is like mapping_evict_folio(), except it ignores the folio's
* refcount. We do this because invalidate_inode_pages2() needs stronger
@@ -532,14 +594,24 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
* shrink_folio_list() has a temp ref on them, or because they're transiently
* sitting in the folio_add_lru() caches.
*/
-static int invalidate_complete_folio2(struct address_space *mapping,
- struct folio *folio)
+int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
+ gfp_t gfp)
{
- if (folio->mapping != mapping)
- return 0;
+ int ret;
- if (!filemap_release_folio(folio, GFP_KERNEL))
- return 0;
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (folio_mapped(folio))
+ unmap_mapping_folio(folio);
+ BUG_ON(folio_mapped(folio));
+
+ ret = folio_launder(mapping, folio);
+ if (ret)
+ return ret;
+ if (folio->mapping != mapping)
+ return -EBUSY;
+ if (!filemap_release_folio(folio, gfp))
+ return -EBUSY;
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
@@ -558,16 +630,7 @@ static int invalidate_complete_folio2(struct address_space *mapping,
failed:
xa_unlock_irq(&mapping->i_pages);
spin_unlock(&mapping->host->i_lock);
- return 0;
-}
-
-static int folio_launder(struct address_space *mapping, struct folio *folio)
-{
- if (!folio_test_dirty(folio))
- return 0;
- if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL)
- return 0;
- return mapping->a_ops->launder_folio(folio);
+ return -EBUSY;
}
/**
@@ -631,16 +694,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
}
VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
folio_wait_writeback(folio);
-
- if (folio_mapped(folio))
- unmap_mapping_folio(folio);
- BUG_ON(folio_mapped(folio));
-
- ret2 = folio_launder(mapping, folio);
- if (ret2 == 0) {
- if (!invalidate_complete_folio2(mapping, folio))
- ret2 = -EBUSY;
- }
+ ret2 = folio_unmap_invalidate(mapping, folio, GFP_KERNEL);
if (ret2 < 0)
ret = ret2;
folio_unlock(folio);
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 83c164aba6e0..dbdcc43964fb 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -17,7 +17,7 @@
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
-#include <linux/thread_info.h>
+#include <linux/ucopysize.h>
#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
@@ -201,7 +201,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
}
}
-static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
+DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,
+ validate_usercopy_range);
+EXPORT_SYMBOL(validate_usercopy_range);
/*
* Validates that the given object is:
@@ -212,9 +214,6 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
*/
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
{
- if (static_branch_unlikely(&bypass_usercopy_checks))
- return;
-
/* Skip all tests if size is zero. */
if (!n)
return;
@@ -255,7 +254,8 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
}
EXPORT_SYMBOL(__check_object_size);
-static bool enable_checks __initdata = true;
+static bool enable_checks __initdata =
+ IS_ENABLED(CONFIG_HARDENED_USERCOPY_DEFAULT_ON);
static int __init parse_hardened_usercopy(char *str)
{
@@ -269,8 +269,10 @@ __setup("hardened_usercopy=", parse_hardened_usercopy);
static int __init set_hardened_usercopy(void)
{
- if (enable_checks == false)
- static_branch_enable(&bypass_usercopy_checks);
+ if (enable_checks)
+ static_branch_enable(&validate_usercopy_range);
+ else
+ static_branch_disable(&validate_usercopy_range);
return 1;
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 60a0be33766f..8253978ee0fb 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -18,6 +18,7 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
+#include "swap.h"
static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
@@ -85,14 +86,10 @@ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
mmap_read_lock(mm);
vma = find_vma_and_prepare_anon(mm, address);
if (!IS_ERR(vma)) {
- /*
- * We cannot use vma_start_read() as it may fail due to
- * false locked (see comment in vma_start_read()). We
- * can avoid that by directly locking vm_lock under
- * mmap_lock, which guarantees that nobody can lock the
- * vma for write (vma_start_write()) under us.
- */
- down_read(&vma->vm_lock->lock);
+ bool locked = vma_start_read_locked(vma);
+
+ if (!locked)
+ vma = ERR_PTR(-EAGAIN);
}
mmap_read_unlock(mm);
@@ -1020,6 +1017,14 @@ void double_pt_unlock(spinlock_t *ptl1,
__release(ptl2);
}
+static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval)
+{
+ return pte_same(ptep_get(src_pte), orig_src_pte) &&
+ pte_same(ptep_get(dst_pte), orig_dst_pte) &&
+ pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
+}
static int move_present_pte(struct mm_struct *mm,
struct vm_area_struct *dst_vma,
@@ -1027,6 +1032,7 @@ static int move_present_pte(struct mm_struct *mm,
unsigned long dst_addr, unsigned long src_addr,
pte_t *dst_pte, pte_t *src_pte,
pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
spinlock_t *dst_ptl, spinlock_t *src_ptl,
struct folio *src_folio)
{
@@ -1034,8 +1040,8 @@ static int move_present_pte(struct mm_struct *mm,
double_pt_lock(dst_ptl, src_ptl);
- if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
- !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
+ if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
+ dst_pmd, dst_pmdval)) {
err = -EAGAIN;
goto out;
}
@@ -1057,9 +1063,14 @@ static int move_present_pte(struct mm_struct *mm,
folio_move_anon_rmap(src_folio, dst_vma);
src_folio->index = linear_page_index(dst_vma, dst_addr);
- orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
- /* Follow mremap() behavior and treat the entry dirty after the move */
- orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma);
+ orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
+ /* Set soft dirty bit so userspace can notice the pte was moved */
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
+#endif
+ if (pte_dirty(orig_src_pte))
+ orig_dst_pte = pte_mkdirty(orig_dst_pte);
+ orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
out:
@@ -1067,24 +1078,65 @@ out:
return err;
}
-static int move_swap_pte(struct mm_struct *mm,
+static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
unsigned long dst_addr, unsigned long src_addr,
pte_t *dst_pte, pte_t *src_pte,
pte_t orig_dst_pte, pte_t orig_src_pte,
- spinlock_t *dst_ptl, spinlock_t *src_ptl)
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
+ spinlock_t *dst_ptl, spinlock_t *src_ptl,
+ struct folio *src_folio,
+ struct swap_info_struct *si, swp_entry_t entry)
{
- if (!pte_swp_exclusive(orig_src_pte))
- return -EBUSY;
+ /*
+ * Check if the folio still belongs to the target swap entry after
+ * acquiring the lock. Folio can be freed in the swap cache while
+ * not locked.
+ */
+ if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
+ entry.val != src_folio->swap.val))
+ return -EAGAIN;
double_pt_lock(dst_ptl, src_ptl);
- if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
- !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
+ if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
+ dst_pmd, dst_pmdval)) {
double_pt_unlock(dst_ptl, src_ptl);
return -EAGAIN;
}
+ /*
+ * The src_folio resides in the swapcache, requiring an update to its
+ * index and mapping to align with the dst_vma, where a swap-in may
+ * occur and hit the swapcache after moving the PTE.
+ */
+ if (src_folio) {
+ folio_move_anon_rmap(src_folio, dst_vma);
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
+ } else {
+ /*
+ * Check if the swap entry is cached after acquiring the src_pte
+ * lock. Otherwise, we might miss a newly loaded swap cache folio.
+ *
+ * Check swap_map directly to minimize overhead; READ_ONCE is sufficient.
+ * We are trying to catch newly added swap cache, and the only possible case
+ * is a folio being swapped in and out again, staying in the swap cache and
+ * reusing the same entry before the PTE check above. The PTL is acquired and
+ * released twice, each time after updating the swap_map's flag, so holding
+ * the PTL here ensures we see the updated value. A false positive is
+ * possible, e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching
+ * the cache, or during the tiny synchronization window between swap cache
+ * and swap_map, but it will be gone very quickly; the worst result is retry
+ * jitter.
+ */
+ if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
+ double_pt_unlock(dst_ptl, src_ptl);
+ return -EAGAIN;
+ }
+ }
+
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
+#endif
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
@@ -1097,13 +1149,14 @@ static int move_zeropage_pte(struct mm_struct *mm,
unsigned long dst_addr, unsigned long src_addr,
pte_t *dst_pte, pte_t *src_pte,
pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
spinlock_t *dst_ptl, spinlock_t *src_ptl)
{
pte_t zero_pte;
double_pt_lock(dst_ptl, src_ptl);
- if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
- !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
+ if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
+ dst_pmd, dst_pmdval)) {
double_pt_unlock(dst_ptl, src_ptl);
return -EAGAIN;
}
@@ -1130,12 +1183,14 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
__u64 mode)
{
swp_entry_t entry;
+ struct swap_info_struct *si = NULL;
pte_t orig_src_pte, orig_dst_pte;
pte_t src_folio_pte;
spinlock_t *src_ptl, *dst_ptl;
pte_t *src_pte = NULL;
pte_t *dst_pte = NULL;
pmd_t dummy_pmdval;
+ pmd_t dst_pmdval;
struct folio *src_folio = NULL;
struct anon_vma *src_anon_vma = NULL;
struct mmu_notifier_range range;
@@ -1148,11 +1203,11 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
retry:
/*
* Use the maywrite version to indicate that dst_pte will be modified,
- * but since we will use pte_same() to detect the change of the pte
- * entry, there is no need to get pmdval, so just pass a dummy variable
- * to it.
+ * but since dst_pte needs to be none, the subsequent pte_same() check
+ * cannot prevent the dst_pte page from being freed concurrently, so we
+ * also need to obtain dst_pmdval and recheck pmd_same() later.
*/
- dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dummy_pmdval,
+ dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval,
&dst_ptl);
/* Retry if a huge pmd materialized from under us */
@@ -1161,7 +1216,11 @@ retry:
goto out;
}
- /* same as dst_pte */
+ /*
+ * Unlike dst_pte, the subsequent pte_same() check can ensure the
+ * stability of the src_pte page, so there is no need to get pmdval;
+ * just pass a dummy variable to it.
+ */
src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval,
&src_ptl);
@@ -1177,8 +1236,8 @@ retry:
}
/* Sanity checks before the operation */
- if (WARN_ON_ONCE(pmd_none(*dst_pmd)) || WARN_ON_ONCE(pmd_none(*src_pmd)) ||
- WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) {
+ if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
+ pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
err = -EINVAL;
goto out;
}
@@ -1213,7 +1272,7 @@ retry:
err = move_zeropage_pte(mm, dst_vma, src_vma,
dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte,
- dst_ptl, src_ptl);
+ dst_pmd, dst_pmdval, dst_ptl, src_ptl);
goto out;
}
@@ -1224,6 +1283,7 @@ retry:
*/
if (!src_folio) {
struct folio *folio;
+ bool locked;
/*
* Pin the page while holding the lock to be sure the
@@ -1243,14 +1303,28 @@ retry:
goto out;
}
+ locked = folio_trylock(folio);
+ /*
+ * We avoid waiting for folio lock with a raised
+ * refcount for large folios because extra refcounts
+ * will result in split_folio() failing later and
+ * retrying. If multiple tasks are trying to move a
+ * large folio we can end up livelocking.
+ */
+ if (!locked && folio_test_large(folio)) {
+ spin_unlock(src_ptl);
+ err = -EAGAIN;
+ goto out;
+ }
+
folio_get(folio);
src_folio = folio;
src_folio_pte = orig_src_pte;
spin_unlock(src_ptl);
- if (!folio_trylock(src_folio)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ if (!locked) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
/* now we can block and wait */
folio_lock(src_folio);
@@ -1266,8 +1340,8 @@ retry:
/* at this point we have src_folio locked */
if (folio_test_large(src_folio)) {
/* split_folio() can block */
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
err = split_folio(src_folio);
if (err)
@@ -1292,8 +1366,8 @@ retry:
goto out;
}
if (!anon_vma_trylock_write(src_anon_vma)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
/* now we can block and wait */
anon_vma_lock_write(src_anon_vma);
@@ -1303,14 +1377,16 @@ retry:
err = move_present_pte(mm, dst_vma, src_vma,
dst_addr, src_addr, dst_pte, src_pte,
- orig_dst_pte, orig_src_pte,
- dst_ptl, src_ptl, src_folio);
+ orig_dst_pte, orig_src_pte, dst_pmd,
+ dst_pmdval, dst_ptl, src_ptl, src_folio);
} else {
+ struct folio *folio = NULL;
+
entry = pte_to_swp_entry(orig_src_pte);
if (non_swap_entry(entry)) {
if (is_migration_entry(entry)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
migration_entry_wait(mm, src_pmd, src_addr);
err = -EAGAIN;
@@ -1319,10 +1395,53 @@ retry:
goto out;
}
- err = move_swap_pte(mm, dst_addr, src_addr,
- dst_pte, src_pte,
- orig_dst_pte, orig_src_pte,
- dst_ptl, src_ptl);
+ if (!pte_swp_exclusive(orig_src_pte)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ si = get_swap_device(entry);
+ if (unlikely(!si)) {
+ err = -EAGAIN;
+ goto out;
+ }
+ /*
+ * Verify the existence of the swapcache. If present, the folio's
+ * index and mapping must be updated even when the PTE is a swap
+ * entry. The anon_vma lock is not taken during this process since
+ * the folio has already been unmapped, and the swap entry is
+ * exclusive, preventing rmap walks.
+ *
+ * For large folios, return -EBUSY immediately, as split_folio()
+ * also returns -EBUSY when attempting to split unmapped large
+ * folios in the swapcache. This issue needs to be resolved
+ * separately to allow proper handling.
+ */
+ if (!src_folio)
+ folio = filemap_get_folio(swap_address_space(entry),
+ swap_cache_index(entry));
+ if (!IS_ERR_OR_NULL(folio)) {
+ if (folio_test_large(folio)) {
+ err = -EBUSY;
+ folio_put(folio);
+ goto out;
+ }
+ src_folio = folio;
+ src_folio_pte = orig_src_pte;
+ if (!folio_trylock(src_folio)) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
+ src_pte = dst_pte = NULL;
+ put_swap_device(si);
+ si = NULL;
+ /* now we can block and wait */
+ folio_lock(src_folio);
+ goto retry;
+ }
+ }
+ err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
+ dst_ptl, src_ptl, src_folio, si, entry);
}
out:
@@ -1339,6 +1458,8 @@ out:
if (src_pte)
pte_unmap(src_pte);
mmu_notifier_invalidate_range_end(&range);
+ if (si)
+ put_swap_device(si);
return err;
}
@@ -1475,16 +1596,24 @@ static int uffd_move_lock(struct mm_struct *mm,
mmap_read_lock(mm);
err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
- if (!err) {
- /*
- * See comment in uffd_lock_vma() as to why not using
- * vma_start_read() here.
- */
- down_read(&(*dst_vmap)->vm_lock->lock);
- if (*dst_vmap != *src_vmap)
- down_read_nested(&(*src_vmap)->vm_lock->lock,
- SINGLE_DEPTH_NESTING);
+ if (err)
+ goto out;
+
+ if (!vma_start_read_locked(*dst_vmap)) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ /* Nothing further to do if both vmas are locked. */
+ if (*dst_vmap == *src_vmap)
+ goto out;
+
+ if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) {
+ /* Undo dst_vmap locking if src_vmap failed to lock */
+ vma_end_read(*dst_vmap);
+ err = -EAGAIN;
}
+out:
mmap_read_unlock(mm);
return err;
}
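The locking shape introduced here is "lock the destination, try-lock the source, and undo the first lock if the second fails", returning -EAGAIN instead of blocking. A minimal userspace analogue with pthread read-write locks; the struct and error convention are carried over purely for illustration:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct vma {
	pthread_rwlock_t lock;
};

/* Lock two VMAs for read, or lock neither. */
static int lock_vma_pair(struct vma *dst, struct vma *src)
{
	if (pthread_rwlock_tryrdlock(&dst->lock))
		return -EAGAIN;

	/* Nothing further to do if both are the same VMA. */
	if (dst == src)
		return 0;

	if (pthread_rwlock_tryrdlock(&src->lock)) {
		/* Undo dst locking if src failed to lock. */
		pthread_rwlock_unlock(&dst->lock);
		return -EAGAIN;
	}
	return 0;
}

int main(void)
{
	struct vma a = { .lock = PTHREAD_RWLOCK_INITIALIZER };
	struct vma b = { .lock = PTHREAD_RWLOCK_INITIALIZER };

	printf("pair lock: %d\n", lock_vma_pair(&a, &b));
	pthread_rwlock_unlock(&a.lock);
	pthread_rwlock_unlock(&b.lock);
	return 0;
}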
@@ -1810,6 +1939,14 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
unsigned long end)
{
struct vm_area_struct *ret;
+ bool give_up_on_oom = false;
+
+ /*
+ * If we are modifying only and not splitting, just give up on the merge
+ * if OOM prevents us from merging successfully.
+ */
+ if (start == vma->vm_start && end == vma->vm_end)
+ give_up_on_oom = true;
/* Reset ptes for the whole vma range if wr-protected */
if (userfaultfd_wp(vma))
@@ -1817,7 +1954,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
vma->vm_flags & ~__VM_UFFD_FLAGS,
- NULL_VM_UFFD_CTX);
+ NULL_VM_UFFD_CTX, give_up_on_oom);
/*
* In the vma_merge() successful mprotect-like case 8:
@@ -1868,7 +2005,8 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
new_flags,
- (struct vm_userfaultfd_ctx){ctx});
+ (struct vm_userfaultfd_ctx){ctx},
+ /* give_up_on_oom = */false);
if (IS_ERR(vma))
return PTR_ERR(vma);
diff --git a/mm/util.c b/mm/util.c
index c1c3b06ab4f9..0b270c43d7d1 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -12,6 +12,7 @@
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/sysctl.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
@@ -23,6 +24,7 @@
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
+#include <linux/fsnotify.h>
#include <linux/uaccess.h>
@@ -297,12 +299,7 @@ void *memdup_user_nul(const void __user *src, size_t len)
{
char *p;
- /*
- * Always use GFP_KERNEL, since copy_from_user() can sleep and
- * cause pagefault, which makes it pointless to use GFP_NOFS
- * or GFP_ATOMIC.
- */
- p = kmalloc_track_caller(len + 1, GFP_KERNEL);
+ p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN);
if (!p)
return ERR_PTR(-ENOMEM);
@@ -574,6 +571,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
LIST_HEAD(uf);
ret = security_mmap_file(file, prot, flag);
+ if (!ret)
+ ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len);
if (!ret) {
if (mmap_write_lock_killable(mm))
return -EINTR;
@@ -587,6 +586,23 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
return ret;
}
+/*
+ * Perform a userland memory mapping into the current process address space. See
+ * the comment for do_mmap() for more details on this operation in general.
+ *
+ * This differs from do_mmap() in that:
+ *
+ * a. An offset parameter is provided rather than pgoff, which is checked
+ * both for overflow and for page alignment.
+ * b. mmap locking is performed on the caller's behalf.
+ * c. Userfaultfd unmap events and memory population are handled.
+ *
+ * This means that this function performs essentially the same work as if
+ * userland were invoking mmap(2).
+ *
+ * Returns either an error, or the address at which the requested mapping has
+ * been performed.
+ */
unsigned long vm_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long offset)
@@ -600,168 +616,6 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
}
EXPORT_SYMBOL(vm_mmap);
-static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
-{
- /*
- * We want to attempt a large physically contiguous block first because
- * it is less likely to fragment multiple larger blocks and therefore
- * contribute to a long term fragmentation less than vmalloc fallback.
- * However make sure that larger requests are not too disruptive - no
- * OOM killer and no allocation failure warnings as we have a fallback.
- */
- if (size > PAGE_SIZE) {
- flags |= __GFP_NOWARN;
-
- if (!(flags & __GFP_RETRY_MAYFAIL))
- flags |= __GFP_NORETRY;
-
- /* nofail semantic is implemented by the vmalloc fallback */
- flags &= ~__GFP_NOFAIL;
- }
-
- return flags;
-}
-
-/**
- * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
- * failure, fall back to non-contiguous (vmalloc) allocation.
- * @size: size of the request.
- * @b: which set of kmalloc buckets to allocate from.
- * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
- * @node: numa node to allocate from
- *
- * Uses kmalloc to get the memory but if the allocation fails then falls back
- * to the vmalloc allocator. Use kvfree for freeing the memory.
- *
- * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
- * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
- * preferable to the vmalloc fallback, due to visible performance drawbacks.
- *
- * Return: pointer to the allocated memory of %NULL in case of failure
- */
-void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
-{
- void *ret;
-
- /*
- * It doesn't really make sense to fallback to vmalloc for sub page
- * requests
- */
- ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b),
- kmalloc_gfp_adjust(flags, size),
- node);
- if (ret || size <= PAGE_SIZE)
- return ret;
-
- /* non-sleeping allocations are not supported by vmalloc */
- if (!gfpflags_allow_blocking(flags))
- return NULL;
-
- /* Don't even allow crazy sizes */
- if (unlikely(size > INT_MAX)) {
- WARN_ON_ONCE(!(flags & __GFP_NOWARN));
- return NULL;
- }
-
- /*
- * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
- * since the callers already cannot assume anything
- * about the resulting pointer, and cannot play
- * protection games.
- */
- return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
- flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
- node, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(__kvmalloc_node_noprof);
-
-/**
- * kvfree() - Free memory.
- * @addr: Pointer to allocated memory.
- *
- * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
- * It is slightly more efficient to use kfree() or vfree() if you are certain
- * that you know which one to use.
- *
- * Context: Either preemptible task context or not-NMI interrupt.
- */
-void kvfree(const void *addr)
-{
- if (is_vmalloc_addr(addr))
- vfree(addr);
- else
- kfree(addr);
-}
-EXPORT_SYMBOL(kvfree);
-
-/**
- * kvfree_sensitive - Free a data object containing sensitive information.
- * @addr: address of the data object to be freed.
- * @len: length of the data object.
- *
- * Use the special memzero_explicit() function to clear the content of a
- * kvmalloc'ed object containing sensitive data to make sure that the
- * compiler won't optimize out the data clearing.
- */
-void kvfree_sensitive(const void *addr, size_t len)
-{
- if (likely(!ZERO_OR_NULL_PTR(addr))) {
- memzero_explicit((void *)addr, len);
- kvfree(addr);
- }
-}
-EXPORT_SYMBOL(kvfree_sensitive);
-
-/**
- * kvrealloc - reallocate memory; contents remain unchanged
- * @p: object to reallocate memory for
- * @size: the size to reallocate
- * @flags: the flags for the page level allocator
- *
- * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
- * and @p is not a %NULL pointer, the object pointed to is freed.
- *
- * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
- * initial memory allocation, every subsequent call to this API for the same
- * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
- * __GFP_ZERO is not fully honored by this API.
- *
- * In any case, the contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes.
- *
- * This function must not be called concurrently with itself or kvfree() for the
- * same memory allocation.
- *
- * Return: pointer to the allocated memory or %NULL in case of error
- */
-void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
-{
- void *n;
-
- if (is_vmalloc_addr(p))
- return vrealloc_noprof(p, size, flags);
-
- n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size));
- if (!n) {
- /* We failed to krealloc(), fall back to kvmalloc(). */
- n = kvmalloc_noprof(size, flags);
- if (!n)
- return NULL;
-
- if (p) {
- /* We already know that `p` is not a vmalloc address. */
- kasan_disable_current();
- memcpy(n, kasan_reset_tag(p), ksize(p));
- kasan_enable_current();
-
- kfree(p);
- }
- }
-
- return n;
-}
-EXPORT_SYMBOL(kvrealloc_noprof);
-
/**
* __vmalloc_array - allocate memory for a virtually contiguous array.
* @n: number of elements.
@@ -894,14 +748,16 @@ int folio_mc_copy(struct folio *dst, struct folio *src)
EXPORT_SYMBOL(folio_mc_copy);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
-int sysctl_overcommit_ratio __read_mostly = 50;
-unsigned long sysctl_overcommit_kbytes __read_mostly;
+static int sysctl_overcommit_ratio __read_mostly = 50;
+static unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
-int overcommit_ratio_handler(const struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
+#ifdef CONFIG_SYSCTL
+
+static int overcommit_ratio_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -916,8 +772,8 @@ static void sync_overcommit_as(struct work_struct *dummy)
percpu_counter_sync(&vm_committed_as);
}
-int overcommit_policy_handler(const struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
+static int overcommit_policy_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int new_policy = -1;
@@ -952,8 +808,8 @@ int overcommit_policy_handler(const struct ctl_table *table, int write, void *bu
return ret;
}
-int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
+static int overcommit_kbytes_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -963,6 +819,54 @@ int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *bu
return ret;
}
+static const struct ctl_table util_sysctl_table[] = {
+ {
+ .procname = "overcommit_memory",
+ .data = &sysctl_overcommit_memory,
+ .maxlen = sizeof(sysctl_overcommit_memory),
+ .mode = 0644,
+ .proc_handler = overcommit_policy_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "overcommit_ratio",
+ .data = &sysctl_overcommit_ratio,
+ .maxlen = sizeof(sysctl_overcommit_ratio),
+ .mode = 0644,
+ .proc_handler = overcommit_ratio_handler,
+ },
+ {
+ .procname = "overcommit_kbytes",
+ .data = &sysctl_overcommit_kbytes,
+ .maxlen = sizeof(sysctl_overcommit_kbytes),
+ .mode = 0644,
+ .proc_handler = overcommit_kbytes_handler,
+ },
+ {
+ .procname = "user_reserve_kbytes",
+ .data = &sysctl_user_reserve_kbytes,
+ .maxlen = sizeof(sysctl_user_reserve_kbytes),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "admin_reserve_kbytes",
+ .data = &sysctl_admin_reserve_kbytes,
+ .maxlen = sizeof(sysctl_admin_reserve_kbytes),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+};
+
+static int __init init_vm_util_sysctls(void)
+{
+ register_sysctl_init("vm", util_sysctl_table);
+ return 0;
+}
+subsys_initcall(init_vm_util_sysctls);
+#endif /* CONFIG_SYSCTL */
+
/*
* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
*/
@@ -1227,3 +1131,43 @@ void flush_dcache_folio(struct folio *folio)
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif
+
+/**
+ * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an
+ * existing VMA
+ * @file: The file which possesses an f_op->mmap_prepare() hook
+ * @vma: The VMA to apply the .mmap_prepare() hook to.
+ *
+ * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
+ * 'wrapper' file systems invoke a nested mmap hook of an underlying file.
+ *
+ * Until all filesystems are converted to use .mmap_prepare(), we must be
+ * conservative and continue to invoke these 'wrapper' filesystems using the
+ * deprecated .mmap() hook.
+ *
+ * However we have a problem if the underlying file system possesses an
+ * .mmap_prepare() hook, as we are in a different context when we invoke the
+ * .mmap() hook, already having a VMA to deal with.
+ *
+ * compat_vma_mmap_prepare() is a compatibility function that takes VMA state,
+ * establishes a struct vm_area_desc descriptor, passes it to the underlying
+ * .mmap_prepare() hook and applies any changes performed by it.
+ *
+ * Once the conversion of filesystems is complete this function will no longer
+ * be required and will be removed.
+ *
+ * Returns: 0 on success or error.
+ */
+int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma)
+{
+ struct vm_area_desc desc;
+ int err;
+
+ err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc));
+ if (err)
+ return err;
+ set_vma_from_desc(vma, &desc);
+
+ return 0;
+}
+EXPORT_SYMBOL(compat_vma_mmap_prepare);
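The compatibility helper described above is essentially an adapter: snapshot the already-constructed VMA into a descriptor, let the file's .mmap_prepare() hook modify the descriptor, and copy the result back. A stripped-down userspace sketch of that shape; every type, field and hook below is an invented stand-in rather than the kernel's vm_area_desc machinery:

#include <stdio.h>

/* Invented, simplified stand-ins for the kernel structures involved. */
struct area {
	unsigned long flags;
	void *private_data;
};

struct area_desc {
	unsigned long flags;
	void *private_data;
};

/* An example hook that only knows how to edit the descriptor. */
static int example_prepare(struct area_desc *desc)
{
	desc->flags |= 0x1;	/* hypothetical flag tweak */
	return 0;
}

/* Snapshot the area, run the hook on the descriptor, copy the result back. */
static int compat_apply_prepare(int (*prepare)(struct area_desc *),
				struct area *area)
{
	struct area_desc desc = {
		.flags = area->flags,
		.private_data = area->private_data,
	};
	int err = prepare(&desc);

	if (err)
		return err;
	area->flags = desc.flags;
	area->private_data = desc.private_data;
	return 0;
}

int main(void)
{
	struct area a = { .flags = 0 };

	compat_apply_prepare(example_prepare, &a);
	printf("flags after prepare: 0x%lx\n", a.flags);
	return 0;
}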
diff --git a/mm/vma.c b/mm/vma.c
index 8a454a7bbc80..fef67a66a095 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -17,9 +17,13 @@ struct mmap_state {
unsigned long pglen;
unsigned long flags;
struct file *file;
+ pgprot_t page_prot;
+
+ /* User-defined fields, perhaps updated by .mmap_prepare(). */
+ const struct vm_operations_struct *vm_ops;
+ void *vm_private_data;
unsigned long charged;
- bool retry_merge;
struct vm_area_struct *prev;
struct vm_area_struct *next;
@@ -35,11 +39,12 @@ struct mmap_state {
.mm = mm_, \
.vmi = vmi_, \
.addr = addr_, \
- .end = (addr_) + len, \
+ .end = (addr_) + (len_), \
.pgoff = pgoff_, \
.pglen = PHYS_PFN(len_), \
.flags = flags_, \
.file = file_, \
+ .page_prot = vm_get_page_prot(flags_), \
}
#define VMG_MMAP_STATE(name, map_, vma_) \
@@ -52,12 +57,27 @@ struct mmap_state {
.pgoff = (map_)->pgoff, \
.file = (map_)->file, \
.prev = (map_)->prev, \
- .vma = vma_, \
+ .middle = vma_, \
.next = (vma_) ? NULL : (map_)->next, \
.state = VMA_MERGE_START, \
- .merge_flags = VMG_FLAG_DEFAULT, \
}
+/*
+ * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain
+ * more than one anon_vma_chain connecting it to more than one anon_vma. A merge
+ * would mean a wider range of folios sharing the root anon_vma lock, and thus
+ * potential lock contention, we do not wish to encourage merging such that this
+ * scales to a problem.
+ */
+static bool vma_had_uncowed_parents(struct vm_area_struct *vma)
+{
+ /*
+ * The list_is_singular() test is to avoid merging VMA cloned from
+ * parents. This can improve scalability caused by anon_vma lock.
+ */
+ return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain);
+}
+
static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
{
struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
@@ -83,53 +103,75 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
return true;
}
-static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
- struct anon_vma *anon_vma2, struct vm_area_struct *vma)
+static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next)
{
+ struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev;
+ struct vm_area_struct *src = vmg->middle; /* existing merge case. */
+ struct anon_vma *tgt_anon = tgt->anon_vma;
+ struct anon_vma *src_anon = vmg->anon_vma;
+
/*
- * The list_is_singular() test is to avoid merging VMA cloned from
- * parents. This can improve scalability caused by anon_vma lock.
+ * We _can_ have !src but a set vmg->anon_vma via copy_vma(). In this
+ * instance we will remove the existing VMA's anon_vmas, so there are no
+ * scalability concerns.
*/
- if ((!anon_vma1 || !anon_vma2) && (!vma ||
- list_is_singular(&vma->anon_vma_chain)))
- return true;
- return anon_vma1 == anon_vma2;
-}
+ VM_WARN_ON(src && src_anon != src->anon_vma);
-/* Are the anon_vma's belonging to each VMA compatible with one another? */
-static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1,
- struct vm_area_struct *vma2)
-{
- return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL);
+ /* Case 1 - we will dup_anon_vma() from src into tgt. */
+ if (!tgt_anon && src_anon)
+ return !vma_had_uncowed_parents(src);
+ /* Case 2 - we will simply use tgt's anon_vma. */
+ if (tgt_anon && !src_anon)
+ return !vma_had_uncowed_parents(tgt);
+ /* Case 3 - the anon_vma's are already shared. */
+ return src_anon == tgt_anon;
}
/*
* init_multi_vma_prep() - Initializer for struct vma_prepare
* @vp: The vma_prepare struct
* @vma: The vma that will be altered once locked
- * @next: The next vma if it is to be adjusted
- * @remove: The first vma to be removed
- * @remove2: The second vma to be removed
+ * @vmg: The merge state that will be used to determine adjustment and VMA
+ * removal.
*/
static void init_multi_vma_prep(struct vma_prepare *vp,
struct vm_area_struct *vma,
- struct vm_area_struct *next,
- struct vm_area_struct *remove,
- struct vm_area_struct *remove2)
+ struct vma_merge_struct *vmg)
{
+ struct vm_area_struct *adjust;
+ struct vm_area_struct **remove = &vp->remove;
+
memset(vp, 0, sizeof(struct vma_prepare));
vp->vma = vma;
vp->anon_vma = vma->anon_vma;
- vp->remove = remove;
- vp->remove2 = remove2;
- vp->adj_next = next;
- if (!vp->anon_vma && next)
- vp->anon_vma = next->anon_vma;
+
+ if (vmg && vmg->__remove_middle) {
+ *remove = vmg->middle;
+ remove = &vp->remove2;
+ }
+ if (vmg && vmg->__remove_next)
+ *remove = vmg->next;
+
+ if (vmg && vmg->__adjust_middle_start)
+ adjust = vmg->middle;
+ else if (vmg && vmg->__adjust_next_start)
+ adjust = vmg->next;
+ else
+ adjust = NULL;
+
+ vp->adj_next = adjust;
+ if (!vp->anon_vma && adjust)
+ vp->anon_vma = adjust->anon_vma;
+
+ VM_WARN_ON(vp->anon_vma && adjust && adjust->anon_vma &&
+ vp->anon_vma != adjust->anon_vma);
vp->file = vma->vm_file;
if (vp->file)
vp->mapping = vma->vm_file->f_mapping;
+ if (vmg && vmg->skip_vma_uprobe)
+ vp->skip_vma_uprobe = true;
}
/*
@@ -150,7 +192,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg)
pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
if (is_mergeable_vma(vmg, /* merge_next = */ true) &&
- is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
+ is_mergeable_anon_vma(vmg, /* merge_next = */ true)) {
if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
return true;
}
@@ -170,7 +212,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg)
static bool can_vma_merge_after(struct vma_merge_struct *vmg)
{
if (is_mergeable_vma(vmg, /* merge_next = */ false) &&
- is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
+ is_mergeable_anon_vma(vmg, /* merge_next = */ false)) {
if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
return true;
}
@@ -203,6 +245,38 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
}
/*
+ * vma has some anon_vma assigned, and is already inserted on that
+ * anon_vma's interval trees.
+ *
+ * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
+ * vma must be removed from the anon_vma's interval trees using
+ * anon_vma_interval_tree_pre_update_vma().
+ *
+ * After the update, the vma will be reinserted using
+ * anon_vma_interval_tree_post_update_vma().
+ *
+ * The entire update must be protected by exclusive mmap_lock and by
+ * the root anon_vma's mutex.
+ */
+static void
+anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc;
+
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
+}
+
+static void
+anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc;
+
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
+}
+
+/*
* vma_prepare() - Helper function for handling locking VMAs prior to altering
* @vp: The initialized vma_prepare struct
*/
@@ -274,7 +348,7 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
* us to insert it before dropping the locks
* (it may either follow vma or precede it).
*/
- vma_iter_store(vmi, vp->insert);
+ vma_iter_store_new(vmi, vp->insert);
mm->map_count++;
}
@@ -287,15 +361,18 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
if (vp->file) {
i_mmap_unlock_write(vp->mapping);
- uprobe_mmap(vp->vma);
- if (vp->adj_next)
- uprobe_mmap(vp->adj_next);
+ if (!vp->skip_vma_uprobe) {
+ uprobe_mmap(vp->vma);
+
+ if (vp->adj_next)
+ uprobe_mmap(vp->adj_next);
+ }
}
if (vp->remove) {
again:
- vma_mark_detached(vp->remove, true);
+ vma_mark_detached(vp->remove);
if (vp->file) {
uprobe_munmap(vp->remove, vp->remove->vm_start,
vp->remove->vm_end);
@@ -330,7 +407,7 @@ again:
*/
static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma)
{
- init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
+ init_multi_vma_prep(vp, vma, NULL);
}
/*
@@ -354,8 +431,10 @@ static bool can_vma_merge_left(struct vma_merge_struct *vmg)
static bool can_vma_merge_right(struct vma_merge_struct *vmg,
bool can_merge_left)
{
- if (!vmg->next || vmg->end != vmg->next->vm_start ||
- !can_vma_merge_before(vmg))
+ struct vm_area_struct *next = vmg->next;
+ struct vm_area_struct *prev;
+
+ if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg))
return false;
if (!can_merge_left)
@@ -368,23 +447,22 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg,
*
* We therefore check this in addition to mergeability to either side.
*/
- return are_anon_vmas_compatible(vmg->prev, vmg->next);
+ prev = vmg->prev;
+ return !prev->anon_vma || !next->anon_vma ||
+ prev->anon_vma == next->anon_vma;
}
/*
* Close a vm structure and free it.
*/
-void remove_vma(struct vm_area_struct *vma, bool unreachable)
+void remove_vma(struct vm_area_struct *vma)
{
might_sleep();
vma_close(vma);
if (vma->vm_file)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
- if (unreachable)
- __vm_area_free(vma);
- else
- vm_area_free(vma);
+ vm_area_free(vma);
}
/*
@@ -398,7 +476,6 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
- lru_add_drain();
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
@@ -415,8 +492,9 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
* has already been checked or doesn't make sense to fail.
* VMA Iterator will point to the original VMA.
*/
-static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long addr, int new_below)
+static __must_check int
+__split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long addr, int new_below)
{
struct vma_prepare vp;
struct vm_area_struct *new;
@@ -467,7 +545,14 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
- vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
+
+ /*
+ * Get rid of huge pages and shared page tables straddling the split
+ * boundary.
+ */
+ vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_split(vma, addr);
if (new_below) {
vma->vm_start = addr;
@@ -511,39 +596,9 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
}
/*
- * vma has some anon_vma assigned, and is already inserted on that
- * anon_vma's interval trees.
- *
- * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
- * vma must be removed from the anon_vma's interval trees using
- * anon_vma_interval_tree_pre_update_vma().
- *
- * After the update, the vma will be reinserted using
- * anon_vma_interval_tree_post_update_vma().
+ * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the
+ * instance that the destination VMA has no anon_vma but the source does.
*
- * The entire update must be protected by exclusive mmap_lock and by
- * the root anon_vma's mutex.
- */
-void
-anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
-{
- struct anon_vma_chain *avc;
-
- list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
- anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
-}
-
-void
-anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
-{
- struct anon_vma_chain *avc;
-
- list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
- anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
-}
-
-/*
- * dup_anon_vma() - Helper function to duplicate anon_vma
* @dst: The destination VMA
* @src: The source VMA
* @dup: Pointer to the destination VMA when successful.
@@ -554,9 +609,18 @@ static int dup_anon_vma(struct vm_area_struct *dst,
struct vm_area_struct *src, struct vm_area_struct **dup)
{
/*
- * Easily overlooked: when mprotect shifts the boundary, make sure the
- * expanding vma has anon_vma set if the shrinking vma had, to cover any
- * anon pages imported.
+ * There are three cases to consider for correctly propagating
+ * anon_vma's on merge.
+ *
+ * The first is trivial - neither VMA has anon_vma, we need not do
+ * anything.
+ *
+ * The second where both have anon_vma is also a no-op, as they must
+ * then be the same, so there is simply nothing to copy.
+ *
+ * Here we cover the third - if the destination VMA has no anon_vma,
+ * that is, it is unfaulted, we need to ensure that the newly merged
+ * range is referenced by the anon_vmas of the source.
*/
if (src->anon_vma && !dst->anon_vma) {
int ret;
@@ -629,49 +693,75 @@ void validate_mm(struct mm_struct *mm)
}
#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
-/* Actually perform the VMA merge operation. */
-static int commit_merge(struct vma_merge_struct *vmg,
- struct vm_area_struct *adjust,
- struct vm_area_struct *remove,
- struct vm_area_struct *remove2,
- long adj_start,
- bool expanded)
+/*
+ * Based on the vmg flag indicating whether we need to adjust the vm_start field
+ * for the middle or next VMA, we calculate what the range of the newly adjusted
+ * VMA ought to be, and set the VMA's range accordingly.
+ */
+static void vmg_adjust_set_range(struct vma_merge_struct *vmg)
{
- struct vma_prepare vp;
+ struct vm_area_struct *adjust;
+ pgoff_t pgoff;
- init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);
+ if (vmg->__adjust_middle_start) {
+ adjust = vmg->middle;
+ pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start);
+ } else if (vmg->__adjust_next_start) {
+ adjust = vmg->next;
+ pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end);
+ } else {
+ return;
+ }
- VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
- vp.anon_vma != adjust->anon_vma);
+ vma_set_range(adjust, vmg->end, adjust->vm_end, pgoff);
+}
- if (expanded) {
- /* Note: vma iterator must be pointing to 'start'. */
- vma_iter_config(vmg->vmi, vmg->start, vmg->end);
+/*
+ * Actually perform the VMA merge operation.
+ *
+ * IMPORTANT: We guarantee that, should vmg->give_up_on_oom be set, we will
+ * not modify any VMAs or cause inconsistent state should an OOM condition
+ * arise.
+ *
+ * Returns 0 on success, or an error value on failure.
+ */
+static int commit_merge(struct vma_merge_struct *vmg)
+{
+ struct vm_area_struct *vma;
+ struct vma_prepare vp;
+
+ if (vmg->__adjust_next_start) {
+ /* We manipulate middle and adjust next, which is the target. */
+ vma = vmg->middle;
+ vma_iter_config(vmg->vmi, vmg->end, vmg->next->vm_end);
} else {
- vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
- adjust->vm_end);
+ vma = vmg->target;
+ /* Note: vma iterator must be pointing to 'start'. */
+ vma_iter_config(vmg->vmi, vmg->start, vmg->end);
}
- if (vma_iter_prealloc(vmg->vmi, vmg->vma))
+ init_multi_vma_prep(&vp, vma, vmg);
+
+ /*
+ * If vmg->give_up_on_oom is set, we're safe, because we don't actually
+ * manipulate any VMAs until we succeed at preallocation.
+ *
+ * Past this point, we will not return an error.
+ */
+ if (vma_iter_prealloc(vmg->vmi, vma))
return -ENOMEM;
vma_prepare(&vp);
- vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start);
- vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff);
-
- if (expanded)
- vma_iter_store(vmg->vmi, vmg->vma);
-
- if (adj_start) {
- adjust->vm_start += adj_start;
- adjust->vm_pgoff += PHYS_PFN(adj_start);
- if (adj_start < 0) {
- WARN_ON(expanded);
- vma_iter_store(vmg->vmi, adjust);
- }
- }
+ /*
+ * THP pages may need to do additional splits if we increase
+ * middle->vm_start.
+ */
+ vma_adjust_trans_huge(vma, vmg->start, vmg->end,
+ vmg->__adjust_middle_start ? vmg->middle : NULL);
+ vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
+ vmg_adjust_set_range(vmg);
+ vma_iter_store_overwrite(vmg->vmi, vmg->target);
- vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm);
+ vma_complete(&vp, vmg->vmi, vma->vm_mm);
return 0;
}
@@ -694,8 +784,9 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma)
* identical properties.
*
* This function checks for the existence of any such mergeable VMAs and updates
- * the maple tree describing the @vmg->vma->vm_mm address space to account for
- * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge.
+ * the maple tree describing the @vmg->middle->vm_mm address space to account
+ * for this, as well as any VMAs shrunk/expanded/deleted as a result of this
+ * merge.
*
* As part of this operation, if a merge occurs, the @vmg object will have its
* vma, start, end, and pgoff fields modified to execute the merge. Subsequent
@@ -704,43 +795,43 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma)
* Returns: The merged VMA if merge succeeds, or NULL otherwise.
*
* ASSUMPTIONS:
- * - The caller must assign the VMA to be modifed to @vmg->vma.
+ * - The caller must assign the VMA to be modified to @vmg->middle.
* - The caller must have set @vmg->prev to the previous VMA, if there is one.
* - The caller must not set @vmg->next, as we determine this.
* - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
- * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end).
+ * - vmi must be positioned within [@vmg->middle->vm_start, @vmg->middle->vm_end).
*/
-static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg)
+static __must_check struct vm_area_struct *vma_merge_existing_range(
+ struct vma_merge_struct *vmg)
{
- struct vm_area_struct *vma = vmg->vma;
+ struct vm_area_struct *middle = vmg->middle;
struct vm_area_struct *prev = vmg->prev;
- struct vm_area_struct *next, *res;
+ struct vm_area_struct *next;
struct vm_area_struct *anon_dup = NULL;
- struct vm_area_struct *adjust = NULL;
unsigned long start = vmg->start;
unsigned long end = vmg->end;
- bool left_side = vma && start == vma->vm_start;
- bool right_side = vma && end == vma->vm_end;
+ bool left_side = middle && start == middle->vm_start;
+ bool right_side = middle && end == middle->vm_end;
int err = 0;
- long adj_start = 0;
- bool merge_will_delete_vma, merge_will_delete_next;
bool merge_left, merge_right, merge_both;
- bool expanded;
mmap_assert_write_locked(vmg->mm);
- VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */
- VM_WARN_ON(vmg->next); /* We set this. */
- VM_WARN_ON(prev && start <= prev->vm_start);
- VM_WARN_ON(start >= end);
+ VM_WARN_ON_VMG(!middle, vmg); /* We are modifying a VMA, so caller must specify. */
+ VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */
+ VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg);
+ VM_WARN_ON_VMG(start >= end, vmg);
+
/*
- * If vma == prev, then we are offset into a VMA. Otherwise, if we are
+ * If middle == prev, then we are offset into a VMA. Otherwise, if we are
* not, we must span a portion of the VMA.
*/
- VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) ||
- vmg->end > vma->vm_end));
- /* The vmi must be positioned within vmg->vma. */
- VM_WARN_ON(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start &&
- vma_iter_addr(vmg->vmi) < vma->vm_end));
+ VM_WARN_ON_VMG(middle &&
+ ((middle != prev && vmg->start != middle->vm_start) ||
+ vmg->end > middle->vm_end), vmg);
+ /* The vmi must be positioned within vmg->middle. */
+ VM_WARN_ON_VMG(middle &&
+ !(vma_iter_addr(vmg->vmi) >= middle->vm_start &&
+ vma_iter_addr(vmg->vmi) < middle->vm_end), vmg);
vmg->state = VMA_MERGE_NOMERGE;
@@ -774,49 +865,52 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *
merge_both = merge_left && merge_right;
/* If we span the entire VMA, a merge implies it will be deleted. */
- merge_will_delete_vma = left_side && right_side;
+ vmg->__remove_middle = left_side && right_side;
/*
- * If we need to remove vma in its entirety but are unable to do so,
+ * If we need to remove middle in its entirety but are unable to do so,
* we have no sensible recourse but to abort the merge.
*/
- if (merge_will_delete_vma && !can_merge_remove_vma(vma))
+ if (vmg->__remove_middle && !can_merge_remove_vma(middle))
return NULL;
/*
* If we merge both VMAs, then next is also deleted. This implies
* merge_will_delete_vma also.
*/
- merge_will_delete_next = merge_both;
+ vmg->__remove_next = merge_both;
/*
* If we cannot delete next, then we can reduce the operation to merging
- * prev and vma (thereby deleting vma).
+ * prev and middle (thereby deleting middle).
*/
- if (merge_will_delete_next && !can_merge_remove_vma(next)) {
- merge_will_delete_next = false;
+ if (vmg->__remove_next && !can_merge_remove_vma(next)) {
+ vmg->__remove_next = false;
merge_right = false;
merge_both = false;
}
- /* No matter what happens, we will be adjusting vma. */
- vma_start_write(vma);
-
- if (merge_left)
- vma_start_write(prev);
+ /* No matter what happens, we will be adjusting middle. */
+ vma_start_write(middle);
- if (merge_right)
+ if (merge_right) {
vma_start_write(next);
+ vmg->target = next;
+ }
+
+ if (merge_left) {
+ vma_start_write(prev);
+ vmg->target = prev;
+ }
if (merge_both) {
/*
- * |<----->|
- * |-------*********-------|
- * prev vma next
- * extend delete delete
+ * |<-------------------->|
+ * |-------********-------|
+ * prev middle next
+ * extend delete delete
*/
- vmg->vma = prev;
vmg->start = prev->vm_start;
vmg->end = next->vm_end;
vmg->pgoff = prev->vm_pgoff;
@@ -824,97 +918,77 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *
/*
* We already ensured anon_vma compatibility above, so now it's
* simply a case of, if prev has no anon_vma object, which of
- * next or vma contains the anon_vma we must duplicate.
+ * next or middle contains the anon_vma we must duplicate.
*/
- err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup);
+ err = dup_anon_vma(prev, next->anon_vma ? next : middle,
+ &anon_dup);
} else if (merge_left) {
/*
- * |<----->| OR
- * |<--------->|
+ * |<------------>| OR
+ * |<----------------->|
* |-------*************
- * prev vma
+ * prev middle
* extend shrink/delete
*/
- vmg->vma = prev;
vmg->start = prev->vm_start;
vmg->pgoff = prev->vm_pgoff;
- if (!merge_will_delete_vma) {
- adjust = vma;
- adj_start = vmg->end - vma->vm_start;
- }
+ if (!vmg->__remove_middle)
+ vmg->__adjust_middle_start = true;
- err = dup_anon_vma(prev, vma, &anon_dup);
+ err = dup_anon_vma(prev, middle, &anon_dup);
} else { /* merge_right */
/*
- * |<----->| OR
- * |<--------->|
+ * |<------------->| OR
+ * |<----------------->|
* *************-------|
- * vma next
+ * middle next
* shrink/delete extend
*/
pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
- VM_WARN_ON(!merge_right);
- /* If we are offset into a VMA, then prev must be vma. */
- VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev);
+ VM_WARN_ON_VMG(!merge_right, vmg);
+ /* If we are offset into a VMA, then prev must be middle. */
+ VM_WARN_ON_VMG(vmg->start > middle->vm_start && prev && middle != prev, vmg);
- if (merge_will_delete_vma) {
- vmg->vma = next;
+ if (vmg->__remove_middle) {
vmg->end = next->vm_end;
vmg->pgoff = next->vm_pgoff - pglen;
} else {
- /*
- * We shrink vma and expand next.
- *
- * IMPORTANT: This is the ONLY case where the final
- * merged VMA is NOT vmg->vma, but rather vmg->next.
- */
-
- vmg->start = vma->vm_start;
+ /* We shrink middle and expand next. */
+ vmg->__adjust_next_start = true;
+ vmg->start = middle->vm_start;
vmg->end = start;
- vmg->pgoff = vma->vm_pgoff;
-
- adjust = next;
- adj_start = -(vma->vm_end - start);
+ vmg->pgoff = middle->vm_pgoff;
}
- err = dup_anon_vma(next, vma, &anon_dup);
+ err = dup_anon_vma(next, middle, &anon_dup);
}
- if (err)
+ if (err || commit_merge(vmg))
goto abort;
- /*
- * In nearly all cases, we expand vmg->vma. There is one exception -
- * merge_right where we partially span the VMA. In this case we shrink
- * the end of vmg->vma and adjust the start of vmg->next accordingly.
- */
- expanded = !merge_right || merge_will_delete_vma;
-
- if (commit_merge(vmg, adjust,
- merge_will_delete_vma ? vma : NULL,
- merge_will_delete_next ? next : NULL,
- adj_start, expanded)) {
- if (anon_dup)
- unlink_anon_vmas(anon_dup);
-
- vmg->state = VMA_MERGE_ERROR_NOMEM;
- return NULL;
- }
-
- res = merge_left ? prev : next;
- khugepaged_enter_vma(res, vmg->flags);
-
+ khugepaged_enter_vma(vmg->target, vmg->flags);
vmg->state = VMA_MERGE_SUCCESS;
- return res;
+ return vmg->target;
abort:
vma_iter_set(vmg->vmi, start);
vma_iter_load(vmg->vmi);
- vmg->state = VMA_MERGE_ERROR_NOMEM;
+
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
+
+ /*
+ * This means we have failed to clone anon_vmas correctly, but no
+ * actual changes to VMAs have occurred, so no harm no foul - if the
+ * user doesn't want this reported and instead just wants to give up on
+ * the merge, allow it.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return NULL;
}
@@ -968,12 +1042,11 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
struct vm_area_struct *next = vmg->next;
unsigned long end = vmg->end;
bool can_merge_left, can_merge_right;
- bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND;
mmap_assert_write_locked(vmg->mm);
- VM_WARN_ON(vmg->vma);
+ VM_WARN_ON_VMG(vmg->middle, vmg);
/* vmi must point at or before the gap. */
- VM_WARN_ON(vma_iter_addr(vmg->vmi) > end);
+ VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg);
vmg->state = VMA_MERGE_NOMERGE;
@@ -982,18 +1055,18 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
return NULL;
can_merge_left = can_vma_merge_left(vmg);
- can_merge_right = !just_expand && can_vma_merge_right(vmg, can_merge_left);
+ can_merge_right = !vmg->just_expand && can_vma_merge_right(vmg, can_merge_left);
/* If we can merge with the next VMA, adjust vmg accordingly. */
if (can_merge_right) {
vmg->end = next->vm_end;
- vmg->vma = next;
+ vmg->middle = next;
}
/* If we can merge with the previous VMA, adjust vmg accordingly. */
if (can_merge_left) {
vmg->start = prev->vm_start;
- vmg->vma = prev;
+ vmg->middle = prev;
vmg->pgoff = prev->vm_pgoff;
/*
@@ -1005,7 +1078,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
vmg->end = end;
/* In expand-only case we are already positioned at prev. */
- if (!just_expand) {
+ if (!vmg->just_expand) {
/* Equivalent to going to the previous range. */
vma_prev(vmg->vmi);
}
@@ -1015,10 +1088,10 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
* Now try to expand adjacent VMA(s). This takes care of removing the
* following VMA if we have VMAs on both sides.
*/
- if (vmg->vma && !vma_expand(vmg)) {
- khugepaged_enter_vma(vmg->vma, vmg->flags);
+ if (vmg->middle && !vma_expand(vmg)) {
+ khugepaged_enter_vma(vmg->middle, vmg->flags);
vmg->state = VMA_MERGE_SUCCESS;
- return vmg->vma;
+ return vmg->middle;
}
return NULL;
@@ -1030,53 +1103,68 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
* @vmg: Describes a VMA expansion operation.
*
* Expand @vma to vmg->start and vmg->end. Can expand off the start and end.
- * Will expand over vmg->next if it's different from vmg->vma and vmg->end ==
- * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with
+ * Will expand over vmg->next if it's different from vmg->middle and vmg->end ==
+ * vmg->next->vm_end. Checking if the vmg->middle can expand and merge with
* vmg->next needs to be handled by the caller.
*
* Returns: 0 on success.
*
* ASSUMPTIONS:
- * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
- * - The caller must have set @vmg->vma and @vmg->next.
+ * - The caller must hold a WRITE lock on vmg->middle->mm->mmap_lock.
+ * - The caller must have set @vmg->middle and @vmg->next.
*/
int vma_expand(struct vma_merge_struct *vmg)
{
struct vm_area_struct *anon_dup = NULL;
bool remove_next = false;
- struct vm_area_struct *vma = vmg->vma;
+ struct vm_area_struct *middle = vmg->middle;
struct vm_area_struct *next = vmg->next;
mmap_assert_write_locked(vmg->mm);
- vma_start_write(vma);
- if (next && (vma != next) && (vmg->end == next->vm_end)) {
+ vma_start_write(middle);
+ if (next && (middle != next) && (vmg->end == next->vm_end)) {
int ret;
remove_next = true;
/* This should already have been checked by this point. */
- VM_WARN_ON(!can_merge_remove_vma(next));
+ VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg);
vma_start_write(next);
- ret = dup_anon_vma(vma, next, &anon_dup);
+ /*
+ * In this case we don't report OOM, so vmg->give_up_on_oom is
+ * safe.
+ */
+ ret = dup_anon_vma(middle, next, &anon_dup);
if (ret)
return ret;
}
/* Not merging but overwriting any part of next is not handled. */
- VM_WARN_ON(next && !remove_next &&
- next != vma && vmg->end > next->vm_start);
+ VM_WARN_ON_VMG(next && !remove_next &&
+ next != middle && vmg->end > next->vm_start, vmg);
/* Only handles expanding */
- VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end);
+ VM_WARN_ON_VMG(middle->vm_start < vmg->start ||
+ middle->vm_end > vmg->end, vmg);
+
+ vmg->target = middle;
+ if (remove_next)
+ vmg->__remove_next = true;
- if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true))
+ if (commit_merge(vmg))
goto nomem;
return 0;
nomem:
- vmg->state = VMA_MERGE_ERROR_NOMEM;
if (anon_dup)
unlink_anon_vmas(anon_dup);
+ /*
+ * If the user requests that we just give up on OOM, we are safe to do so
+ * here, as commit_merge() provides this contract to us. Nothing has been
+ * changed - no harm no foul, just don't report it.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return -ENOMEM;
}
@@ -1108,7 +1196,7 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
init_vma_prep(&vp, vma);
vma_prepare(&vp);
- vma_adjust_trans_huge(vma, start, end, 0);
+ vma_adjust_trans_huge(vma, start, end, NULL);
vma_iter_clear(vmi);
vma_set_range(vma, start, end, pgoff);
@@ -1130,7 +1218,6 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
* were isolated before we downgraded mmap_lock.
*/
mas_set(mas_detach, 1);
- lru_add_drain();
tlb_gather_mmu(&tlb, vms->vma->vm_mm);
update_hiwater_rss(vms->vma->vm_mm);
unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
@@ -1198,7 +1285,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
/* Remove and clean up vmas */
mas_set(mas_detach, 0);
mas_for_each(mas_detach, vma, ULONG_MAX)
- remove_vma(vma, /* unreachable = */ false);
+ remove_vma(vma);
vm_unacct_memory(vms->nr_accounted);
validate_mm(mm);
@@ -1220,7 +1307,7 @@ static void reattach_vmas(struct ma_state *mas_detach)
mas_set(mas_detach, 0);
mas_for_each(mas_detach, vma, ULONG_MAX)
- vma_mark_detached(vma, false);
+ vma_mark_attached(vma);
__mt_destroy(mas_detach->tree);
}
@@ -1295,7 +1382,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
if (error)
goto munmap_gather_failed;
- vma_mark_detached(next, true);
+ vma_mark_detached(next);
nrpages = vma_pages(next);
vms->nr_pages += nrpages;
@@ -1507,25 +1594,36 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
*/
static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
{
- struct vm_area_struct *vma = vmg->vma;
+ struct vm_area_struct *vma = vmg->middle;
+ unsigned long start = vmg->start;
+ unsigned long end = vmg->end;
struct vm_area_struct *merged;
/* First, try to merge. */
merged = vma_merge_existing_range(vmg);
if (merged)
return merged;
+ if (vmg_nomem(vmg))
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Split can fail for reasons other than OOM, so if the user requests
+ * this it's probably a mistake.
+ */
+ VM_WARN_ON(vmg->give_up_on_oom &&
+ (vma->vm_start != start || vma->vm_end != end));
/* Split any preceding portion of the VMA. */
- if (vma->vm_start < vmg->start) {
- int err = split_vma(vmg->vmi, vma, vmg->start, 1);
+ if (vma->vm_start < start) {
+ int err = split_vma(vmg->vmi, vma, start, 1);
if (err)
return ERR_PTR(err);
}
/* Split any trailing portion of the VMA. */
- if (vma->vm_end > vmg->end) {
- int err = split_vma(vmg->vmi, vma, vmg->end, 0);
+ if (vma->vm_end > end) {
+ int err = split_vma(vmg->vmi, vma, end, 0);
if (err)
return ERR_PTR(err);
@@ -1583,12 +1681,15 @@ struct vm_area_struct
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx)
+ struct vm_userfaultfd_ctx new_ctx,
+ bool give_up_on_oom)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
vmg.flags = new_flags;
vmg.uffd_ctx = new_ctx;
+ if (give_up_on_oom)
+ vmg.give_up_on_oom = true;
return vma_modify(&vmg);
}
@@ -1604,7 +1705,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta);
vmg.next = vma_iter_next_rewind(vmi, NULL);
- vmg.vma = NULL; /* We use the VMA to populate VMG fields only. */
+ vmg.middle = NULL; /* We use the VMA to populate VMG fields only. */
return vma_merge_new_range(&vmg);
}
@@ -1689,7 +1790,7 @@ int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
return -ENOMEM;
vma_start_write(vma);
- vma_iter_store(&vmi, vma);
+ vma_iter_store_new(&vmi, vma);
vma_link_file(vma);
mm->map_count++;
validate_mm(mm);
@@ -1721,11 +1822,19 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
faulted_in_anon_vma = false;
}
+ /*
+ * If the VMA we are copying might contain a uprobe PTE, ensure
+ * that we do not establish one upon merge. Otherwise, when mremap()
+ * moves page tables, it will orphan the newly created PTE.
+ */
+ if (vma->vm_file)
+ vmg.skip_vma_uprobe = true;
+
new_vma = find_vma_prev(mm, addr, &vmg.prev);
if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
- vmg.vma = NULL; /* New VMA range. */
+ vmg.middle = NULL; /* New VMA range. */
vmg.pgoff = pgoff;
vmg.next = vma_iter_next_rewind(&vmi, NULL);
new_vma = vma_merge_new_range(&vmg);
@@ -1772,6 +1881,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return new_vma;
out_vma_link:
+ fixup_hugetlb_reservations(new_vma);
vma_close(new_vma);
if (new_vma->vm_file)
@@ -2288,6 +2398,10 @@ static int __mmap_new_file_vma(struct mmap_state *map,
int error;
vma->vm_file = get_file(map->file);
+
+ if (!map->file->f_op->mmap)
+ return 0;
+
error = mmap_file(vma->vm_file, vma);
if (error) {
fput(vma->vm_file);
@@ -2310,8 +2424,6 @@ static int __mmap_new_file_vma(struct mmap_state *map,
!(map->flags & VM_MAYWRITE) &&
(vma->vm_flags & VM_MAYWRITE));
- /* If the flags change (and are mergeable), let's retry later. */
- map->retry_merge = vma->vm_flags != map->flags && !(vma->vm_flags & VM_SPECIAL);
map->flags = vma->vm_flags;
return 0;
@@ -2344,7 +2456,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
vma_iter_config(vmi, map->addr, map->end);
vma_set_range(vma, map->addr, map->end, map->pgoff);
vm_flags_init(vma, map->flags);
- vma->vm_page_prot = vm_get_page_prot(map->flags);
+ vma->vm_page_prot = map->page_prot;
if (vma_iter_prealloc(vmi, vma)) {
error = -ENOMEM;
@@ -2368,7 +2480,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
/* Lock the VMA since it is modified after insertion into VMA tree */
vma_start_write(vma);
- vma_iter_store(vmi, vma);
+ vma_iter_store_new(vmi, vma);
map->mm->map_count++;
vma_link_file(vma);
@@ -2376,7 +2488,8 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
* vma_merge_new_range() calls khugepaged_enter_vma() too, the below
* call covers the non-merge case.
*/
- khugepaged_enter_vma(vma, map->flags);
+ if (!vma_is_anonymous(vma))
+ khugepaged_enter_vma(vma, map->flags);
ksm_add_vma(vma);
*vmap = vma;
return 0;
@@ -2430,17 +2543,70 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
vma_set_page_prot(vma);
}
-unsigned long __mmap_region(struct file *file, unsigned long addr,
+/*
+ * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that
+ * specifies it.
+ *
+ * This is called prior to any merge attempt, and updates the whitelisted
+ * fields that the hook is permitted to change.
+ *
+ * All but user-defined fields will be pre-populated with original values.
+ *
+ * Returns 0 on success, or an error code otherwise.
+ */
+static int call_mmap_prepare(struct mmap_state *map)
+{
+ int err;
+ struct vm_area_desc desc = {
+ .mm = map->mm,
+ .start = map->addr,
+ .end = map->end,
+
+ .pgoff = map->pgoff,
+ .file = map->file,
+ .vm_flags = map->flags,
+ .page_prot = map->page_prot,
+ };
+
+ /* Invoke the hook. */
+ err = __call_mmap_prepare(map->file, &desc);
+ if (err)
+ return err;
+
+ /* Update fields permitted to be changed. */
+ map->pgoff = desc.pgoff;
+ map->file = desc.file;
+ map->flags = desc.vm_flags;
+ map->page_prot = desc.page_prot;
+ /* User-defined fields. */
+ map->vm_ops = desc.vm_ops;
+ map->vm_private_data = desc.private_data;
+
+ return 0;
+}
+
+static void set_vma_user_defined_fields(struct vm_area_struct *vma,
+ struct mmap_state *map)
+{
+ if (map->vm_ops)
+ vma->vm_ops = map->vm_ops;
+ vma->vm_private_data = map->vm_private_data;
+}
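
For illustration, a minimal sketch (not part of this patch) of what a filesystem's .mmap_prepare() hook might look like, showing which vm_area_desc fields it is expected to touch; the names myfs_file_mmap_prepare and myfs_vm_ops are invented:

	static int myfs_file_mmap_prepare(struct vm_area_desc *desc)
	{
		/* Immutable fields such as desc->start/end/mm describe the mapping. */
		if (desc->vm_flags & VM_EXEC)
			return -EPERM;

		/* Whitelisted mutable fields, copied back by call_mmap_prepare(). */
		desc->vm_flags |= VM_DONTEXPAND;
		desc->page_prot = vm_get_page_prot(desc->vm_flags);

		/* User-defined fields, applied to the VMA after any merge attempt. */
		desc->vm_ops = &myfs_vm_ops;
		desc->private_data = desc->file->private_data;

		return 0;
	}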
+
+static unsigned long __mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
int error;
+ bool have_mmap_prepare = file && file->f_op->mmap_prepare;
VMA_ITERATOR(vmi, mm, addr);
MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
error = __mmap_prepare(&map, uf);
+ if (!error && have_mmap_prepare)
+ error = call_mmap_prepare(&map);
if (error)
goto abort_munmap;
@@ -2458,13 +2624,8 @@ unsigned long __mmap_region(struct file *file, unsigned long addr,
goto unacct_error;
}
- /* If flags changed, we might be able to merge, so try again. */
- if (map.retry_merge) {
- VMG_MMAP_STATE(vmg, &map, vma);
-
- vma_iter_config(map.vmi, map.addr, map.end);
- vma_merge_existing_range(&vmg);
- }
+ if (have_mmap_prepare)
+ set_vma_user_defined_fields(vma, &map);
__mmap_complete(&map, vma);
@@ -2478,3 +2639,518 @@ abort_munmap:
vms_abort_munmap_vmas(&map.vms, &map.mas_detach);
return error;
}
+
+/**
+ * mmap_region() - Actually perform the userland mapping of a VMA into
+ * current->mm with known, aligned and overflow-checked @addr and @len, and
+ * correctly determined VMA flags @vm_flags and page offset @pgoff.
+ *
+ * This is an internal memory management function, and should not be used
+ * directly.
+ *
+ * The caller must write-lock current->mm->mmap_lock.
+ *
+ * @file: If a file-backed mapping, a pointer to the struct file describing the
+ * file to be mapped, otherwise NULL.
+ * @addr: The page-aligned address at which to perform the mapping.
+ * @len: The page-aligned, non-zero, length of the mapping.
+ * @vm_flags: The VMA flags which should be applied to the mapping.
+ * @pgoff: If @file is specified, the page offset into the file, if not then
+ * the virtual page offset in memory of the anonymous mapping.
+ * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap
+ * events.
+ *
+ * Returns: Either an error, or the address at which the requested mapping has
+ * been performed.
+ */
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+ struct list_head *uf)
+{
+ unsigned long ret;
+ bool writable_file_mapping = false;
+
+ mmap_assert_write_locked(current->mm);
+
+ /* Check to see if MDWE is applicable. */
+ if (map_deny_write_exec(vm_flags, vm_flags))
+ return -EACCES;
+
+ /* Allow architectures to sanity-check the vm_flags. */
+ if (!arch_validate_flags(vm_flags))
+ return -EINVAL;
+
+ /* Map writable and ensure this isn't a sealed memfd. */
+ if (file && is_shared_maywrite(vm_flags)) {
+ int error = mapping_map_writable(file->f_mapping);
+
+ if (error)
+ return error;
+ writable_file_mapping = true;
+ }
+
+ ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);
+
+ /* Clear our write mapping regardless of error. */
+ if (writable_file_mapping)
+ mapping_unmap_writable(file->f_mapping);
+
+ validate_mm(current->mm);
+ return ret;
+}
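
Purely for illustration (not part of this patch), the shape of a typical internal caller; the helper name example_map_region() is invented, and real callers such as do_mmap() perform considerably more validation before reaching mmap_region():

	static unsigned long example_map_region(struct file *file, unsigned long addr,
			unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
	{
		LIST_HEAD(uf);
		unsigned long ret;

		/* mmap_region() requires the mmap write lock to be held. */
		mmap_write_lock(current->mm);
		ret = mmap_region(file, addr, len, vm_flags, pgoff, &uf);
		mmap_write_unlock(current->mm);

		/* Deliver any userfaultfd unmap events gathered on @uf. */
		userfaultfd_unmap_complete(current->mm, &uf);

		return ret;
	}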
+
+/*
+ * do_brk_flags() - Increase the brk vma if the flags match.
+ * @vmi: The vma iterator
+ * @addr: The start address
+ * @len: The length of the increase
+ * @vma: The vma
+ * @flags: The VMA Flags
+ *
+ * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
+ * do not match then create a new anonymous VMA. Eventually we may be able to
+ * do some brk-specific accounting here.
+ */
+int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long len, unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+
+ /*
+ * Check against address space limits by the changed size
+ * Note: This happens *after* clearing old mappings in some code paths.
+ */
+ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+ if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
+ return -ENOMEM;
+
+ if (mm->map_count > sysctl_max_map_count)
+ return -ENOMEM;
+
+ if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
+ return -ENOMEM;
+
+ /*
+ * Expand the existing vma if possible; note that singular lists do not
+ * occur after forking, so the expand will only happen on new VMAs.
+ */
+ if (vma && vma->vm_end == addr) {
+ VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));
+
+ vmg.prev = vma;
+ /* vmi is positioned at prev, which this mode expects. */
+ vmg.just_expand = true;
+
+ if (vma_merge_new_range(&vmg))
+ goto out;
+ else if (vmg_nomem(&vmg))
+ goto unacct_fail;
+ }
+
+ if (vma)
+ vma_iter_next_range(vmi);
+ /* create a vma struct for an anonymous mapping */
+ vma = vm_area_alloc(mm);
+ if (!vma)
+ goto unacct_fail;
+
+ vma_set_anonymous(vma);
+ vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
+ vm_flags_init(vma, flags);
+ vma->vm_page_prot = vm_get_page_prot(flags);
+ vma_start_write(vma);
+ if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
+ goto mas_store_fail;
+
+ mm->map_count++;
+ validate_mm(mm);
+ ksm_add_vma(vma);
+out:
+ perf_event_mmap(vma);
+ mm->total_vm += len >> PAGE_SHIFT;
+ mm->data_vm += len >> PAGE_SHIFT;
+ if (flags & VM_LOCKED)
+ mm->locked_vm += (len >> PAGE_SHIFT);
+ vm_flags_set(vma, VM_SOFTDIRTY);
+ return 0;
+
+mas_store_fail:
+ vm_area_free(vma);
+unacct_fail:
+ vm_unacct_memory(len >> PAGE_SHIFT);
+ return -ENOMEM;
+}
+
+/**
+ * unmapped_area() - Find an area between the low_limit and the high_limit with
+ * the correct alignment and offset, all from @info. Note: current->mm is used
+ * for the search.
+ *
+ * @info: The unmapped area information including the range [low_limit,
+ * high_limit), the alignment offset and mask.
+ *
+ * Return: A memory address or -ENOMEM.
+ */
+unsigned long unmapped_area(struct vm_unmapped_area_info *info)
+{
+ unsigned long length, gap;
+ unsigned long low_limit, high_limit;
+ struct vm_area_struct *tmp;
+ VMA_ITERATOR(vmi, current->mm, 0);
+
+ /* Adjust search length to account for worst case alignment overhead */
+ length = info->length + info->align_mask + info->start_gap;
+ if (length < info->length)
+ return -ENOMEM;
+
+ low_limit = info->low_limit;
+ if (low_limit < mmap_min_addr)
+ low_limit = mmap_min_addr;
+ high_limit = info->high_limit;
+retry:
+ if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
+ return -ENOMEM;
+
+ /*
+ * Adjust for the gap first so it doesn't interfere with the
+ * later alignment. The first step is the minimum needed to
+ * fulfill the start gap, the next step is the minimum needed to
+ * align that. Together this is the minimum needed to fulfill both.
+ */
+ gap = vma_iter_addr(&vmi) + info->start_gap;
+ gap += (info->align_offset - gap) & info->align_mask;
+ tmp = vma_next(&vmi);
+ if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
+ if (vm_start_gap(tmp) < gap + length - 1) {
+ low_limit = tmp->vm_end;
+ vma_iter_reset(&vmi);
+ goto retry;
+ }
+ } else {
+ tmp = vma_prev(&vmi);
+ if (tmp && vm_end_gap(tmp) > gap) {
+ low_limit = vm_end_gap(tmp);
+ vma_iter_reset(&vmi);
+ goto retry;
+ }
+ }
+
+ return gap;
+}
+
+/**
+ * unmapped_area_topdown() - Find an area between the low_limit and the
+ * high_limit with the correct alignment and offset at the highest available
+ * address, all from @info. Note: current->mm is used for the search.
+ *
+ * @info: The unmapped area information including the range [low_limit,
+ * high_limit), the alignment offset and mask.
+ *
+ * Return: A memory address or -ENOMEM.
+ */
+unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
+{
+ unsigned long length, gap, gap_end;
+ unsigned long low_limit, high_limit;
+ struct vm_area_struct *tmp;
+ VMA_ITERATOR(vmi, current->mm, 0);
+
+ /* Adjust search length to account for worst case alignment overhead */
+ length = info->length + info->align_mask + info->start_gap;
+ if (length < info->length)
+ return -ENOMEM;
+
+ low_limit = info->low_limit;
+ if (low_limit < mmap_min_addr)
+ low_limit = mmap_min_addr;
+ high_limit = info->high_limit;
+retry:
+ if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
+ return -ENOMEM;
+
+ gap = vma_iter_end(&vmi) - info->length;
+ gap -= (gap - info->align_offset) & info->align_mask;
+ gap_end = vma_iter_end(&vmi);
+ tmp = vma_next(&vmi);
+ if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
+ if (vm_start_gap(tmp) < gap_end) {
+ high_limit = vm_start_gap(tmp);
+ vma_iter_reset(&vmi);
+ goto retry;
+ }
+ } else {
+ tmp = vma_prev(&vmi);
+ if (tmp && vm_end_gap(tmp) > gap) {
+ high_limit = tmp->vm_start;
+ vma_iter_reset(&vmi);
+ goto retry;
+ }
+ }
+
+ return gap;
+}
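
As a worked illustration (not part of this patch), this is roughly how an architecture's get_unmapped_area() implementation fills struct vm_unmapped_area_info before one of the two searches above runs via vm_unmapped_area(); example_get_unmapped_area() is a hypothetical name and the limits shown are simplified assumptions:

	static unsigned long example_get_unmapped_area(struct file *filp,
			unsigned long addr, unsigned long len,
			unsigned long pgoff, unsigned long flags)
	{
		struct vm_unmapped_area_info info = {};

		info.length = len;
		info.low_limit = current->mm->mmap_base;
		info.high_limit = TASK_SIZE;
		info.align_mask = 0;	/* no special alignment needed */
		info.align_offset = 0;
		info.start_gap = 0;

		/* Dispatches to unmapped_area() or unmapped_area_topdown(). */
		return vm_unmapped_area(&info);
	}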
+
+/*
+ * Verify that the stack growth is acceptable and
+ * update accounting. This is shared with both the
+ * grow-up and grow-down cases.
+ */
+static int acct_stack_growth(struct vm_area_struct *vma,
+ unsigned long size, unsigned long grow)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long new_start;
+
+ /* address space limit tests */
+ if (!may_expand_vm(mm, vma->vm_flags, grow))
+ return -ENOMEM;
+
+ /* Stack limit test */
+ if (size > rlimit(RLIMIT_STACK))
+ return -ENOMEM;
+
+ /* mlock limit tests */
+ if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
+ return -ENOMEM;
+
+ /* Check to ensure the stack will not grow into a hugetlb-only region */
+ new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
+ vma->vm_end - size;
+ if (is_hugepage_only_range(vma->vm_mm, new_start, size))
+ return -EFAULT;
+
+ /*
+ * Overcommit. This must be the final test, as it will
+ * update security statistics.
+ */
+ if (security_vm_enough_memory_mm(mm, grow))
+ return -ENOMEM;
+
+ return 0;
+}
+
+#if defined(CONFIG_STACK_GROWSUP)
+/*
+ * PA-RISC uses this for its stack.
+ * vma is the last one with address > vma->vm_end. Have to extend vma.
+ */
+int expand_upwards(struct vm_area_struct *vma, unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *next;
+ unsigned long gap_addr;
+ int error = 0;
+ VMA_ITERATOR(vmi, mm, vma->vm_start);
+
+ if (!(vma->vm_flags & VM_GROWSUP))
+ return -EFAULT;
+
+ mmap_assert_write_locked(mm);
+
+ /* Guard against exceeding limits of the address space. */
+ address &= PAGE_MASK;
+ if (address >= (TASK_SIZE & PAGE_MASK))
+ return -ENOMEM;
+ address += PAGE_SIZE;
+
+ /* Enforce stack_guard_gap */
+ gap_addr = address + stack_guard_gap;
+
+ /* Guard against overflow */
+ if (gap_addr < address || gap_addr > TASK_SIZE)
+ gap_addr = TASK_SIZE;
+
+ next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+ if (next && vma_is_accessible(next)) {
+ if (!(next->vm_flags & VM_GROWSUP))
+ return -ENOMEM;
+ /* Check that both stack segments have the same anon_vma? */
+ }
+
+ if (next)
+ vma_iter_prev_range_limit(&vmi, address);
+
+ vma_iter_config(&vmi, vma->vm_start, address);
+ if (vma_iter_prealloc(&vmi, vma))
+ return -ENOMEM;
+
+ /* We must make sure the anon_vma is allocated. */
+ if (unlikely(anon_vma_prepare(vma))) {
+ vma_iter_free(&vmi);
+ return -ENOMEM;
+ }
+
+ /* Lock the VMA before expanding to prevent concurrent page faults */
+ vma_start_write(vma);
+ /* We update the anon VMA tree. */
+ anon_vma_lock_write(vma->anon_vma);
+
+ /* Somebody else might have raced and expanded it already */
+ if (address > vma->vm_end) {
+ unsigned long size, grow;
+
+ size = address - vma->vm_start;
+ grow = (address - vma->vm_end) >> PAGE_SHIFT;
+
+ error = -ENOMEM;
+ if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm += grow;
+ vm_stat_account(mm, vma->vm_flags, grow);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ vma->vm_end = address;
+ /* Overwrite old entry in mtree. */
+ vma_iter_store_overwrite(&vmi, vma);
+ anon_vma_interval_tree_post_update_vma(vma);
+
+ perf_event_mmap(vma);
+ }
+ }
+ }
+ anon_vma_unlock_write(vma->anon_vma);
+ vma_iter_free(&vmi);
+ validate_mm(mm);
+ return error;
+}
+#endif /* CONFIG_STACK_GROWSUP */
+
+/*
+ * vma is the first one with address < vma->vm_start. Have to extend vma.
+ * mmap_lock held for writing.
+ */
+int expand_downwards(struct vm_area_struct *vma, unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *prev;
+ int error = 0;
+ VMA_ITERATOR(vmi, mm, vma->vm_start);
+
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ return -EFAULT;
+
+ mmap_assert_write_locked(mm);
+
+ address &= PAGE_MASK;
+ if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
+ return -EPERM;
+
+ /* Enforce stack_guard_gap */
+ prev = vma_prev(&vmi);
+ /* Check that both stack segments have the same anon_vma? */
+ if (prev) {
+ if (!(prev->vm_flags & VM_GROWSDOWN) &&
+ vma_is_accessible(prev) &&
+ (address - prev->vm_end < stack_guard_gap))
+ return -ENOMEM;
+ }
+
+ if (prev)
+ vma_iter_next_range_limit(&vmi, vma->vm_start);
+
+ vma_iter_config(&vmi, address, vma->vm_end);
+ if (vma_iter_prealloc(&vmi, vma))
+ return -ENOMEM;
+
+ /* We must make sure the anon_vma is allocated. */
+ if (unlikely(anon_vma_prepare(vma))) {
+ vma_iter_free(&vmi);
+ return -ENOMEM;
+ }
+
+ /* Lock the VMA before expanding to prevent concurrent page faults */
+ vma_start_write(vma);
+ /* We update the anon VMA tree. */
+ anon_vma_lock_write(vma->anon_vma);
+
+ /* Somebody else might have raced and expanded it already */
+ if (address < vma->vm_start) {
+ unsigned long size, grow;
+
+ size = vma->vm_end - address;
+ grow = (vma->vm_start - address) >> PAGE_SHIFT;
+
+ error = -ENOMEM;
+ if (grow <= vma->vm_pgoff) {
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm += grow;
+ vm_stat_account(mm, vma->vm_flags, grow);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ vma->vm_start = address;
+ vma->vm_pgoff -= grow;
+ /* Overwrite old entry in mtree. */
+ vma_iter_store_overwrite(&vmi, vma);
+ anon_vma_interval_tree_post_update_vma(vma);
+
+ perf_event_mmap(vma);
+ }
+ }
+ }
+ anon_vma_unlock_write(vma->anon_vma);
+ vma_iter_free(&vmi);
+ validate_mm(mm);
+ return error;
+}
+
+int __vm_munmap(unsigned long start, size_t len, bool unlock)
+{
+ int ret;
+ struct mm_struct *mm = current->mm;
+ LIST_HEAD(uf);
+ VMA_ITERATOR(vmi, mm, start);
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock);
+ if (ret || !unlock)
+ mmap_write_unlock(mm);
+
+ userfaultfd_unmap_complete(mm, &uf);
+ return ret;
+}
+
+/*
+ * Insert vm structure into process list sorted by address
+ * and into the inode's i_mmap tree. If vm_file is non-NULL
+ * then i_mmap_rwsem is taken here.
+ */
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ unsigned long charged = vma_pages(vma);
+
+ if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
+ return -ENOMEM;
+
+ if ((vma->vm_flags & VM_ACCOUNT) &&
+ security_vm_enough_memory_mm(mm, charged))
+ return -ENOMEM;
+
+ /*
+ * The vm_pgoff of a purely anonymous vma should be irrelevant
+ * until its first write fault, when page's anon_vma and index
+ * are set. But now set the vm_pgoff it will almost certainly
+ * end up with (unless mremap moves it elsewhere before that
+ * first wfault), so /proc/pid/maps tells a consistent story.
+ *
+ * By setting it to reflect the virtual start address of the
+ * vma, merges and splits can happen in a seamless way, just
+ * using the existing file pgoff checks and manipulations.
+ * Similarly in do_mmap and in do_brk_flags.
+ */
+ if (vma_is_anonymous(vma)) {
+ BUG_ON(vma->anon_vma);
+ vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
+ }
+
+ if (vma_link(mm, vma)) {
+ if (vma->vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(charged);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
diff --git a/mm/vma.h b/mm/vma.h
index 388d34748674..f47112a352db 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -19,6 +19,8 @@ struct vma_prepare {
struct vm_area_struct *insert;
struct vm_area_struct *remove;
struct vm_area_struct *remove2;
+
+ bool skip_vma_uprobe :1;
};
struct unlink_vma_file_batch {
@@ -58,35 +60,96 @@ enum vma_merge_state {
VMA_MERGE_SUCCESS,
};
-enum vma_merge_flags {
- VMG_FLAG_DEFAULT = 0,
- /*
- * If we can expand, simply do so. We know there is nothing to merge to
- * the right. Does not reset state upon failure to merge. The VMA
- * iterator is assumed to be positioned at the previous VMA, rather than
- * at the gap.
- */
- VMG_FLAG_JUST_EXPAND = 1 << 0,
-};
-
-/* Represents a VMA merge operation. */
+/*
+ * Describes a VMA merge operation and is threaded throughout it.
+ *
+ * Any of the fields may be mutated by the merge operation, so no guarantees are
+ * made regarding the contents of this structure once a merge operation has
+ * completed.
+ */
struct vma_merge_struct {
struct mm_struct *mm;
struct vma_iterator *vmi;
- pgoff_t pgoff;
+ /*
+ * Adjacent VMAs, any of which may be NULL if not present:
+ *
+ * |------|--------|------|
+ * | prev | middle | next |
+ * |------|--------|------|
+ *
+ * middle may not yet exist in the case of a proposed new VMA being
+ * merged, or it may be an existing VMA.
+ *
+ * next may be assigned by the caller.
+ */
struct vm_area_struct *prev;
- struct vm_area_struct *next; /* Modified by vma_merge(). */
- struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */
+ struct vm_area_struct *middle;
+ struct vm_area_struct *next;
+ /* This is the VMA we ultimately target to become the merged VMA. */
+ struct vm_area_struct *target;
+ /*
+ * Initially, the start, end, pgoff fields are provided by the caller
+ * and describe the proposed new VMA range, whether modifying an
+ * existing VMA (which will be 'middle'), or adding a new one.
+ *
+ * During the merge process these fields are updated to describe the new
+ * range _including those VMAs which will be merged_.
+ */
unsigned long start;
unsigned long end;
+ pgoff_t pgoff;
+
unsigned long flags;
struct file *file;
struct anon_vma *anon_vma;
struct mempolicy *policy;
struct vm_userfaultfd_ctx uffd_ctx;
struct anon_vma_name *anon_name;
- enum vma_merge_flags merge_flags;
enum vma_merge_state state;
+
+ /* Flags which callers can use to modify merge behaviour: */
+
+ /*
+ * If we can expand, simply do so. We know there is nothing to merge to
+ * the right. Does not reset state upon failure to merge. The VMA
+ * iterator is assumed to be positioned at the previous VMA, rather than
+ * at the gap.
+ */
+ bool just_expand :1;
+
+ /*
+ * If a merge is possible, but an OOM error occurs, give up and don't
+ * execute the merge, returning NULL.
+ */
+ bool give_up_on_oom :1;
+
+ /*
+ * If set, skip uprobe_mmap() on the merged VMA.
+ */
+ bool skip_vma_uprobe :1;
+
+ /* Internal flags set during merge process: */
+
+ /*
+ * Internal flag indicating the merge increases vmg->middle->vm_start
+ * (and thereby, vmg->prev->vm_end).
+ */
+ bool __adjust_middle_start :1;
+ /*
+ * Internal flag indicating the merge decreases vmg->next->vm_start
+ * (and thereby, vmg->middle->vm_end).
+ */
+ bool __adjust_next_start :1;
+ /*
+ * Internal flag used during the merge operation to indicate we will
+ * remove vmg->middle.
+ */
+ bool __remove_middle :1;
+ /*
+ * Internal flag used during the merge operation to indicate we will
+ * remove vmg->next.
+ */
+ bool __remove_next :1;
+
};
static inline bool vmg_nomem(struct vma_merge_struct *vmg)
@@ -110,7 +173,6 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.flags = flags_, \
.pgoff = pgoff_, \
.state = VMA_MERGE_START, \
- .merge_flags = VMG_FLAG_DEFAULT, \
}
#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \
@@ -118,8 +180,8 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.mm = vma_->vm_mm, \
.vmi = vmi_, \
.prev = prev_, \
+ .middle = vma_, \
.next = NULL, \
- .vma = vma_, \
.start = start_, \
.end = end_, \
.flags = vma_->vm_flags, \
@@ -130,7 +192,6 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.uffd_ctx = vma_->vm_userfaultfd_ctx, \
.anon_name = anon_vma_name(vma_), \
.state = VMA_MERGE_START, \
- .merge_flags = VMG_FLAG_DEFAULT, \
}
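
To illustrate how these initializers are used (a sketch only, not part of this patch), a caller modifying a sub-range of an existing VMA sets up the state with VMG_VMA_STATE(), where 'vma' becomes vmg.middle, and tweaks the optional flags before attempting the merge; example_modify_flags() is a hypothetical wrapper equivalent in spirit to vma_modify_flags():

	static struct vm_area_struct *example_modify_flags(struct vma_iterator *vmi,
			struct vm_area_struct *prev, struct vm_area_struct *vma,
			unsigned long start, unsigned long end,
			unsigned long new_flags)
	{
		VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);

		vmg.flags = new_flags;
		/* Optionally tolerate -ENOMEM silently rather than reporting it. */
		vmg.give_up_on_oom = true;

		/* vma_merge_existing_range() is internal to mm/vma.c; shown for clarity. */
		return vma_merge_existing_range(&vmg);
	}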
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
@@ -139,15 +200,10 @@ void validate_mm(struct mm_struct *mm);
#define validate_mm(mm) do { } while (0)
#endif
-/* Required for expand_downwards(). */
-void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
-
-/* Required for expand_downwards(). */
-void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
-
-int vma_expand(struct vma_merge_struct *vmg);
-int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, pgoff_t pgoff);
+__must_check int vma_expand(struct vma_merge_struct *vmg);
+__must_check int vma_shrink(struct vma_iterator *vmi,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, pgoff_t pgoff);
static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
struct vm_area_struct *vma, gfp_t gfp)
@@ -162,9 +218,57 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
if (unlikely(mas_is_err(&vmi->mas)))
return -ENOMEM;
+ vma_mark_attached(vma);
return 0;
}
+
+/*
+ * Temporary helper functions for file systems which wrap an invocation of
+ * f_op->mmap() but which might have an underlying file system which implements
+ * f_op->mmap_prepare().
+ */
+
+static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma,
+ struct vm_area_desc *desc)
+{
+ desc->mm = vma->vm_mm;
+ desc->start = vma->vm_start;
+ desc->end = vma->vm_end;
+
+ desc->pgoff = vma->vm_pgoff;
+ desc->file = vma->vm_file;
+ desc->vm_flags = vma->vm_flags;
+ desc->page_prot = vma->vm_page_prot;
+
+ desc->vm_ops = NULL;
+ desc->private_data = NULL;
+
+ return desc;
+}
+
+static inline void set_vma_from_desc(struct vm_area_struct *vma,
+ struct vm_area_desc *desc)
+{
+ /*
+ * Since we're invoking .mmap_prepare() despite having a partially
+ * established VMA, we must take care to handle setting fields
+ * correctly.
+ */
+
+ /* Mutable fields. Populated with initial state. */
+ vma->vm_pgoff = desc->pgoff;
+ if (vma->vm_file != desc->file)
+ vma_set_file(vma, desc->file);
+ if (vma->vm_flags != desc->vm_flags)
+ vm_flags_set(vma, desc->vm_flags);
+ vma->vm_page_prot = desc->page_prot;
+
+ /* User-defined fields. */
+ vma->vm_ops = desc->vm_ops;
+ vma->vm_private_data = desc->private_data;
+}
+
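For orientation, the following is a hedged sketch (not part of this patch) of how a wrapper that only holds a partially established VMA might use the two helpers above to invoke an underlying file's f_op->mmap_prepare() hook; the wrapper name is hypothetical and the hook is assumed to take the descriptor pointer.

	/*
	 * Hedged sketch only: bridge a legacy ->mmap()-style call site to an
	 * underlying f_op->mmap_prepare(). The function name is hypothetical.
	 */
	static int example_compat_mmap_prepare(struct file *file,
					       struct vm_area_struct *vma)
	{
		struct vm_area_desc desc;
		int err;

		/* Snapshot the partially established VMA into a descriptor. */
		err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc));
		if (err)
			return err;

		/* Copy back whatever the hook set (vm_ops, flags, private data). */
		set_vma_from_desc(vma, &desc);
		return 0;
	}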
int
do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm, unsigned long start,
@@ -174,19 +278,20 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
bool unlock);
-void remove_vma(struct vm_area_struct *vma, bool unreachable);
+void remove_vma(struct vm_area_struct *vma);
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct vm_area_struct *next);
/* We are about to modify the VMA's flags. */
-struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
+__must_check struct vm_area_struct
+*vma_modify_flags(struct vma_iterator *vmi,
struct vm_area_struct *prev, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long new_flags);
/* We are about to modify the VMA's flags and/or anon_name. */
-struct vm_area_struct
+__must_check struct vm_area_struct
*vma_modify_flags_name(struct vma_iterator *vmi,
struct vm_area_struct *prev,
struct vm_area_struct *vma,
@@ -196,7 +301,7 @@ struct vm_area_struct
struct anon_vma_name *new_name);
/* We are about to modify the VMA's memory policy. */
-struct vm_area_struct
+__must_check struct vm_area_struct
*vma_modify_policy(struct vma_iterator *vmi,
struct vm_area_struct *prev,
struct vm_area_struct *vma,
@@ -204,19 +309,22 @@ struct vm_area_struct
struct mempolicy *new_pol);
/* We are about to modify the VMA's flags and/or uffd context. */
-struct vm_area_struct
+__must_check struct vm_area_struct
*vma_modify_flags_uffd(struct vma_iterator *vmi,
struct vm_area_struct *prev,
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx);
+ struct vm_userfaultfd_ctx new_ctx,
+ bool give_up_on_oom);
-struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg);
+__must_check struct vm_area_struct
+*vma_merge_new_range(struct vma_merge_struct *vmg);
-struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
- struct vm_area_struct *vma,
- unsigned long delta);
+__must_check struct vm_area_struct
+*vma_merge_extend(struct vma_iterator *vmi,
+ struct vm_area_struct *vma,
+ unsigned long delta);
void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb);
@@ -243,10 +351,16 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
int mm_take_all_locks(struct mm_struct *mm);
void mm_drop_all_locks(struct mm_struct *mm);
-unsigned long __mmap_region(struct file *file, unsigned long addr,
+unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf);
+int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma,
+ unsigned long addr, unsigned long request, unsigned long flags);
+
+unsigned long unmapped_area(struct vm_unmapped_area_info *info);
+unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info);
+
static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
{
/*
@@ -360,9 +474,10 @@ static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
}
/* Store a VMA with preallocated memory */
-static inline void vma_iter_store(struct vma_iterator *vmi,
- struct vm_area_struct *vma)
+static inline void vma_iter_store_overwrite(struct vma_iterator *vmi,
+ struct vm_area_struct *vma)
{
+ vma_assert_attached(vma);
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
@@ -387,6 +502,13 @@ static inline void vma_iter_store(struct vma_iterator *vmi,
mas_store_prealloc(&vmi->mas, vma);
}
+static inline void vma_iter_store_new(struct vma_iterator *vmi,
+ struct vm_area_struct *vma)
+{
+ vma_mark_attached(vma);
+ vma_iter_store_overwrite(vmi, vma);
+}
+
static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
{
return vmi->mas.index;
@@ -472,4 +594,27 @@ static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
#endif
+#if defined(CONFIG_STACK_GROWSUP)
+int expand_upwards(struct vm_area_struct *vma, unsigned long address);
+#endif
+
+int expand_downwards(struct vm_area_struct *vma, unsigned long address);
+
+int __vm_munmap(unsigned long start, size_t len, bool unlock);
+
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);
+
+/* vma_init.h, shared between CONFIG_MMU and nommu. */
+void __init vma_state_init(void);
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm);
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig);
+void vm_area_free(struct vm_area_struct *vma);
+
+/* vma_exec.c */
+#ifdef CONFIG_MMU
+int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+ unsigned long *top_mem_p);
+int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
+#endif
+
#endif /* __MM_VMA_H */
diff --git a/mm/vma_exec.c b/mm/vma_exec.c
new file mode 100644
index 000000000000..2dffb02ed6a2
--- /dev/null
+++ b/mm/vma_exec.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Functions explicitly implemented for exec functionality which however are
+ * explicitly VMA-only logic.
+ */
+
+#include "vma_internal.h"
+#include "vma.h"
+
+/*
+ * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
+ * this VMA and its relocated range, which will now reside at [vma->vm_start -
+ * shift, vma->vm_end - shift).
+ *
+ * This function is almost certainly NOT what you want for anything other than
+ * early executable temporary stack relocation.
+ */
+int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
+{
+ /*
+ * The process proceeds as follows:
+ *
+ * 1) Use shift to calculate the new vma endpoints.
+ * 2) Extend vma to cover both the old and new ranges. This ensures the
+ * arguments passed to subsequent functions are consistent.
+ * 3) Move vma's page tables to the new range.
+ * 4) Free up any cleared pgd range.
+ * 5) Shrink the vma to cover only the new range.
+ */
+
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long old_start = vma->vm_start;
+ unsigned long old_end = vma->vm_end;
+ unsigned long length = old_end - old_start;
+ unsigned long new_start = old_start - shift;
+ unsigned long new_end = old_end - shift;
+ VMA_ITERATOR(vmi, mm, new_start);
+ VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
+ struct vm_area_struct *next;
+ struct mmu_gather tlb;
+ PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
+
+ BUG_ON(new_start > new_end);
+
+ /*
+ * ensure there are no vmas between where we want to go
+ * and where we are
+ */
+ if (vma != vma_next(&vmi))
+ return -EFAULT;
+
+ vma_iter_prev_range(&vmi);
+ /*
+ * cover the whole range: [new_start, old_end)
+ */
+ vmg.middle = vma;
+ if (vma_expand(&vmg))
+ return -ENOMEM;
+
+ /*
+ * move the page tables downwards, on failure we rely on
+ * process cleanup to remove whatever mess we made.
+ */
+ pmc.for_stack = true;
+ if (length != move_page_tables(&pmc))
+ return -ENOMEM;
+
+ tlb_gather_mmu(&tlb, mm);
+ next = vma_next(&vmi);
+ if (new_end > old_start) {
+ /*
+ * when the old and new regions overlap clear from new_end.
+ */
+ free_pgd_range(&tlb, new_end, old_end, new_end,
+ next ? next->vm_start : USER_PGTABLES_CEILING);
+ } else {
+ /*
+ * otherwise, clean from old_start; this is done to not touch
+ * the address space in [new_end, old_start); some architectures
+ * have constraints on va-space that make this illegal (IA64) -
+ * for the others it's just a little faster.
+ */
+ free_pgd_range(&tlb, old_start, old_end, new_end,
+ next ? next->vm_start : USER_PGTABLES_CEILING);
+ }
+ tlb_finish_mmu(&tlb);
+
+ vma_prev(&vmi);
+ /* Shrink the vma to just the new range */
+ return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
+}
+
+/*
+ * Establish the stack VMA in an execve'd process, located temporarily at the
+ * maximum stack address provided by the architecture.
+ *
+ * We later relocate this downwards in relocate_vma_down().
+ *
+ * This function is almost certainly NOT what you want for anything other than
+ * early executable initialisation.
+ *
+ * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the
+ * maximum addressable location in the stack (that is capable of storing a
+ * system word of data).
+ */
+int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+ unsigned long *top_mem_p)
+{
+ int err;
+ struct vm_area_struct *vma = vm_area_alloc(mm);
+
+ if (!vma)
+ return -ENOMEM;
+
+ vma_set_anonymous(vma);
+
+ if (mmap_write_lock_killable(mm)) {
+ err = -EINTR;
+ goto err_free;
+ }
+
+ /*
+ * Needs to be called with the mmap write lock
+ * held, to avoid a race with ksmd.
+ */
+ err = ksm_execve(mm);
+ if (err)
+ goto err_ksm;
+
+ /*
+ * Place the stack at the largest stack address the architecture
+ * supports. Later, we'll move this to an appropriate place. We don't
+ * use STACK_TOP because that can depend on attributes which aren't
+ * configured yet.
+ */
+ BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
+ vma->vm_end = STACK_TOP_MAX;
+ vma->vm_start = vma->vm_end - PAGE_SIZE;
+ vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+
+ err = insert_vm_struct(mm, vma);
+ if (err)
+ goto err;
+
+ mm->stack_vm = mm->total_vm = 1;
+ mmap_write_unlock(mm);
+ *vmap = vma;
+ *top_mem_p = vma->vm_end - sizeof(void *);
+ return 0;
+
+err:
+ ksm_exit(mm);
+err_ksm:
+ mmap_write_unlock(mm);
+err_free:
+ *vmap = NULL;
+ vm_area_free(vma);
+ return err;
+}
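As a usage note, the expected caller sits in the exec setup path. The sketch below is hedged: the struct linux_binprm field names (mm, vma, p) are assumptions drawn from the existing bprm layout, not something introduced by this patch.

	/* Hedged sketch of the exec-time call site; field names are assumptions. */
	static int example_bprm_mm_init(struct linux_binprm *bprm)
	{
		int err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p);

		if (err)
			return err;

		/* bprm->p now addresses the highest usable word of the stack. */
		return 0;
	}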
diff --git a/mm/vma_init.c b/mm/vma_init.c
new file mode 100644
index 000000000000..8e53c7943561
--- /dev/null
+++ b/mm/vma_init.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/*
+ * Functions for initialising, allocating, freeing and duplicating VMAs. Shared
+ * between CONFIG_MMU and non-CONFIG_MMU kernel configurations.
+ */
+
+#include "vma_internal.h"
+#include "vma.h"
+
+/* SLAB cache for vm_area_struct structures */
+static struct kmem_cache *vm_area_cachep;
+
+void __init vma_state_init(void)
+{
+ struct kmem_cache_args args = {
+ .use_freeptr_offset = true,
+ .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
+ };
+
+ vm_area_cachep = kmem_cache_create("vm_area_struct",
+ sizeof(struct vm_area_struct), &args,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
+ SLAB_ACCOUNT);
+}
+
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ if (!vma)
+ return NULL;
+
+ vma_init(vma, mm);
+
+ return vma;
+}
+
+static void vm_area_init_from(const struct vm_area_struct *src,
+ struct vm_area_struct *dest)
+{
+ dest->vm_mm = src->vm_mm;
+ dest->vm_ops = src->vm_ops;
+ dest->vm_start = src->vm_start;
+ dest->vm_end = src->vm_end;
+ dest->anon_vma = src->anon_vma;
+ dest->vm_pgoff = src->vm_pgoff;
+ dest->vm_file = src->vm_file;
+ dest->vm_private_data = src->vm_private_data;
+ vm_flags_init(dest, src->vm_flags);
+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
+ sizeof(dest->vm_page_prot));
+ /*
+ * src->shared.rb may be modified concurrently when called from
+ * dup_mmap(), but the clone will reinitialize it.
+ */
+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
+ memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
+ sizeof(dest->vm_userfaultfd_ctx));
+#ifdef CONFIG_ANON_VMA_NAME
+ dest->anon_name = src->anon_name;
+#endif
+#ifdef CONFIG_SWAP
+ memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
+ sizeof(dest->swap_readahead_info));
+#endif
+#ifndef CONFIG_MMU
+ dest->vm_region = src->vm_region;
+#endif
+#ifdef CONFIG_NUMA
+ dest->vm_policy = src->vm_policy;
+#endif
+#ifdef __HAVE_PFNMAP_TRACKING
+ dest->pfnmap_track_ctx = NULL;
+#endif
+}
+
+#ifdef __HAVE_PFNMAP_TRACKING
+static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig,
+ struct vm_area_struct *new)
+{
+ struct pfnmap_track_ctx *ctx = orig->pfnmap_track_ctx;
+
+ if (likely(!ctx))
+ return 0;
+
+ /*
+ * We don't expect to ever hit this. If ever required, we would have
+ * to duplicate the tracking.
+ */
+ if (unlikely(kref_read(&ctx->kref) >= REFCOUNT_MAX))
+ return -ENOMEM;
+ kref_get(&ctx->kref);
+ new->pfnmap_track_ctx = ctx;
+ return 0;
+}
+
+static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma)
+{
+ struct pfnmap_track_ctx *ctx = vma->pfnmap_track_ctx;
+
+ if (likely(!ctx))
+ return;
+
+ kref_put(&ctx->kref, pfnmap_track_ctx_release);
+ vma->pfnmap_track_ctx = NULL;
+}
+#else
+static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig,
+ struct vm_area_struct *new)
+{
+ return 0;
+}
+static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma)
+{
+}
+#endif
+
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
+{
+ struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+
+ if (!new)
+ return NULL;
+
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ vm_area_init_from(orig, new);
+
+ if (vma_pfnmap_track_ctx_dup(orig, new)) {
+ kmem_cache_free(vm_area_cachep, new);
+ return NULL;
+ }
+ vma_lock_init(new, true);
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+ vma_numab_state_init(new);
+ dup_anon_vma_name(orig, new);
+
+ return new;
+}
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+ /* The vma should be detached while being destroyed. */
+ vma_assert_detached(vma);
+ vma_numab_state_free(vma);
+ free_anon_vma_name(vma);
+ vma_pfnmap_track_ctx_release(vma);
+ kmem_cache_free(vm_area_cachep, vma);
+}
diff --git a/mm/vma_internal.h b/mm/vma_internal.h
index fc5f172a36bd..2f05735ff190 100644
--- a/mm/vma_internal.h
+++ b/mm/vma_internal.h
@@ -35,6 +35,7 @@
#include <linux/mutex.h>
#include <linux/pagemap.h>
#include <linux/perf_event.h>
+#include <linux/personality.h>
#include <linux/pfn.h>
#include <linux/rcupdate.h>
#include <linux/rmap.h>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7ed39d104201..ab986dd09b6a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -104,6 +104,9 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+
do {
if (unlikely(!pte_none(ptep_get(pte)))) {
if (pfn_valid(pfn)) {
@@ -127,6 +130,8 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte += PFN_DOWN(size), addr += size, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
@@ -350,12 +355,30 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
{
pte_t *pte;
+ pte_t ptent;
+ unsigned long size = PAGE_SIZE;
pte = pte_offset_kernel(pmd, addr);
+ arch_enter_lazy_mmu_mode();
+
do {
- pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
+#ifdef CONFIG_HUGETLB_PAGE
+ size = arch_vmap_pte_range_unmap_size(addr, pte);
+ if (size != PAGE_SIZE) {
+ if (WARN_ON(!IS_ALIGNED(addr, size))) {
+ addr = ALIGN_DOWN(addr, size);
+ pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT));
+ }
+ ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size);
+ if (WARN_ON(end - addr < size))
+ size = end - addr;
+ } else
+#endif
+ ptent = ptep_get_and_clear(&init_mm, addr, pte);
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
}
@@ -374,8 +397,10 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
if (cleared || pmd_bad(*pmd))
*mask |= PGTBL_PMD_MODIFIED;
- if (cleared)
+ if (cleared) {
+ WARN_ON(next - addr < PMD_SIZE);
continue;
+ }
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next, mask);
@@ -399,8 +424,10 @@ static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
if (cleared || pud_bad(*pud))
*mask |= PGTBL_PUD_MODIFIED;
- if (cleared)
+ if (cleared) {
+ WARN_ON(next - addr < PUD_SIZE);
continue;
+ }
if (pud_none_or_clear_bad(pud))
continue;
vunmap_pmd_range(pud, addr, next, mask);
@@ -497,6 +524,9 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+
do {
struct page *page = pages[*nr];
@@ -510,6 +540,8 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
@@ -586,13 +618,13 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
mask |= PGTBL_PGD_MODIFIED;
err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
- return err;
+ break;
} while (pgd++, addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
- return 0;
+ return err;
}
/*
@@ -900,6 +932,11 @@ static struct vmap_node *vmap_nodes = &single;
static __read_mostly unsigned int nr_vmap_nodes = 1;
static __read_mostly unsigned int vmap_zone_size = 1;
+/* A simple iterator over all vmap-nodes. */
+#define for_each_vmap_node(vn) \
+ for ((vn) = &vmap_nodes[0]; \
+ (vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++)
+
static inline unsigned int
addr_to_node_id(unsigned long addr)
{
@@ -918,6 +955,19 @@ id_to_node(unsigned int id)
return &vmap_nodes[id % nr_vmap_nodes];
}
+static inline unsigned int
+node_to_id(struct vmap_node *node)
+{
+ /* Pointer arithmetic. */
+ unsigned int id = node - vmap_nodes;
+
+ if (likely(id < nr_vmap_nodes))
+ return id;
+
+ WARN_ONCE(1, "An address 0x%p is out-of-bounds.\n", node);
+ return 0;
+}
+
/*
* We use the value 0 to represent "no node", that is why
* an encoded value will be the node-id incremented by 1.
@@ -990,7 +1040,8 @@ static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static void drain_vmap_area_work(struct work_struct *work);
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
-static atomic_long_t nr_vmalloc_pages;
+static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages;
+static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr;
unsigned long vmalloc_nr_pages(void)
{
@@ -1056,12 +1107,11 @@ find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
{
unsigned long va_start_lowest;
struct vmap_node *vn;
- int i;
repeat:
- for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ va_start_lowest = 0;
+ for_each_vmap_node(vn) {
spin_lock(&vn->busy.lock);
*va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
@@ -1698,7 +1748,7 @@ va_clip(struct rb_root *root, struct list_head *head,
*/
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
if (!lva)
- return -1;
+ return -ENOMEM;
}
/*
@@ -1712,7 +1762,7 @@ va_clip(struct rb_root *root, struct list_head *head,
*/
va->va_start = nva_start_addr + size;
} else {
- return -1;
+ return -EINVAL;
}
if (type != FL_FIT_TYPE) {
@@ -1741,19 +1791,19 @@ va_alloc(struct vmap_area *va,
/* Check the "vend" restriction. */
if (nva_start_addr + size > vend)
- return vend;
+ return -ERANGE;
/* Update the free vmap_area. */
ret = va_clip(root, head, va, nva_start_addr, size);
if (WARN_ON_ONCE(ret))
- return vend;
+ return ret;
return nva_start_addr;
}
/*
* Returns a start address of the newly allocated area, if success.
- * Otherwise a vend is returned that indicates failure.
+ * Otherwise an error value is returned that indicates failure.
*/
static __always_inline unsigned long
__alloc_vmap_area(struct rb_root *root, struct list_head *head,
@@ -1778,14 +1828,13 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
if (unlikely(!va))
- return vend;
+ return -ENOENT;
nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);
- if (nva_start_addr == vend)
- return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
- find_vmap_lowest_match_check(root, head, size, align);
+ if (!IS_ERR_VALUE(nva_start_addr))
+ find_vmap_lowest_match_check(root, head, size, align);
#endif
return nva_start_addr;
@@ -1915,7 +1964,7 @@ node_alloc(unsigned long size, unsigned long align,
struct vmap_area *va;
*vn_id = 0;
- *addr = vend;
+ *addr = -EINVAL;
/*
* Fallback to a global heap if not vmalloc or there
@@ -1940,7 +1989,7 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm,
{
vm->flags = flags;
vm->addr = (void *)va->va_start;
- vm->size = va_size(va);
+ vm->size = vm->requested_size = va_size(va);
vm->caller = caller;
va->vm = vm;
}
@@ -1995,20 +2044,20 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
}
retry:
- if (addr == vend) {
+ if (IS_ERR_VALUE(addr)) {
preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
size, align, vstart, vend);
spin_unlock(&free_vmap_area_lock);
}
- trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
+ trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr));
/*
- * If an allocation fails, the "vend" address is
+ * If an allocation fails, the error value is
* returned. Therefore trigger the overflow path.
*/
- if (unlikely(addr == vend))
+ if (IS_ERR_VALUE(addr))
goto overflow;
va->va_start = addr;
@@ -2100,8 +2149,6 @@ static unsigned long lazy_max_pages(void)
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}
-static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
-
/*
* Serialize vmap purging. There is no actual critical section protected
* by this lock, but we want to avoid concurrent calls for performance
@@ -2111,7 +2158,6 @@ static DEFINE_MUTEX(vmap_purge_lock);
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
-static cpumask_t purge_nodes;
static void
reclaim_list_global(struct list_head *head)
@@ -2134,7 +2180,7 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
LIST_HEAD(decay_list);
struct rb_root decay_root = RB_ROOT;
struct vmap_area *va, *nva;
- unsigned long n_decay;
+ unsigned long n_decay, pool_len;
int i;
for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
@@ -2148,22 +2194,20 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
list_replace_init(&vn->pool[i].head, &tmp_list);
spin_unlock(&vn->pool_lock);
- if (full_decay)
- WRITE_ONCE(vn->pool[i].len, 0);
+ pool_len = n_decay = vn->pool[i].len;
+ WRITE_ONCE(vn->pool[i].len, 0);
/* Decay a pool by ~25% out of left objects. */
- n_decay = vn->pool[i].len >> 2;
+ if (!full_decay)
+ n_decay >>= 2;
+ pool_len -= n_decay;
list_for_each_entry_safe(va, nva, &tmp_list, list) {
+ if (!n_decay--)
+ break;
+
list_del_init(&va->list);
merge_or_add_vmap_area(va, &decay_root, &decay_list);
-
- if (!full_decay) {
- WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1);
-
- if (!--n_decay)
- break;
- }
}
/*
@@ -2172,9 +2216,10 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
* can populate the pool therefore a simple list replace
* operation takes place here.
*/
- if (!full_decay && !list_empty(&tmp_list)) {
+ if (!list_empty(&tmp_list)) {
spin_lock(&vn->pool_lock);
list_replace_init(&tmp_list, &vn->pool[i].head);
+ WRITE_ONCE(vn->pool[i].len, pool_len);
spin_unlock(&vn->pool_lock);
}
}
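To make the reworked decay arithmetic concrete: with eight areas in a pool and a partial decay, n_decay becomes 8 >> 2 = 2 and pool_len drops to 6, so the first two areas on the temporary list are merged back into the free tree while the remaining six are re-installed and the pool length is published as 6; with full_decay the shift is skipped, all eight areas are decayed and nothing is re-installed.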
@@ -2244,6 +2289,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
{
unsigned long nr_purged_areas = 0;
unsigned int nr_purge_helpers;
+ static cpumask_t purge_nodes;
unsigned int nr_purge_nodes;
struct vmap_node *vn;
int i;
@@ -2255,9 +2301,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
*/
purge_nodes = CPU_MASK_NONE;
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
-
+ for_each_vmap_node(vn) {
INIT_LIST_HEAD(&vn->purge_list);
vn->skip_populate = full_pool_decay;
decay_va_pool_node(vn, full_pool_decay);
@@ -2276,7 +2320,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
end = max(end, list_last_entry(&vn->purge_list,
struct vmap_area, list)->va_end);
- cpumask_set_cpu(i, &purge_nodes);
+ cpumask_set_cpu(node_to_id(vn), &purge_nodes);
}
nr_purge_nodes = cpumask_weight(&purge_nodes);
@@ -2355,7 +2399,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
if (WARN_ON_ONCE(!list_empty(&va->list)))
return;
- nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT,
+ nr_lazy = atomic_long_add_return_relaxed(va_size(va) >> PAGE_SHIFT,
&vmap_lazy_nr);
/*
@@ -2421,7 +2465,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
if (va)
return va;
- } while ((i = (i + 1) % nr_vmap_nodes) != j);
+ } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
return NULL;
}
@@ -2447,7 +2491,7 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
if (va)
return va;
- } while ((i = (i + 1) % nr_vmap_nodes) != j);
+ } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
return NULL;
}
@@ -2916,10 +2960,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
*/
void vm_unmap_aliases(void)
{
- unsigned long start = ULONG_MAX, end = 0;
- int flush = 0;
-
- _vm_unmap_aliases(start, end, flush);
+ _vm_unmap_aliases(ULONG_MAX, 0, 0);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
@@ -3100,7 +3141,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
/*
* Before removing VM_UNINITIALIZED,
* we should make sure that vm has proper values.
- * Pair with smp_rmb() in show_numa_info().
+ * Pair with smp_rmb() in vread_iter() and vmalloc_info_show().
*/
smp_wmb();
vm->flags &= ~VM_UNINITIALIZED;
@@ -3133,6 +3174,7 @@ struct vm_struct *__get_vm_area_node(unsigned long size,
area->flags = flags;
area->caller = caller;
+ area->requested_size = requested_size;
va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
if (IS_ERR(va)) {
@@ -3370,11 +3412,13 @@ void vfree(const void *addr)
if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
vm_reset_perms(vm);
+ /* All pages of the vm should be charged to the same memcg, so use the first one. */
+ if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES))
+ mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages);
for (i = 0; i < vm->nr_pages; i++) {
struct page *page = vm->pages[i];
BUG_ON(!page);
- mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
/*
* High-order allocs for huge vmallocs are split, so
* can be freed as an array of order-0 allocations
@@ -3382,7 +3426,8 @@ void vfree(const void *addr)
__free_page(page);
cond_resched();
}
- atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
+ if (!(vm->flags & VM_MAP_PUT_PAGES))
+ atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
kvfree(vm->pages);
kfree(vm);
}
@@ -3560,11 +3605,11 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* but mempolicy wants to alloc memory by interleaving.
*/
if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
- nr = alloc_pages_bulk_array_mempolicy_noprof(gfp,
+ nr = alloc_pages_bulk_mempolicy_noprof(gfp,
nr_pages_request,
pages + nr_allocated);
else
- nr = alloc_pages_bulk_array_node_noprof(gfp, nid,
+ nr = alloc_pages_bulk_node_noprof(gfp, nid,
nr_pages_request,
pages + nr_allocated);
@@ -3669,12 +3714,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
node, page_order, nr_small_pages, area->pages);
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (gfp_mask & __GFP_ACCOUNT) {
- int i;
-
- for (i = 0; i < area->nr_pages; i++)
- mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
- }
+ /* All pages of the vm should be charged to the same memcg, so use the first one. */
+ if (gfp_mask & __GFP_ACCOUNT && area->nr_pages)
+ mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC,
+ area->nr_pages);
/*
* If not enough pages were obtained to accomplish an
@@ -3769,8 +3812,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
struct vm_struct *area;
void *ret;
kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
- unsigned long real_size = size;
- unsigned long real_align = align;
+ unsigned long original_align = align;
unsigned int shift = PAGE_SHIFT;
if (WARN_ON_ONCE(!size))
@@ -3779,7 +3821,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
if ((size >> PAGE_SHIFT) > totalram_pages()) {
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, exceeds total pages",
- real_size);
+ size);
return NULL;
}
@@ -3796,19 +3838,18 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
else
shift = arch_vmap_pte_supported_shift(size);
- align = max(real_align, 1UL << shift);
- size = ALIGN(real_size, 1UL << shift);
+ align = max(original_align, 1UL << shift);
}
again:
- area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
+ area = __get_vm_area_node(size, align, shift, VM_ALLOC |
VM_UNINITIALIZED | vm_flags, start, end, node,
gfp_mask, caller);
if (!area) {
bool nofail = gfp_mask & __GFP_NOFAIL;
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, vm_struct allocation failed%s",
- real_size, (nofail) ? ". Retrying." : "");
+ size, (nofail) ? ". Retrying." : "");
if (nofail) {
schedule_timeout_uninterruptible(1);
goto again;
@@ -3858,7 +3899,7 @@ again:
(gfp_mask & __GFP_SKIP_ZERO))
kasan_flags |= KASAN_VMALLOC_INIT;
/* KASAN_VMALLOC_PROT_NORMAL already set if required. */
- area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
+ area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags);
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -3867,17 +3908,15 @@ again:
*/
clear_vm_uninitialized_flag(area);
- size = PAGE_ALIGN(size);
if (!(vm_flags & VM_DEFER_KMEMLEAK))
- kmemleak_vmalloc(area, size, gfp_mask);
+ kmemleak_vmalloc(area, PAGE_ALIGN(size), gfp_mask);
return area->addr;
fail:
if (shift > PAGE_SHIFT) {
shift = PAGE_SHIFT;
- align = real_align;
- size = real_size;
+ align = original_align;
goto again;
}
@@ -3945,9 +3984,10 @@ void *vmalloc_noprof(unsigned long size)
EXPORT_SYMBOL(vmalloc_noprof);
/**
- * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
+ * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages
* @size: allocation size
* @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
*
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
@@ -3956,13 +3996,13 @@ EXPORT_SYMBOL(vmalloc_noprof);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask)
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{
return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
- gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
- NUMA_NO_NODE, __builtin_return_address(0));
+ gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ node, __builtin_return_address(0));
}
-EXPORT_SYMBOL_GPL(vmalloc_huge_noprof);
+EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof);
/**
* vzalloc - allocate virtually contiguous memory with zero fill
@@ -4065,6 +4105,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof);
*/
void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
{
+ struct vm_struct *vm = NULL;
+ size_t alloced_size = 0;
size_t old_size = 0;
void *n;
@@ -4074,15 +4116,17 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
}
if (p) {
- struct vm_struct *vm;
-
vm = find_vm_area(p);
if (unlikely(!vm)) {
WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p);
return NULL;
}
- old_size = get_vm_area_size(vm);
+ alloced_size = get_vm_area_size(vm);
+ old_size = vm->requested_size;
+ if (WARN(alloced_size < old_size,
+ "vrealloc() has mismatched area vs requested sizes (%p)\n", p))
+ return NULL;
}
/*
@@ -4090,10 +4134,26 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
* would be a good heuristic for when to shrink the vm_area?
*/
if (size <= old_size) {
- /* Zero out spare memory. */
- if (want_init_on_alloc(flags))
+ /* Zero out "freed" memory, potentially for future realloc. */
+ if (want_init_on_free() || want_init_on_alloc(flags))
memset((void *)p + size, 0, old_size - size);
+ vm->requested_size = size;
+ kasan_poison_vmalloc(p + size, old_size - size);
+ return (void *)p;
+ }
+ /*
+ * We already have the bytes available in the allocation; use them.
+ */
+ if (size <= alloced_size) {
+ kasan_unpoison_vmalloc(p + old_size, size - old_size,
+ KASAN_VMALLOC_PROT_NORMAL);
+ /*
+ * No need to zero memory here, as unused memory will have
+ * already been zeroed at initial allocation time or during
+ * realloc shrink time.
+ */
+ vm->requested_size = size;
return (void *)p;
}
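To illustrate the resulting vrealloc() behaviour, here is a hedged usage sketch (sizes illustrative, function name hypothetical): shrinking keeps the mapping in place, while growing back within the originally allocated area reuses it without a fresh allocation.

	/* Hedged sketch: illustrative only. */
	static void example_vrealloc_reuse(void)
	{
		void *buf = vmalloc(3 * PAGE_SIZE);	/* requested_size == 3 pages */

		if (!buf)
			return;

		/*
		 * Shrink in place: same address; the tail is KASAN-poisoned and,
		 * when init-on-free/init-on-alloc applies, zeroed.
		 */
		buf = vrealloc(buf, PAGE_SIZE, GFP_KERNEL);

		/* Grow back within alloced_size: no fresh allocation is needed. */
		buf = vrealloc(buf, 2 * PAGE_SIZE, GFP_KERNEL);

		vfree(buf);
	}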
@@ -4915,39 +4975,37 @@ bool vmalloc_dump_obj(void *object)
#endif
#ifdef CONFIG_PROC_FS
-static void show_numa_info(struct seq_file *m, struct vm_struct *v)
-{
- if (IS_ENABLED(CONFIG_NUMA)) {
- unsigned int nr, *counters = m->private;
- unsigned int step = 1U << vm_area_page_order(v);
- if (!counters)
- return;
+/*
+ * Print number of pages allocated on each memory node.
+ *
+ * This function can only be called if CONFIG_NUMA is enabled
+ * and the VM_UNINITIALIZED bit in v->flags is clear.
+ */
+static void show_numa_info(struct seq_file *m, struct vm_struct *v,
+ unsigned int *counters)
+{
+ unsigned int nr;
+ unsigned int step = 1U << vm_area_page_order(v);
- if (v->flags & VM_UNINITIALIZED)
- return;
- /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
- smp_rmb();
+ if (!counters)
+ return;
- memset(counters, 0, nr_node_ids * sizeof(unsigned int));
+ memset(counters, 0, nr_node_ids * sizeof(unsigned int));
- for (nr = 0; nr < v->nr_pages; nr += step)
- counters[page_to_nid(v->pages[nr])] += step;
- for_each_node_state(nr, N_HIGH_MEMORY)
- if (counters[nr])
- seq_printf(m, " N%u=%u", nr, counters[nr]);
- }
+ for (nr = 0; nr < v->nr_pages; nr += step)
+ counters[page_to_nid(v->pages[nr])] += step;
+ for_each_node_state(nr, N_HIGH_MEMORY)
+ if (counters[nr])
+ seq_printf(m, " N%u=%u", nr, counters[nr]);
}
static void show_purge_info(struct seq_file *m)
{
struct vmap_node *vn;
struct vmap_area *va;
- int i;
-
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ for_each_vmap_node(vn) {
spin_lock(&vn->lazy.lock);
list_for_each_entry(va, &vn->lazy.head, list) {
seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
@@ -4963,11 +5021,12 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
struct vmap_node *vn;
struct vmap_area *va;
struct vm_struct *v;
- int i;
+ unsigned int *counters;
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ if (IS_ENABLED(CONFIG_NUMA))
+ counters = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
+ for_each_vmap_node(vn) {
spin_lock(&vn->busy.lock);
list_for_each_entry(va, &vn->busy.head, list) {
if (!va->vm) {
@@ -4980,6 +5039,11 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
}
v = va->vm;
+ if (v->flags & VM_UNINITIALIZED)
+ continue;
+
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
@@ -5014,7 +5078,9 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
if (is_vmalloc_addr(v->pages))
seq_puts(m, " vpages");
- show_numa_info(m, v);
+ if (IS_ENABLED(CONFIG_NUMA))
+ show_numa_info(m, v, counters);
+
seq_putc(m, '\n');
}
spin_unlock(&vn->busy.lock);
@@ -5024,19 +5090,14 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
* As a final step, dump "unpurged" areas.
*/
show_purge_info(m);
+ if (IS_ENABLED(CONFIG_NUMA))
+ kfree(counters);
return 0;
}
static int __init proc_vmalloc_init(void)
{
- void *priv_data = NULL;
-
- if (IS_ENABLED(CONFIG_NUMA))
- priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
-
- proc_create_single_data("vmallocinfo",
- 0400, NULL, vmalloc_info_show, priv_data);
-
+ proc_create_single("vmallocinfo", 0400, NULL, vmalloc_info_show);
return 0;
}
module_init(proc_vmalloc_init);
@@ -5088,7 +5149,7 @@ static void __init vmap_init_free_space(void)
static void vmap_init_nodes(void)
{
struct vmap_node *vn;
- int i, n;
+ int i;
#if BITS_PER_LONG == 64
/*
@@ -5105,7 +5166,7 @@ static void vmap_init_nodes(void)
* set of cores. Therefore a per-domain purging is supposed to
* be added as well as a per-domain balancing.
*/
- n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
+ int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
if (n > 1) {
vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
@@ -5120,8 +5181,7 @@ static void vmap_init_nodes(void)
}
#endif
- for (n = 0; n < nr_vmap_nodes; n++) {
- vn = &vmap_nodes[n];
+ for_each_vmap_node(vn) {
vn->busy.root = RB_ROOT;
INIT_LIST_HEAD(&vn->busy.head);
spin_lock_init(&vn->busy.lock);
@@ -5142,15 +5202,13 @@ static void vmap_init_nodes(void)
static unsigned long
vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
- unsigned long count;
+ unsigned long count = 0;
struct vmap_node *vn;
- int i, j;
-
- for (count = 0, i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ int i;
- for (j = 0; j < MAX_VA_SIZE_PAGES; j++)
- count += READ_ONCE(vn->pool[j].len);
+ for_each_vmap_node(vn) {
+ for (i = 0; i < MAX_VA_SIZE_PAGES; i++)
+ count += READ_ONCE(vn->pool[i].len);
}
return count ? count : SHRINK_EMPTY;
@@ -5159,10 +5217,10 @@ vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
static unsigned long
vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- int i;
+ struct vmap_node *vn;
- for (i = 0; i < nr_vmap_nodes; i++)
- decay_va_pool_node(&vmap_nodes[i], true);
+ for_each_vmap_node(vn)
+ decay_va_pool_node(vn, true);
return SHRINK_STOP;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 76378bc257e3..f8dfd2864bbf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -271,6 +271,25 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
}
#endif
+/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
+ * and including the specified highidx
+ * @zone: The current zone in the iterator
+ * @pgdat: The pgdat which node_zones are being iterated
+ * @idx: The index variable
+ * @highidx: The index of the highest zone to return
+ *
+ * This macro iterates through all managed zones up to and including the specified highidx.
+ * The zone iterator enters an invalid state after the macro call and must be reinitialized
+ * before it can be used again.
+ */
+#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \
+ for ((idx) = 0, (zone) = (pgdat)->node_zones; \
+ (idx) <= (highidx); \
+ (idx)++, (zone)++) \
+ if (!managed_zone(zone)) \
+ continue; \
+ else
+
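For reference, a hedged usage sketch of the new iterator, mirroring the prepare_scan_control() conversion later in this patch:

	/* Hedged usage sketch of for_each_managed_zone_pgdat(). */
	static unsigned long example_sum_high_wmarks(pg_data_t *pgdat)
	{
		unsigned long total_high_wmark = 0;
		struct zone *zone;
		int z;

		for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1)
			total_high_wmark += high_wmark_pages(zone);

		return total_high_wmark;
	}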
static void set_task_reclaim_state(struct task_struct *task,
struct reclaim_state *rs)
{
@@ -323,16 +342,22 @@ static void flush_reclaim_state(struct scan_control *sc)
}
}
-static bool can_demote(int nid, struct scan_control *sc)
+static bool can_demote(int nid, struct scan_control *sc,
+ struct mem_cgroup *memcg)
{
+ int demotion_nid;
+
if (!numa_demotion_enabled)
return false;
if (sc && sc->no_demotion)
return false;
- if (next_demotion_node(nid) == NUMA_NO_NODE)
+
+ demotion_nid = next_demotion_node(nid);
+ if (demotion_nid == NUMA_NO_NODE)
return false;
- return true;
+ /* If demotion node isn't in the cgroup's mems_allowed, fall back */
+ return mem_cgroup_node_allowed(memcg, demotion_nid);
}
static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -357,7 +382,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
*
* Can it be reclaimed from this node via demotion?
*/
- return can_demote(nid, sc);
+ return can_demote(nid, sc, memcg);
}
/*
@@ -374,7 +399,14 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
-
+ /*
+ * If there are no reclaimable file-backed or anonymous pages,
+ * ensure zones with sufficient free pages are not skipped.
+ * This prevents zones like DMA32 from being ignored in reclaim
+ * scenarios where they can still help alleviate memory pressure.
+ */
+ if (nr == 0)
+ nr = zone_page_state_snapshot(zone, NR_FREE_PAGES);
return nr;
}
@@ -389,13 +421,9 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
{
unsigned long size = 0;
int zid;
+ struct zone *zone;
- for (zid = 0; zid <= zone_idx; zid++) {
- struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) {
if (!mem_cgroup_disabled())
size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
else
@@ -434,21 +462,26 @@ void drop_slab(void)
} while ((freed >> shift++) > 1);
}
-static int reclaimer_offset(void)
+#define CHECK_RECLAIMER_OFFSET(type) \
+ do { \
+ BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \
+ PGDEMOTE_##type - PGDEMOTE_KSWAPD); \
+ BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \
+ PGSCAN_##type - PGSCAN_KSWAPD); \
+ } while (0)
+
+static int reclaimer_offset(struct scan_control *sc)
{
- BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
- PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
- PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
- PGSCAN_DIRECT - PGSCAN_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
- PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
+ CHECK_RECLAIMER_OFFSET(DIRECT);
+ CHECK_RECLAIMER_OFFSET(KHUGEPAGED);
+ CHECK_RECLAIMER_OFFSET(PROACTIVE);
if (current_is_kswapd())
return 0;
if (current_is_khugepaged())
return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
+ if (sc->proactive)
+ return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD;
return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}
@@ -488,7 +521,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
{
int reclaimable = 0, write_pending = 0;
int i;
-
+ struct zone *zone;
/*
* If kswapd is disabled, reschedule if necessary but do not
* throttle as the system is likely near OOM.
@@ -501,12 +534,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
* throttle as throttling will occur when the folios cycle
* towards the end of the LRU if still under writeback.
*/
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) {
reclaimable += zone_reclaimable_pages(zone);
write_pending += zone_page_state_snapshot(zone,
NR_ZONE_WRITE_PENDING);
@@ -626,21 +654,20 @@ typedef enum {
/*
* pageout is called by shrink_folio_list() for each dirty folio.
- * Calls ->writepage().
*/
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
struct swap_iocb **plug, struct list_head *folio_list)
{
+ int (*writeout)(struct folio *, struct writeback_control *);
+
/*
- * If the folio is dirty, only perform writeback if that write
- * will be non-blocking. To prevent this allocation from being
- * stalled by pagecache activity. But note that there may be
- * stalls if we need to run get_block(). We could test
- * PagePrivate for that.
- *
- * If this process is currently in __generic_file_write_iter() against
- * this folio's queue, we can perform writeback even if that
- * will block.
+ * We no longer attempt to writeback filesystem folios here, other
+ * than tmpfs/shmem. That's taken care of in page-writeback.
+ * If we find a dirty filesystem folio at the end of the LRU list,
+ * typically that means the filesystem is saturating the storage
+ * with contiguous writes and telling it to write a folio here
+ * would only make the situation worse by injecting an element
+ * of random access.
*
* If the folio is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
@@ -663,7 +690,11 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
return PAGE_KEEP;
}
- if (mapping->a_ops->writepage == NULL)
+ if (shmem_mapping(mapping))
+ writeout = shmem_writeout;
+ else if (folio_test_anon(folio))
+ writeout = swap_writeout;
+ else
return PAGE_ACTIVATE;
if (folio_clear_dirty_for_io(folio)) {
@@ -686,7 +717,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
wbc.list = folio_list;
folio_set_reclaim(folio);
- res = mapping->a_ops->writepage(&folio->page, &wbc);
+ res = writeout(folio, &wbc);
if (res < 0)
handle_write_error(mapping, folio, res);
if (res == AOP_WRITEPAGE_ACTIVATE) {
@@ -695,7 +726,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
if (!folio_test_writeback(folio)) {
- /* synchronous write or broken a_ops? */
+ /* synchronous write? */
folio_clear_reclaim(folio);
}
trace_mm_vmscan_write_folio(folio);
@@ -762,7 +793,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
__delete_from_swap_cache(folio, swap, shadow);
- mem_cgroup_swapout(folio, swap);
+ memcg1_swapout(folio, swap);
xa_unlock_irq(&mapping->i_pages);
put_swap_folio(folio, swap);
} else {
@@ -855,6 +886,31 @@ enum folio_references {
FOLIOREF_ACTIVATE,
};
+#ifdef CONFIG_LRU_GEN
+/*
+ * Only used on a mapped folio in the eviction (rmap walk) path, where promotion
+ * needs to be done by taking the folio off the LRU list and then adding it back
+ * with PG_active set. In contrast, the aging (page table walk) path uses
+ * folio_update_gen().
+ */
+static bool lru_gen_set_refs(struct folio *folio)
+{
+ /* see the comment on LRU_REFS_FLAGS */
+ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ return false;
+ }
+
+ set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset));
+ return true;
+}
+#else
+static bool lru_gen_set_refs(struct folio *folio)
+{
+ return false;
+}
+#endif /* CONFIG_LRU_GEN */
+
static enum folio_references folio_check_references(struct folio *folio,
struct scan_control *sc)
{
@@ -863,7 +919,6 @@ static enum folio_references folio_check_references(struct folio *folio,
referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
&vm_flags);
- referenced_folio = folio_test_clear_referenced(folio);
/*
* The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
@@ -881,6 +936,15 @@ static enum folio_references folio_check_references(struct folio *folio,
if (referenced_ptes == -1)
return FOLIOREF_KEEP;
+ if (lru_gen_enabled()) {
+ if (!referenced_ptes)
+ return FOLIOREF_RECLAIM;
+
+ return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP;
+ }
+
+ referenced_folio = folio_test_clear_referenced(folio);
+
if (referenced_ptes) {
/*
* All mapped folios start out with page table
@@ -1041,12 +1105,13 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
*/
static unsigned int shrink_folio_list(struct list_head *folio_list,
struct pglist_data *pgdat, struct scan_control *sc,
- struct reclaim_stat *stat, bool ignore_references)
+ struct reclaim_stat *stat, bool ignore_references,
+ struct mem_cgroup *memcg)
{
struct folio_batch free_folios;
LIST_HEAD(ret_folios);
LIST_HEAD(demote_folios);
- unsigned int nr_reclaimed = 0;
+ unsigned int nr_reclaimed = 0, nr_demoted = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
struct swap_iocb *plug = NULL;
@@ -1054,7 +1119,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
folio_batch_init(&free_folios);
memset(stat, 0, sizeof(*stat));
cond_resched();
- do_demote_pass = can_demote(pgdat->node_id, sc);
+ do_demote_pass = can_demote(pgdat->node_id, sc, memcg);
retry:
while (!list_empty(folio_list)) {
@@ -1072,6 +1137,13 @@ retry:
if (!folio_trylock(folio))
goto keep;
+ if (folio_contain_hwpoisoned_page(folio)) {
+ unmap_poisoned_folio(folio, folio_pfn(folio), false);
+ folio_unlock(folio);
+ folio_put(folio);
+ continue;
+ }
+
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
nr_pages = folio_nr_pages(folio);
@@ -1085,11 +1157,6 @@ retry:
if (!sc->may_unmap && folio_mapped(folio))
goto keep_locked;
- /* folio_update_gen() tried to promote this page? */
- if (lru_gen_enabled() && !ignore_references &&
- folio_mapped(folio) && folio_test_referenced(folio))
- goto keep_locked;
-
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested. kswapd will stall and start writing
@@ -1130,8 +1197,10 @@ retry:
* 2) Global or new memcg reclaim encounters a folio that is
* not marked for immediate reclaim, or the caller does not
* have __GFP_FS (or __GFP_IO if it's simply going to swap,
- * not to fs). In this case mark the folio for immediate
- * reclaim and continue scanning.
+ * not to fs), or the folio belongs to a mapping where
+ * waiting on writeback during reclaim may lead to a deadlock.
+ * In this case mark the folio for immediate reclaim and
+ * continue scanning.
*
* Require may_enter_fs() because we would wait on fs, which
* may not have submitted I/O yet. And the loop driver might
@@ -1156,6 +1225,8 @@ retry:
* takes to write them to disk.
*/
if (folio_test_writeback(folio)) {
+ mapping = folio_mapping(folio);
+
/* Case 1 above */
if (current_is_kswapd() &&
folio_test_reclaim(folio) &&
@@ -1166,7 +1237,9 @@ retry:
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
!folio_test_reclaim(folio) ||
- !may_enter_fs(folio, sc->gfp_mask)) {
+ !may_enter_fs(folio, sc->gfp_mask) ||
+ (mapping &&
+ mapping_writeback_may_deadlock_on_reclaim(mapping))) {
/*
* This is slightly racy -
* folio_end_writeback() might have
@@ -1244,7 +1317,7 @@ retry:
split_folio_to_list(folio, folio_list))
goto activate_locked;
}
- if (!add_to_swap(folio)) {
+ if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) {
int __maybe_unused order = folio_order(folio);
if (!folio_test_large(folio))
@@ -1260,9 +1333,21 @@ retry:
}
#endif
count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
- if (!add_to_swap(folio))
+ if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN))
goto activate_locked_split;
}
+ /*
+ * Normally the folio will be dirtied in unmap because its
+ * pte should be dirty. A special case is the MADV_FREE page: its
+ * pte could have the dirty bit cleared while the folio's
+ * SwapBacked flag is still set, because clearing the dirty bit
+ * and the SwapBacked flag is not protected by a lock. For such a
+ * folio, unmap will not set the dirty bit, so folio reclaim will
+ * not write the folio out. This can cause data corruption when
+ * the folio is swapped in later. Always setting the dirty flag
+ * for the folio solves the problem.
+ */
+ folio_mark_dirty(folio);
}
}
@@ -1515,8 +1600,9 @@ keep:
/* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
- stat->nr_demoted = demote_folio_list(&demote_folios, pgdat);
- nr_reclaimed += stat->nr_demoted;
+ nr_demoted = demote_folio_list(&demote_folios, pgdat);
+ nr_reclaimed += nr_demoted;
+ stat->nr_demoted += nr_demoted;
/* Folios that could not be demoted are still in @demote_folios */
if (!list_empty(&demote_folios)) {
/* Folios which weren't demoted go back on @folio_list */
@@ -1588,7 +1674,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
*/
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
- &stat, true);
+ &stat, true, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
list_splice(&clean_folios, folio_list);
@@ -1655,12 +1741,11 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
unsigned long nr_taken = 0;
unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
- unsigned long skipped = 0;
- unsigned long scan, total_scan, nr_pages;
+ unsigned long skipped = 0, total_scan = 0, scan = 0;
+ unsigned long nr_pages;
+ unsigned long max_nr_skipped = 0;
LIST_HEAD(folios_skipped);
- total_scan = 0;
- scan = 0;
while (scan < nr_to_scan && !list_empty(src)) {
struct list_head *move_to = src;
struct folio *folio;
@@ -1671,9 +1756,12 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
nr_pages = folio_nr_pages(folio);
total_scan += nr_pages;
- if (folio_zonenum(folio) > sc->reclaim_idx) {
+ /* Using max_nr_skipped to prevent hard LOCKUP */
+ if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED &&
+ (folio_zonenum(folio) > sc->reclaim_idx)) {
nr_skipped[folio_zonenum(folio)] += nr_pages;
move_to = &folios_skipped;
+ max_nr_skipped++;
goto move;
}
@@ -1946,10 +2034,10 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- item = PGSCAN_KSWAPD + reclaimer_offset();
+ item = PGSCAN_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
- __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
__count_vm_events(PGSCAN_ANON + file, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
@@ -1957,18 +2045,19 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
+ nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
+ lruvec_memcg(lruvec));
spin_lock_irq(&lruvec->lru_lock);
move_folios_to_lru(lruvec, &folio_list);
- __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
+ __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
stat.nr_demoted);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- item = PGSTEAL_KSWAPD + reclaimer_offset();
+ item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
- __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2058,7 +2147,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
if (!cgroup_reclaim(sc))
__count_vm_events(PGREFILL, nr_scanned);
- __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2115,7 +2204,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
__count_vm_events(PGDEACTIVATE, nr_deactivate);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2140,7 +2229,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
.no_demotion = 1,
};
- nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL);
while (!list_empty(folio_list)) {
folio = lru_to_folio(folio_list);
list_del(&folio->lru);
@@ -2332,17 +2421,13 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
unsigned long total_high_wmark = 0;
unsigned long free, anon;
int z;
+ struct zone *zone;
free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
file = node_page_state(pgdat, NR_ACTIVE_FILE) +
node_page_state(pgdat, NR_INACTIVE_FILE);
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = &pgdat->node_zones[z];
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) {
total_high_wmark += high_wmark_pages(zone);
}
@@ -2360,6 +2445,43 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
}
}
+static inline void calculate_pressure_balance(struct scan_control *sc,
+ int swappiness, u64 *fraction, u64 *denominator)
+{
+ unsigned long anon_cost, file_cost, total_cost;
+ unsigned long ap, fp;
+
+ /*
+ * Calculate the pressure balance between anon and file pages.
+ *
+ * The amount of pressure we put on each LRU is inversely
+ * proportional to the cost of reclaiming each list, as
+ * determined by the share of pages that are refaulting, times
+ * the relative IO cost of bringing back a swapped out
+ * anonymous page vs reloading a filesystem page (swappiness).
+ *
+ * Although we limit that influence to ensure no list gets
+ * left behind completely: at least a third of the pressure is
+ * applied, before swappiness.
+ *
+ * With swappiness at 100, anon and file have equal IO cost.
+ */
+ total_cost = sc->anon_cost + sc->file_cost;
+ anon_cost = total_cost + sc->anon_cost;
+ file_cost = total_cost + sc->file_cost;
+ total_cost = anon_cost + file_cost;
+
+ ap = swappiness * (total_cost + 1);
+ ap /= anon_cost + 1;
+
+ fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
+ fp /= file_cost + 1;
+
+ fraction[WORKINGSET_ANON] = ap;
+ fraction[WORKINGSET_FILE] = fp;
+ *denominator = ap + fp;
+}
+
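For readers outside the kernel tree, the arithmetic in the new calculate_pressure_balance() above can be exercised on its own. The sketch below is a userspace restatement under stated assumptions: the struct scan_control plumbing is replaced by plain cost parameters, MAX_SWAPPINESS is taken as 200 as in the kernel, and the cost inputs are invented numbers.

/* Standalone sketch of the anon/file pressure split shown above. */
#include <stdio.h>
#include <stdint.h>

#define MAX_SWAPPINESS 200

static void pressure_balance(unsigned long anon_cost, unsigned long file_cost,
			     int swappiness, uint64_t *ap, uint64_t *fp)
{
	unsigned long total_cost;

	/* give each list a floor of one third of the pressure */
	total_cost = anon_cost + file_cost;
	anon_cost = total_cost + anon_cost;
	file_cost = total_cost + file_cost;
	total_cost = anon_cost + file_cost;

	*ap = (uint64_t)swappiness * (total_cost + 1) / (anon_cost + 1);
	*fp = (uint64_t)(MAX_SWAPPINESS - swappiness) * (total_cost + 1) / (file_cost + 1);
}

int main(void)
{
	uint64_t ap, fp;

	pressure_balance(1000, 3000, 60, &ap, &fp);
	printf("anon share %.2f%%, file share %.2f%%\n",
	       100.0 * ap / (ap + fp), 100.0 * fp / (ap + fp));
	return 0;
}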
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned.
@@ -2372,12 +2494,10 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- unsigned long anon_cost, file_cost, total_cost;
int swappiness = sc_swappiness(sc, memcg);
u64 fraction[ANON_AND_FILE];
u64 denominator = 0; /* gcc */
enum scan_balance scan_balance;
- unsigned long ap, fp;
enum lru_list lru;
/* If we have no swap space, do not bother scanning anon folios. */
@@ -2398,6 +2518,13 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
goto out;
}
+ /* Proactive reclaim initiated by userspace for anonymous memory only */
+ if (swappiness == SWAPPINESS_ANON_ONLY) {
+ WARN_ON_ONCE(!sc->proactive);
+ scan_balance = SCAN_ANON;
+ goto out;
+ }
+
/*
* Do not apply any pressure balancing cleverness when the
* system is close to OOM, scan both anon and file equally
@@ -2418,7 +2545,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
/*
* If there is enough inactive page cache, we do not reclaim
- * anything from the anonymous working right now.
+ * anything from the anonymous workingset right now to make sure
+ * a streaming file access pattern doesn't cause swapping.
*/
if (sc->cache_trim_mode) {
scan_balance = SCAN_FILE;
@@ -2426,35 +2554,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
}
scan_balance = SCAN_FRACT;
- /*
- * Calculate the pressure balance between anon and file pages.
- *
- * The amount of pressure we put on each LRU is inversely
- * proportional to the cost of reclaiming each list, as
- * determined by the share of pages that are refaulting, times
- * the relative IO cost of bringing back a swapped out
- * anonymous page vs reloading a filesystem page (swappiness).
- *
- * Although we limit that influence to ensure no list gets
- * left behind completely: at least a third of the pressure is
- * applied, before swappiness.
- *
- * With swappiness at 100, anon and file have equal IO cost.
- */
- total_cost = sc->anon_cost + sc->file_cost;
- anon_cost = total_cost + sc->anon_cost;
- file_cost = total_cost + sc->file_cost;
- total_cost = anon_cost + file_cost;
-
- ap = swappiness * (total_cost + 1);
- ap /= anon_cost + 1;
-
- fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
- fp /= file_cost + 1;
+ calculate_pressure_balance(sc, swappiness, fraction, &denominator);
- fraction[0] = ap;
- fraction[1] = fp;
- denominator = ap + fp;
out:
for_each_evictable_lru(lru) {
bool file = is_file_lru(lru);
@@ -2568,7 +2669,7 @@ out:
* Anonymous LRU management is a waste if there is
* ultimately no way to reclaim the memory.
*/
-static bool can_age_anon_pages(struct pglist_data *pgdat,
+static bool can_age_anon_pages(struct lruvec *lruvec,
struct scan_control *sc)
{
/* Aging the anon LRU is valuable if swap is present: */
@@ -2576,7 +2677,8 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
return true;
/* Also valuable if anon pages can be demoted: */
- return can_demote(pgdat->node_id, sc);
+ return can_demote(lruvec_pgdat(lruvec)->node_id, sc,
+ lruvec_memcg(lruvec));
}
#ifdef CONFIG_LRU_GEN
@@ -2612,11 +2714,21 @@ static bool should_clear_pmd_young(void)
READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
}
+/* Get the min/max evictable type based on swappiness */
+#define min_type(swappiness) (!(swappiness))
+#define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY)
+
+#define evictable_min_seq(min_seq, swappiness) \
+ min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)])
+
#define for_each_gen_type_zone(gen, type, zone) \
for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+#define for_each_evictable_type(type, swappiness) \
+ for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++)
+
#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
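The two evictable-type macros just added can be checked in isolation. The sketch below assumes the kernel's values LRU_GEN_ANON == 0, LRU_GEN_FILE == 1, MAX_SWAPPINESS == 200 and SWAPPINESS_ANON_ONLY == MAX_SWAPPINESS + 1; under those assumptions swappiness 0 selects only the file type, 1..200 selects both, and the new anon-only value selects only anon.

#include <stdio.h>

#define MAX_SWAPPINESS		200
#define SWAPPINESS_ANON_ONLY	(MAX_SWAPPINESS + 1)

#define min_type(swappiness)	(!(swappiness))
#define max_type(swappiness)	((swappiness) < SWAPPINESS_ANON_ONLY)

int main(void)
{
	int values[] = { 0, 1, 60, MAX_SWAPPINESS, SWAPPINESS_ANON_ONLY };

	for (unsigned i = 0; i < sizeof(values) / sizeof(values[0]); i++) {
		int s = values[i];

		/* 0 -> type 1 only (file), 1..200 -> 0..1, 201 -> type 0 only (anon) */
		printf("swappiness %3d: types %d..%d\n", s, min_type(s), max_type(s));
	}
	return 0;
}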
@@ -2648,7 +2760,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
if (!sc->may_swap)
return 0;
- if (!can_demote(pgdat->node_id, sc) &&
+ if (!can_demote(pgdat->node_id, sc, memcg) &&
mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
return 0;
@@ -2662,10 +2774,16 @@ static int get_nr_gens(struct lruvec *lruvec, int type)
static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
{
- /* see the comment on lru_gen_folio */
- return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
- get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
- get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
+ int type;
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ int n = get_nr_gens(lruvec, type);
+
+ if (n < MIN_NR_GENS || n > MAX_NR_GENS)
+ return false;
+ }
+
+ return true;
}
/******************************************************************************
@@ -3066,16 +3184,20 @@ struct ctrl_pos {
static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
struct ctrl_pos *pos)
{
+ int i;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
- pos->refaulted = lrugen->avg_refaulted[type][tier] +
- atomic_long_read(&lrugen->refaulted[hist][type][tier]);
- pos->total = lrugen->avg_total[type][tier] +
- atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- pos->total += lrugen->protected[hist][type][tier - 1];
pos->gain = gain;
+ pos->refaulted = pos->total = 0;
+
+ for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) {
+ pos->refaulted += lrugen->avg_refaulted[type][i] +
+ atomic_long_read(&lrugen->refaulted[hist][type][i]);
+ pos->total += lrugen->avg_total[type][i] +
+ lrugen->protected[hist][type][i] +
+ atomic_long_read(&lrugen->evicted[hist][type][i]);
+ }
}
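The reworked loop bounds above are easy to misread: tier % MAX_NR_TIERS together with min(tier, MAX_NR_TIERS - 1) collapses the single-tier and all-tiers cases into one loop, which is what lets the later get_type_to_scan() pass MAX_NR_TIERS to sum every tier. A quick standalone check of the bounds, assuming MAX_NR_TIERS of 4 for illustration:

#include <stdio.h>

#define MAX_NR_TIERS 4

int main(void)
{
	for (int tier = 0; tier <= MAX_NR_TIERS; tier++) {
		int lo = tier % MAX_NR_TIERS;
		int hi = tier < MAX_NR_TIERS ? tier : MAX_NR_TIERS - 1;	/* min() */

		printf("tier argument %d sums tiers %d..%d\n", tier, lo, hi);
	}
	return 0;
}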
static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
@@ -3101,17 +3223,15 @@ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
sum = lrugen->avg_total[type][tier] +
+ lrugen->protected[hist][type][tier] +
atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- sum += lrugen->protected[hist][type][tier - 1];
WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
}
if (clear) {
atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
- if (tier)
- WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
+ WRITE_ONCE(lrugen->protected[hist][type][tier], 0);
}
}
}
@@ -3138,16 +3258,19 @@ static int folio_update_gen(struct folio *folio, int gen)
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+ /* see the comment on LRU_REFS_FLAGS */
+ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ return -1;
+ }
+
do {
/* lru_gen_del_folio() has isolated this page? */
- if (!(old_flags & LRU_GEN_MASK)) {
- /* for shrink_folio_list() */
- new_flags = old_flags | BIT(PG_referenced);
- continue;
- }
+ if (!(old_flags & LRU_GEN_MASK))
+ return -1;
- new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
- new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
+ new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset);
} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
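folio_update_gen() above packs the generation into folio->flags and replaces it with a cmpxchg loop. The userspace sketch below mimics that loop with C11 atomics; the 4-bit field at an arbitrary offset is an assumption made for illustration and is not the kernel's actual LRU_GEN_MASK layout.

#include <stdatomic.h>
#include <stdio.h>

#define GEN_PGOFF	8UL
#define GEN_MASK	(0xfUL << GEN_PGOFF)	/* gen + 1 stored in 4 bits */

/* returns the previous gen, or -1 if none was set (the "isolated" case) */
static int update_gen(_Atomic unsigned long *flags, int gen)
{
	unsigned long old = atomic_load(flags);
	unsigned long newf;

	do {
		if (!(old & GEN_MASK))
			return -1;
		newf = (old & ~GEN_MASK) | ((gen + 1UL) << GEN_PGOFF);
	} while (!atomic_compare_exchange_weak(flags, &old, newf));

	return (int)((old & GEN_MASK) >> GEN_PGOFF) - 1;
}

int main(void)
{
	_Atomic unsigned long flags = (2UL + 1) << GEN_PGOFF;	/* currently gen 2 */

	printf("old gen %d\n", update_gen(&flags, 3));	/* prints 2 */
	printf("old gen %d\n", update_gen(&flags, 3));	/* prints 3 */
	return 0;
}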
@@ -3171,7 +3294,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
new_gen = (old_gen + 1) % MAX_NR_GENS;
- new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
/* for folio_end_writeback() */
if (reclaiming)
@@ -3246,7 +3369,7 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal
return true;
if (vma_is_anonymous(vma))
- return !walk->can_swap;
+ return !walk->swappiness;
if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
return true;
@@ -3256,7 +3379,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal
return true;
if (shmem_mapping(mapping))
- return !walk->can_swap;
+ return !walk->swappiness;
+
+ if (walk->swappiness > MAX_SWAPPINESS)
+ return true;
/* to exclude special mappings like dax, etc. */
return !mapping->a_ops->read_folio;
@@ -3344,19 +3470,17 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
}
static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
- struct pglist_data *pgdat, bool can_swap)
+ struct pglist_data *pgdat)
{
- struct folio *folio;
+ struct folio *folio = pfn_folio(pfn);
- folio = pfn_folio(pfn);
- if (folio_nid(folio) != pgdat->node_id)
+ if (folio_lru_gen(folio) < 0)
return NULL;
- if (folio_memcg(folio) != memcg)
+ if (folio_nid(folio) != pgdat->node_id)
return NULL;
- /* file VMAs can contain anon pages from COW */
- if (!folio_is_file_lru(folio) && !can_swap)
+ if (folio_memcg(folio) != memcg)
return NULL;
return folio;
@@ -3370,29 +3494,55 @@ static bool suitable_to_scan(int total, int young)
return young * n >= total;
}
+static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio,
+ int new_gen, bool dirty)
+{
+ int old_gen;
+
+ if (!folio)
+ return;
+
+ if (dirty && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ if (walk) {
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen)
+ update_batch_size(walk, folio, old_gen, new_gen);
+ } else if (lru_gen_set_refs(folio)) {
+ old_gen = folio_lru_gen(folio);
+ if (old_gen >= 0 && old_gen != new_gen)
+ folio_activate(folio);
+ }
+}
+
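The new walk_update_folio() helper exists so that a run of consecutive PTEs mapping the same folio is folded into one generation update and a single dirty mark when any entry in the run was dirty. The standalone sketch below shows only that coalescing pattern; the folio ids and dirty bits are invented stand-ins for the page-table state.

#include <stdbool.h>
#include <stdio.h>

static void update_folio(int folio, bool dirty)
{
	if (folio < 0)
		return;
	printf("update folio %d%s\n", folio, dirty ? " (dirty)" : "");
}

int main(void)
{
	/* folio id referenced by each scanned PTE, -1 = not present */
	int ptes[] = { 7, 7, 7, -1, 9, 9, 12 };
	bool dirty_bits[] = { false, true, false, false, false, false, true };
	int last = -1;
	bool dirty = false;

	for (unsigned i = 0; i < sizeof(ptes) / sizeof(ptes[0]); i++) {
		if (ptes[i] < 0)
			continue;
		if (ptes[i] != last) {	/* run ended: flush the previous folio */
			update_folio(last, dirty);
			last = ptes[i];
			dirty = false;
		}
		dirty |= dirty_bits[i];
	}
	update_folio(last, dirty);	/* final run */
	return 0;
}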
static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
struct mm_walk *args)
{
int i;
+ bool dirty;
pte_t *pte;
spinlock_t *ptl;
unsigned long addr;
int total = 0;
int young = 0;
+ struct folio *last = NULL;
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
DEFINE_MAX_SEQ(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
pmd_t pmdval;
- pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval,
- &ptl);
+ pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl);
if (!pte)
return false;
+
if (!spin_trylock(ptl)) {
pte_unmap(pte);
- return false;
+ return true;
}
if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
@@ -3414,26 +3564,30 @@ restart:
if (pfn == -1)
continue;
- folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
continue;
if (!ptep_clear_young_notify(args->vma, addr, pte + i))
continue;
- young++;
- walk->mm_stats[MM_LEAF_YOUNG]++;
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
+
+ last = folio;
+ dirty = false;
+ }
- if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
+ if (pte_dirty(ptent))
+ dirty = true;
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ young++;
+ walk->mm_stats[MM_LEAF_YOUNG]++;
}
+ walk_update_folio(walk, last, gen, dirty);
+ last = NULL;
+
if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
goto restart;
@@ -3447,13 +3601,15 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
{
int i;
+ bool dirty;
pmd_t *pmd;
spinlock_t *ptl;
+ struct folio *last = NULL;
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
DEFINE_MAX_SEQ(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
VM_WARN_ON_ONCE(pud_leaf(*pud));
@@ -3499,27 +3655,30 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
if (pfn == -1)
goto next;
- folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
goto next;
if (!pmdp_clear_young_notify(vma, addr, pmd + i))
goto next;
- walk->mm_stats[MM_LEAF_YOUNG]++;
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
- if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
+ last = folio;
+ dirty = false;
+ }
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ if (pmd_dirty(pmd[i]))
+ dirty = true;
+
+ walk->mm_stats[MM_LEAF_YOUNG]++;
next:
i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
} while (i <= MIN_LRU_BATCH);
+ walk_update_folio(walk, last, gen, dirty);
+
arch_leave_lazy_mmu_mode();
spin_unlock(ptl);
done:
@@ -3711,22 +3870,30 @@ static void clear_mm_walk(void)
kfree(walk);
}
-static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
+static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness)
{
int zone;
int remaining = MAX_LRU_BATCH;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
- if (type == LRU_GEN_ANON && !can_swap)
+ /* For file type, skip the check if swappiness is anon only */
+ if (type && (swappiness == SWAPPINESS_ANON_ONLY))
+ goto done;
+
+ /* For anon type, skip the check if swappiness is zero (file only) */
+ if (!type && !swappiness)
goto done;
- /* prevent cold/hot inversion if force_scan is true */
+ /* prevent cold/hot inversion if the type is evictable */
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
struct list_head *head = &lrugen->folios[old_gen][type][zone];
while (!list_empty(head)) {
struct folio *folio = lru_to_folio(head);
+ int refs = folio_lru_refs(folio);
+ bool workingset = folio_test_workingset(folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
@@ -3736,6 +3903,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
new_gen = folio_inc_gen(lruvec, folio, false);
list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
+ /* don't count the workingset being lazily promoted */
+ if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) {
+ int tier = lru_tier_from_refs(refs, workingset);
+ int delta = folio_nr_pages(folio);
+
+ WRITE_ONCE(lrugen->protected[hist][type][tier],
+ lrugen->protected[hist][type][tier] + delta);
+ }
+
if (!--remaining)
return false;
}
@@ -3747,7 +3923,7 @@ done:
return true;
}
-static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
+static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
{
int gen, type, zone;
bool success = false;
@@ -3757,7 +3933,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
/* find the oldest populated generation */
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
gen = lru_gen_from_seq(min_seq[type]);
@@ -3773,13 +3949,17 @@ next:
}
/* see the comment on lru_gen_folio */
- if (can_swap) {
- min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
- min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
+ if (swappiness && swappiness <= MAX_SWAPPINESS) {
+ unsigned long seq = lrugen->max_seq - MIN_NR_GENS;
+
+ if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq)
+ min_seq[LRU_GEN_ANON] = seq;
+ else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq)
+ min_seq[LRU_GEN_FILE] = seq;
}
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
- if (min_seq[type] == lrugen->min_seq[type])
+ for_each_evictable_type(type, swappiness) {
+ if (min_seq[type] <= lrugen->min_seq[type])
continue;
reset_ctrl_pos(lruvec, type, true);
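The min_seq balancing added in the hunk above replaces the old anon/file coupling: when both types are evictable, a type whose oldest generation has advanced past max_seq - MIN_NR_GENS is pulled back to that point if the other type still lags behind it. A standalone restatement with invented sequence numbers; MIN_NR_GENS of 2 matches the kernel default.

#include <stdio.h>

#define MIN_NR_GENS 2

int main(void)
{
	unsigned long max_seq = 10;
	unsigned long min_seq[2] = { 9, 5 };	/* [0] = anon, [1] = file */
	unsigned long seq = max_seq - MIN_NR_GENS;

	if (min_seq[0] > seq && min_seq[1] < seq)
		min_seq[0] = seq;
	else if (min_seq[1] > seq && min_seq[0] < seq)
		min_seq[1] = seq;

	printf("anon min_seq %lu, file min_seq %lu\n", min_seq[0], min_seq[1]);
	return 0;
}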
@@ -3790,8 +3970,7 @@ next:
return success;
}
-static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness)
{
bool success;
int prev, next;
@@ -3809,13 +3988,11 @@ restart:
if (!success)
goto unlock;
- for (type = ANON_AND_FILE - 1; type >= 0; type--) {
+ for (type = 0; type < ANON_AND_FILE; type++) {
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
continue;
- VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
-
- if (inc_min_seq(lruvec, type, can_swap))
+ if (inc_min_seq(lruvec, type, swappiness))
continue;
spin_unlock_irq(&lruvec->lru_lock);
@@ -3859,7 +4036,7 @@ unlock:
}
static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+ int swappiness, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -3870,7 +4047,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq));
if (!mm_state)
- return inc_max_seq(lruvec, seq, can_swap, force_scan);
+ return inc_max_seq(lruvec, seq, swappiness);
/* see the comment in iterate_mm_list() */
if (seq <= READ_ONCE(mm_state->seq))
@@ -3895,7 +4072,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
walk->lruvec = lruvec;
walk->seq = seq;
- walk->can_swap = can_swap;
+ walk->swappiness = swappiness;
walk->force_scan = force_scan;
do {
@@ -3905,7 +4082,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
} while (mm);
done:
if (success) {
- success = inc_max_seq(lruvec, seq, can_swap, force_scan);
+ success = inc_max_seq(lruvec, seq, swappiness);
WARN_ON_ONCE(!success);
}
@@ -3946,13 +4123,13 @@ static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
int gen, type, zone;
unsigned long total = 0;
- bool can_swap = get_swappiness(lruvec, sc);
+ int swappiness = get_swappiness(lruvec, sc);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
unsigned long seq;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
@@ -3972,6 +4149,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
{
int gen;
unsigned long birth;
+ int swappiness = get_swappiness(lruvec, sc);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);
@@ -3981,8 +4159,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
if (!lruvec_is_sizable(lruvec, sc))
return false;
- /* see the comment on lru_gen_folio */
- gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness));
birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
return time_is_before_jiffies(birth + min_ttl);
@@ -4041,21 +4218,22 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
int i;
+ bool dirty;
unsigned long start;
unsigned long end;
struct lru_gen_mm_walk *walk;
+ struct folio *last = NULL;
int young = 1;
pte_t *pte = pvmw->pte;
unsigned long addr = pvmw->address;
struct vm_area_struct *vma = pvmw->vma;
struct folio *folio = pfn_folio(pvmw->pfn);
- bool can_swap = !folio_is_file_lru(folio);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
DEFINE_MAX_SEQ(lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
lockdep_assert_held(pvmw->ptl);
VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
@@ -4102,37 +4280,28 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
if (pfn == -1)
continue;
- folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
continue;
if (!ptep_clear_young_notify(vma, addr, pte + i))
continue;
- young++;
-
- if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
-
- if (walk) {
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
- continue;
+ last = folio;
+ dirty = false;
}
- old_gen = folio_lru_gen(folio);
- if (old_gen < 0)
- folio_set_referenced(folio);
- else if (old_gen != new_gen) {
- folio_clear_lru_refs(folio);
- folio_activate(folio);
- }
+ if (pte_dirty(ptent))
+ dirty = true;
+
+ young++;
}
+ walk_update_folio(walk, last, gen, dirty);
+
arch_leave_lazy_mmu_mode();
/* feedback from rmap walkers to page table walkers */
@@ -4290,7 +4459,8 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
int zone = folio_zonenum(folio);
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
- int tier = lru_tier_from_refs(refs);
+ bool workingset = folio_test_workingset(folio);
+ int tier = lru_tier_from_refs(refs, workingset);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
@@ -4312,14 +4482,17 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* protected */
- if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
- int hist = lru_hist_from_seq(lrugen->min_seq[type]);
-
+ if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) {
gen = folio_inc_gen(lruvec, folio, false);
- list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
- WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
- lrugen->protected[hist][type][tier - 1] + delta);
+ /* don't count the workingset being lazily promoted */
+ if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) {
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
+
+ WRITE_ONCE(lrugen->protected[hist][type][tier],
+ lrugen->protected[hist][type][tier] + delta);
+ }
return true;
}
@@ -4339,8 +4512,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* waiting for writeback */
- if (folio_test_locked(folio) || writeback ||
- (type == LRU_GEN_FILE && dirty)) {
+ if (writeback || (type == LRU_GEN_FILE && dirty)) {
gen = folio_inc_gen(lruvec, folio, true);
list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
@@ -4369,13 +4541,12 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
return false;
}
- /* see the comment on MAX_NR_TIERS */
+ /* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio))
- folio_clear_lru_refs(folio);
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, 0);
/* for shrink_folio_list() */
folio_clear_reclaim(folio);
- folio_clear_referenced(folio);
success = lru_gen_del_folio(lruvec, folio, true);
VM_WARN_ON_ONCE_FOLIO(!success, folio);
@@ -4445,13 +4616,13 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
break;
}
- item = PGSCAN_KSWAPD + reclaimer_offset();
+ item = PGSCAN_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc)) {
__count_vm_events(item, isolated);
__count_vm_events(PGREFILL, sorted);
}
- __count_memcg_events(memcg, item, isolated);
- __count_memcg_events(memcg, PGREFILL, sorted);
+ count_memcg_events(memcg, item, isolated);
+ count_memcg_events(memcg, PGREFILL, sorted);
__count_vm_events(PGSCAN_ANON + type, isolated);
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
scanned, skipped, isolated,
@@ -4471,13 +4642,13 @@ static int get_tier_idx(struct lruvec *lruvec, int type)
struct ctrl_pos sp, pv;
/*
- * To leave a margin for fluctuations, use a larger gain factor (1:2).
+ * To leave a margin for fluctuations, use a larger gain factor (2:3).
* This value is chosen because any other tier would have at least twice
* as many refaults as the first tier.
*/
- read_ctrl_pos(lruvec, type, 0, 1, &sp);
+ read_ctrl_pos(lruvec, type, 0, 2, &sp);
for (tier = 1; tier < MAX_NR_TIERS; tier++) {
- read_ctrl_pos(lruvec, type, tier, 2, &pv);
+ read_ctrl_pos(lruvec, type, tier, 3, &pv);
if (!positive_ctrl_err(&sp, &pv))
break;
}
@@ -4485,79 +4656,45 @@ static int get_tier_idx(struct lruvec *lruvec, int type)
return tier - 1;
}
-static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
+static int get_type_to_scan(struct lruvec *lruvec, int swappiness)
{
- int type, tier;
struct ctrl_pos sp, pv;
- int gain[ANON_AND_FILE] = { swappiness, MAX_SWAPPINESS - swappiness };
+ if (swappiness <= MIN_SWAPPINESS + 1)
+ return LRU_GEN_FILE;
+
+ if (swappiness >= MAX_SWAPPINESS)
+ return LRU_GEN_ANON;
/*
- * Compare the first tier of anon with that of file to determine which
- * type to scan. Also need to compare other tiers of the selected type
- * with the first tier of the other type to determine the last tier (of
- * the selected type) to evict.
+ * Compare the sum of all tiers of anon with that of file to determine
+ * which type to scan.
*/
- read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
- read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
- type = positive_ctrl_err(&sp, &pv);
+ read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp);
+ read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv);
- read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
- for (tier = 1; tier < MAX_NR_TIERS; tier++) {
- read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
- if (!positive_ctrl_err(&sp, &pv))
- break;
- }
-
- *tier_idx = tier - 1;
-
- return type;
+ return positive_ctrl_err(&sp, &pv);
}
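get_type_to_scan() now short-circuits the swappiness extremes and otherwise compares the whole-list refault statistics of anon and file. The sketch below models that comparison with a simplified ratio test: positive_ctrl_err() in the kernel adds slack terms that are omitted here, and the cost numbers are made up.

#include <stdio.h>

#define MAX_SWAPPINESS 200

struct ctrl_pos {
	unsigned long refaulted;
	unsigned long total;
	int gain;
};

/* true if the candidate (pv) refaults no more, relative to its gain,
 * than the reference (sp); the kernel adds slack terms omitted here */
static int candidate_ok(const struct ctrl_pos *sp, const struct ctrl_pos *pv)
{
	return (unsigned long long)pv->refaulted * sp->total * sp->gain <=
	       (unsigned long long)sp->refaulted * pv->total * pv->gain;
}

int main(void)
{
	int swappiness = 60;
	struct ctrl_pos anon = { .refaulted = 40,  .total = 1000, .gain = swappiness };
	struct ctrl_pos file = { .refaulted = 300, .total = 4000, .gain = MAX_SWAPPINESS - swappiness };

	/* mirrors the return value above: 0 scans anon, 1 scans file */
	printf("scan %s\n", candidate_ok(&anon, &file) ? "file" : "anon");
	return 0;
}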
static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
int *type_scanned, struct list_head *list)
{
int i;
- int type;
- int scanned;
- int tier = -1;
- DEFINE_MIN_SEQ(lruvec);
+ int type = get_type_to_scan(lruvec, swappiness);
- /*
- * Try to make the obvious choice first, and if anon and file are both
- * available from the same generation,
- * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon
- * first.
- * 2. If !__GFP_IO, file first since clean pagecache is more likely to
- * exist than clean swapcache.
- */
- if (!swappiness)
- type = LRU_GEN_FILE;
- else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
- type = LRU_GEN_ANON;
- else if (swappiness == 1)
- type = LRU_GEN_FILE;
- else if (swappiness == MAX_SWAPPINESS)
- type = LRU_GEN_ANON;
- else if (!(sc->gfp_mask & __GFP_IO))
- type = LRU_GEN_FILE;
- else
- type = get_type_to_scan(lruvec, swappiness, &tier);
+ for_each_evictable_type(i, swappiness) {
+ int scanned;
+ int tier = get_tier_idx(lruvec, type);
- for (i = !swappiness; i < ANON_AND_FILE; i++) {
- if (tier < 0)
- tier = get_tier_idx(lruvec, type);
+ *type_scanned = type;
scanned = scan_folios(lruvec, sc, type, tier, list);
if (scanned)
- break;
+ return scanned;
type = !type;
- tier = -1;
}
- *type_scanned = type;
-
- return scanned;
+ return 0;
}
static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
@@ -4573,6 +4710,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
struct reclaim_stat stat;
struct lru_gen_mm_walk *walk;
bool skip_retry = false;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -4582,7 +4720,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
scanned += try_to_inc_min_seq(lruvec, swappiness);
- if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
+ if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq)
scanned = 0;
spin_unlock_irq(&lruvec->lru_lock);
@@ -4590,7 +4728,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
if (list_empty(&list))
return scanned;
retry:
- reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+ reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg);
sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
sc->nr_reclaimed += reclaimed;
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
@@ -4598,31 +4736,24 @@ retry:
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
list_for_each_entry_safe_reverse(folio, next, &list, lru) {
+ DEFINE_MIN_SEQ(lruvec);
+
if (!folio_evictable(folio)) {
list_del(&folio->lru);
folio_putback_lru(folio);
continue;
}
- if (folio_test_reclaim(folio) &&
- (folio_test_dirty(folio) || folio_test_writeback(folio))) {
- /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
- if (folio_test_workingset(folio))
- folio_set_referenced(folio);
- continue;
- }
-
- if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
- folio_mapped(folio) || folio_test_locked(folio) ||
- folio_test_dirty(folio) || folio_test_writeback(folio)) {
- /* don't add rejected folios to the oldest generation */
- set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
- BIT(PG_active));
+ /* retry folios that may have missed folio_rotate_reclaimable() */
+ if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) &&
+ !folio_test_dirty(folio) && !folio_test_writeback(folio)) {
+ list_move(&folio->lru, &clean);
continue;
}
- /* retry folios that may have missed folio_rotate_reclaimable() */
- list_move(&folio->lru, &clean);
+ /* don't add rejected folios to the oldest generation */
+ if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
+ set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
}
spin_lock_irq(&lruvec->lru_lock);
@@ -4635,10 +4766,13 @@ retry:
reset_batch_size(walk);
}
- item = PGSTEAL_KSWAPD + reclaimer_offset();
+ __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
+ stat.nr_demoted);
+
+ item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, reclaimed);
- __count_memcg_events(memcg, item, reclaimed);
+ count_memcg_events(memcg, item, reclaimed);
__count_vm_events(PGSTEAL_ANON + type, reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
@@ -4654,63 +4788,32 @@ retry:
}
static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
- bool can_swap, unsigned long *nr_to_scan)
+ int swappiness, unsigned long *nr_to_scan)
{
int gen, type, zone;
- unsigned long old = 0;
- unsigned long young = 0;
- unsigned long total = 0;
+ unsigned long size = 0;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
- /* whether this lruvec is completely out of cold folios */
- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
- *nr_to_scan = 0;
+ *nr_to_scan = 0;
+ /* have to run aging, since eviction is not possible anymore */
+ if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq)
return true;
- }
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
unsigned long seq;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
- unsigned long size = 0;
-
gen = lru_gen_from_seq(seq);
for (zone = 0; zone < MAX_NR_ZONES; zone++)
size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
-
- total += size;
- if (seq == max_seq)
- young += size;
- else if (seq + MIN_NR_GENS == max_seq)
- old += size;
}
}
- *nr_to_scan = total;
-
- /*
- * The aging tries to be lazy to reduce the overhead, while the eviction
- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
- * ideal number of generations is MIN_NR_GENS+1.
- */
- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
- return false;
-
- /*
- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
- * of the total number of pages for each generation. A reasonable range
- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
- * aging cares about the upper bound of hot pages, while the eviction
- * cares about the lower bound of cold pages.
- */
- if (young * MIN_NR_GENS > total)
- return true;
- if (old * (MIN_NR_GENS + 2) < total)
- return true;
-
- return false;
+ *nr_to_scan = size;
+ /* better to run aging even though eviction is still possible */
+ return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq;
}
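After the rewrite, should_run_aging() reduces to two comparisons against max_seq plus a size sum for nr_to_scan. The tiny program below restates just the decision part; MIN_NR_GENS of 2 matches the kernel default and the sequence numbers are arbitrary.

#include <stdio.h>

#define MIN_NR_GENS 2

static const char *aging_decision(unsigned long evictable_min_seq,
				  unsigned long max_seq)
{
	if (evictable_min_seq + MIN_NR_GENS > max_seq)
		return "must age (eviction no longer possible)";
	if (evictable_min_seq + MIN_NR_GENS == max_seq)
		return "should age (eviction still possible)";
	return "no aging needed";
}

int main(void)
{
	printf("%s\n", aging_decision(9, 10));	/* 9 + 2 > 10 */
	printf("%s\n", aging_decision(8, 10));	/* 8 + 2 == 10 */
	printf("%s\n", aging_decision(5, 10));
	return 0;
}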
/*
@@ -4718,7 +4821,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
* reclaim.
*/
-static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
{
bool success;
unsigned long nr_to_scan;
@@ -4728,7 +4831,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
return -1;
- success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan);
+ success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan);
/* try to scrape all its memory if this memcg was deleted */
if (nr_to_scan && !mem_cgroup_online(memcg))
@@ -4739,7 +4842,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
return nr_to_scan >> sc->priority;
/* stop scanning this lruvec as it's low on cold folios */
- return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0;
+ return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0;
}
static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
@@ -5283,8 +5386,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
s = "rep";
n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier]);
}
for (i = 0; i < 3; i++)
@@ -5339,7 +5441,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v)
seq_printf(m, " node %5d\n", nid);
if (!full)
- seq = min_seq[LRU_GEN_ANON];
+ seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2);
else if (max_seq >= MAX_NR_GENS)
seq = max_seq - MAX_NR_GENS + 1;
else
@@ -5379,23 +5481,14 @@ static const struct seq_operations lru_gen_seq_ops = {
};
static int run_aging(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+ int swappiness, bool force_scan)
{
DEFINE_MAX_SEQ(lruvec);
- DEFINE_MIN_SEQ(lruvec);
-
- if (seq < max_seq)
- return 0;
if (seq > max_seq)
return -EINVAL;
- if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
- return -ERANGE;
-
- try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan);
-
- return 0;
+ return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 0 : -EEXIST;
}
static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
@@ -5411,7 +5504,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co
while (!signal_pending(current)) {
DEFINE_MIN_SEQ(lruvec);
- if (seq < min_seq[!swappiness])
+ if (seq < evictable_min_seq(min_seq, swappiness))
return 0;
if (sc->nr_reclaimed >= nr_to_reclaim)
@@ -5456,7 +5549,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (swappiness < MIN_SWAPPINESS)
swappiness = get_swappiness(lruvec, sc);
- else if (swappiness > MAX_SWAPPINESS)
+ else if (swappiness > SWAPPINESS_ANON_ONLY)
goto done;
switch (cmd) {
@@ -5513,24 +5606,35 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
while ((cur = strsep(&next, ",;\n"))) {
int n;
int end;
- char cmd;
+ char cmd, swap_string[5];
unsigned int memcg_id;
unsigned int nid;
unsigned long seq;
- unsigned int swappiness = -1;
+ unsigned int swappiness;
unsigned long opt = -1;
cur = skip_spaces(cur);
if (!*cur)
continue;
- n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
- &seq, &end, &swappiness, &end, &opt, &end);
+ n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, swap_string, &end, &opt, &end);
if (n < 4 || cur[end]) {
err = -EINVAL;
break;
}
+ if (n == 4) {
+ swappiness = -1;
+ } else if (!strcmp("max", swap_string)) {
+ /* set by userspace for anonymous memory only */
+ swappiness = SWAPPINESS_ANON_ONLY;
+ } else {
+ err = kstrtouint(swap_string, 0, &swappiness);
+ if (err)
+ break;
+ }
+
err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
if (err)
break;
@@ -5790,7 +5894,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+ if (can_age_anon_pages(lruvec, sc) &&
inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -5821,6 +5925,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
unsigned long pages_for_compaction;
unsigned long inactive_lru_pages;
int z;
+ struct zone *zone;
/* If not in reclaim/compaction mode, stop */
if (!in_reclaim_compaction(sc))
@@ -5840,17 +5945,16 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
return false;
/* If compaction would go ahead or the allocation would succeed, stop */
- for (z = 0; z <= sc->reclaim_idx; z++) {
- struct zone *zone = &pgdat->node_zones[z];
- if (!managed_zone(zone))
- continue;
+ for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
+ unsigned long watermark = min_wmark_pages(zone);
/* Allocation can already succeed, nothing to do */
- if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
+ if (zone_watermark_ok(zone, sc->order, watermark,
sc->reclaim_idx, 0))
return false;
- if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
+ if (compaction_suitable(zone, sc->order, watermark,
+ sc->reclaim_idx))
return false;
}
@@ -6077,22 +6181,21 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
sc->reclaim_idx, 0))
return true;
- /* Compaction cannot yet proceed. Do reclaim. */
- if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
- return false;
-
/*
- * Compaction is already possible, but it takes time to run and there
- * are potentially other callers using the pages just freed. So proceed
- * with reclaim to make a buffer of free pages available to give
- * compaction a reasonable chance of completing and allocating the page.
+ * Direct reclaim usually targets the min watermark, but compaction
+ * takes time to run and there are potentially other callers using the
+ * pages just freed. So target a higher buffer to give compaction a
+ * reasonable chance of completing and allocating the pages.
+ *
* Note that we won't actually reclaim the whole buffer in one attempt
* as the target watermark in should_continue_reclaim() is lower. But if
* we are already above the high+gap watermark, don't reclaim at all.
*/
- watermark = high_wmark_pages(zone) + compact_gap(sc->order);
+ watermark = high_wmark_pages(zone);
+ if (compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx))
+ return true;
- return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
+ return false;
}
static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
@@ -6371,11 +6474,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
return true;
- for (i = 0; i <= ZONE_NORMAL; i++) {
- zone = &pgdat->node_zones[i];
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
if (!zone_reclaimable_pages(zone))
continue;
@@ -6626,10 +6725,10 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
return;
}
- if (!can_age_anon_pages(pgdat, sc))
+ lruvec = mem_cgroup_lruvec(NULL, pgdat);
+ if (!can_age_anon_pages(lruvec, sc))
return;
- lruvec = mem_cgroup_lruvec(NULL, pgdat);
if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
return;
@@ -6680,17 +6779,48 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
* Check watermarks bottom-up as lower zones are more likely to
* meet watermarks.
*/
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
+ enum zone_stat_item item;
+ unsigned long free_pages;
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
mark = promo_wmark_pages(zone);
else
mark = high_wmark_pages(zone);
- if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
+
+ /*
+ * In defrag_mode, watermarks must be met in whole
+ * blocks to avoid polluting allocator fallbacks.
+ *
+ * However, kswapd usually cannot accomplish this on
+ * its own and needs kcompactd support. Once it's
+ * reclaimed a compaction gap, and kswapd_shrink_node
+ * has dropped order, simply ensure there are enough
+ * base pages for compaction, wake kcompactd & sleep.
+ */
+ if (defrag_mode && order)
+ item = NR_FREE_PAGES_BLOCKS;
+ else
+ item = NR_FREE_PAGES;
+
+ /*
+ * When there is a high number of CPUs in the system,
+ * the cumulative error from the vmstat per-cpu cache
+ * can blur the line between the watermarks. In that
+ * case, be safe and get an accurate snapshot.
+ *
+ * TODO: NR_FREE_PAGES_BLOCKS moves in steps of
+ * pageblock_nr_pages, while the vmstat pcp threshold
+ * is limited to 125. On many configurations that
+ * counter won't actually be per-cpu cached. But keep
+ * things simple for now; revisit when somebody cares.
+ */
+ free_pages = zone_page_state(zone, item);
+ if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(zone, item);
+
+ if (__zone_watermark_ok(zone, order, mark, highest_zoneidx,
+ 0, free_pages))
return true;
}
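The pgdat_balanced() change above reads the cheap per-CPU-cached counter first and only falls back to an exact snapshot when the cached value is close enough to the watermark for drift to matter. The sketch below captures that decision with invented numbers; zone_page_state_snapshot() is modelled as a precomputed exact_free field rather than a real per-CPU sum.

#include <stdbool.h>
#include <stdio.h>

struct zone_sketch {
	unsigned long cached_free;	/* cheap, possibly stale sum */
	unsigned long exact_free;	/* expensive snapshot */
	unsigned long drift_mark;
	unsigned long high_wmark;
};

static bool watermark_ok(const struct zone_sketch *z)
{
	unsigned long free_pages = z->cached_free;

	if (z->drift_mark && free_pages < z->drift_mark)
		free_pages = z->exact_free;	/* pay for the exact count */

	return free_pages >= z->high_wmark;
}

int main(void)
{
	struct zone_sketch z = {
		.cached_free = 950, .exact_free = 1020,
		.drift_mark = 1100, .high_wmark = 1000,
	};

	printf("balanced: %s\n", watermark_ok(&z) ? "yes" : "no");
	return 0;
}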
@@ -6770,11 +6900,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
/* Reclaim a number of pages proportional to the number of zones */
sc->nr_to_reclaim = 0;
- for (z = 0; z <= sc->reclaim_idx; z++) {
- zone = pgdat->node_zones + z;
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
}
@@ -6805,12 +6931,7 @@ update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
int i;
struct zone *zone;
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
if (active)
set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
else
@@ -6871,11 +6992,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
* stall or direct reclaim until kswapd is finished.
*/
nr_boost_reclaim = 0;
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
nr_boost_reclaim += zone->watermark_boost;
zone_boosts[i] = zone->watermark_boost;
}
@@ -7182,10 +7299,6 @@ static int kswapd(void *p)
unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t *)p;
struct task_struct *tsk = current;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
-
- if (!cpumask_empty(cpumask))
- set_cpus_allowed_ptr(tsk, cpumask);
/*
* Tell the memory management that we're a "memory allocator",
@@ -7354,13 +7467,15 @@ void __meminit kswapd_run(int nid)
pgdat_kswapd_lock(pgdat);
if (!pgdat->kswapd) {
- pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+ pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
pr_err("Failed to start kswapd on node %d,ret=%ld\n",
nid, PTR_ERR(pgdat->kswapd));
BUG_ON(system_state < SYSTEM_RUNNING);
pgdat->kswapd = NULL;
+ } else {
+ wake_up_process(pgdat->kswapd);
}
}
pgdat_kswapd_unlock(pgdat);
@@ -7384,6 +7499,28 @@ void __meminit kswapd_stop(int nid)
pgdat_kswapd_unlock(pgdat);
}
+static const struct ctl_table vmscan_sysctl_table[] = {
+ {
+ .procname = "swappiness",
+ .data = &vm_swappiness,
+ .maxlen = sizeof(vm_swappiness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO_HUNDRED,
+ },
+#ifdef CONFIG_NUMA
+ {
+ .procname = "zone_reclaim_mode",
+ .data = &node_reclaim_mode,
+ .maxlen = sizeof(node_reclaim_mode),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ }
+#endif
+};
+
static int __init kswapd_init(void)
{
int nid;
@@ -7391,6 +7528,7 @@ static int __init kswapd_init(void)
swap_setup();
for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
+ register_sysctl_init("vm", vmscan_sysctl_table);
return 0;
}
@@ -7556,11 +7694,11 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
return NODE_RECLAIM_NOSCAN;
- if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+ if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
return NODE_RECLAIM_NOSCAN;
ret = __node_reclaim(pgdat, gfp_mask, order);
- clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+ clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
if (ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS);
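Switching node_reclaim() to test_and_set_bit_lock()/clear_bit_unlock() makes the acquire/release pairing of the per-node reclaim flag explicit. The userspace analogue below uses C11 atomics on a single flag bit; the flags word and bit name are placeholders, not the kernel's pgdat->flags layout.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define RECLAIM_LOCKED (1UL << 0)

static _Atomic unsigned long pgdat_flags;

static bool trylock_reclaim(void)
{
	unsigned long old = atomic_fetch_or_explicit(&pgdat_flags, RECLAIM_LOCKED,
						     memory_order_acquire);
	return !(old & RECLAIM_LOCKED);	/* false: someone else is reclaiming */
}

static void unlock_reclaim(void)
{
	atomic_fetch_and_explicit(&pgdat_flags, ~RECLAIM_LOCKED,
				  memory_order_release);
}

int main(void)
{
	if (trylock_reclaim()) {
		printf("reclaiming...\n");
		unlock_reclaim();
	} else {
		printf("already locked, skip\n");
	}
	return 0;
}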
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4d016314a56c..429ae5339bfe 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -7,7 +7,7 @@
*
* zoned VM statistics
* Copyright (C) 2006 Silicon Graphics, Inc.,
- * Christoph Lameter <christoph@lameter.com>
+ * Christoph Lameter <cl@gentwo.org>
* Copyright (C) 2008-2014 Christoph Lameter
*/
#include <linux/fs.h>
@@ -31,8 +31,10 @@
#include "internal.h"
+#ifdef CONFIG_PROC_FS
#ifdef CONFIG_NUMA
-int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
+#define ENABLE_NUMA_STAT 1
+static int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
/* zero numa counters within a zone */
static void zero_zone_numa_counters(struct zone *zone)
@@ -74,7 +76,7 @@ static void invalid_numa_statistics(void)
static DEFINE_MUTEX(vm_numa_stat_lock);
-int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write,
+static int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int ret, oldval;
@@ -102,6 +104,7 @@ out:
return ret;
}
#endif
+#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -1190,6 +1193,7 @@ int fragmentation_index(struct zone *zone, unsigned int order)
const char * const vmstat_text[] = {
/* enum zone_stat_item counters */
"nr_free_pages",
+ "nr_free_pages_blocks",
"nr_zone_inactive_anon",
"nr_zone_active_anon",
"nr_zone_inactive_file",
@@ -1197,7 +1201,6 @@ const char * const vmstat_text[] = {
"nr_zone_unevictable",
"nr_zone_write_pending",
"nr_mlock",
- "nr_bounce",
#if IS_ENABLED(CONFIG_ZSMALLOC)
"nr_zspages",
#endif
@@ -1273,9 +1276,11 @@ const char * const vmstat_text[] = {
"pgdemote_kswapd",
"pgdemote_direct",
"pgdemote_khugepaged",
+ "pgdemote_proactive",
#ifdef CONFIG_HUGETLB_PAGE
"nr_hugetlb",
#endif
+ "nr_balloon_pages",
/* system-wide enum vm_stat_item counters */
"nr_dirty_threshold",
"nr_dirty_background_threshold",
@@ -1307,9 +1312,11 @@ const char * const vmstat_text[] = {
"pgsteal_kswapd",
"pgsteal_direct",
"pgsteal_khugepaged",
+ "pgsteal_proactive",
"pgscan_kswapd",
"pgscan_direct",
"pgscan_khugepaged",
+ "pgscan_proactive",
"pgscan_direct_throttle",
"pgscan_anon",
"pgscan_file",
@@ -1339,6 +1346,8 @@ const char * const vmstat_text[] = {
"numa_hint_faults",
"numa_hint_faults_local",
"numa_pages_migrated",
+ "numa_task_migrated",
+ "numa_task_swapped",
#endif
#ifdef CONFIG_MIGRATION
"pgmigrate_success",
@@ -1435,6 +1444,8 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_X86
"direct_map_level2_splits",
"direct_map_level3_splits",
+ "direct_map_level2_collapses",
+ "direct_map_level3_collapses",
#endif
#ifdef CONFIG_PER_VMA_LOCK_STATS
"vma_lock_success",
@@ -1938,7 +1949,7 @@ static const struct seq_operations vmstat_op = {
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
-int sysctl_stat_interval __read_mostly = HZ;
+static int sysctl_stat_interval __read_mostly = HZ;
static int vmstat_late_init_done;
#ifdef CONFIG_PROC_FS
@@ -1947,7 +1958,7 @@ static void refresh_vm_stats(struct work_struct *work)
refresh_cpu_vm_stats(true);
}
-int vmstat_refresh(const struct ctl_table *table, int write,
+static int vmstat_refresh(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
long val;
@@ -2122,10 +2133,20 @@ static void __init start_shepherd_timer(void)
{
int cpu;
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
+ /*
+ * For secondary CPUs during CPU hotplug scenarios,
+ * vmstat_cpu_online() will enable the work.
+ * mm/vmstat:online enables and disables vmstat_work
+ * symmetrically during CPU hotplug events.
+ */
+ if (!cpu_online(cpu))
+ disable_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ }
+
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
}
@@ -2148,13 +2169,14 @@ static int vmstat_cpu_online(unsigned int cpu)
if (!node_state(cpu_to_node(cpu), N_CPU)) {
node_set_state(cpu_to_node(cpu), N_CPU);
}
+ enable_delayed_work(&per_cpu(vmstat_work, cpu));
return 0;
}
static int vmstat_cpu_down_prep(unsigned int cpu)
{
- cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ disable_delayed_work_sync(&per_cpu(vmstat_work, cpu));
return 0;
}
@@ -2185,6 +2207,38 @@ static int __init vmstat_late_init(void)
late_initcall(vmstat_late_init);
#endif
+#ifdef CONFIG_PROC_FS
+static const struct ctl_table vmstat_table[] = {
+#ifdef CONFIG_SMP
+ {
+ .procname = "stat_interval",
+ .data = &sysctl_stat_interval,
+ .maxlen = sizeof(sysctl_stat_interval),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "stat_refresh",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0600,
+ .proc_handler = vmstat_refresh,
+ },
+#endif
+#ifdef CONFIG_NUMA
+ {
+ .procname = "numa_stat",
+ .data = &sysctl_vm_numa_stat,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_vm_numa_stat_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
+};
+#endif
+
struct workqueue_struct *mm_percpu_wq;
void __init init_mm_internals(void)
@@ -2216,6 +2270,7 @@ void __init init_mm_internals(void)
proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
+ register_sysctl_init("vm", vmstat_table);
#endif
}
diff --git a/mm/workingset.c b/mm/workingset.c
index a4705e196545..6e7f4cb1b9a7 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -239,7 +239,8 @@ static void *lru_gen_eviction(struct folio *folio)
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
- int tier = lru_tier_from_refs(refs);
+ bool workingset = folio_test_workingset(folio);
+ int tier = lru_tier_from_refs(refs, workingset);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
@@ -253,18 +254,18 @@ static void *lru_gen_eviction(struct folio *folio)
hist = lru_hist_from_seq(min_seq);
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
- return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
}
/*
* Tests if the shadow entry is for a folio that was recently evicted.
* Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
*/
-static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
+static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
unsigned long *token, bool *workingset)
{
int memcg_id;
- unsigned long min_seq;
+ unsigned long max_seq;
struct mem_cgroup *memcg;
struct pglist_data *pgdat;
@@ -273,8 +274,10 @@ static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
memcg = mem_cgroup_from_id(memcg_id);
*lruvec = mem_cgroup_lruvec(memcg, pgdat);
- min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
- return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
+ max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
+ max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH;
+
+ return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
}
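The reworked lru_gen_test_recent() above keys recency off max_seq rather than min_seq: the shadow entry stores a truncated copy of max_seq at eviction time, and a refault counts as recent while the live max_seq has advanced by fewer than MAX_NR_GENS. A standalone restatement follows, with a 20-bit token width standing in for the kernel's EVICTION_MASK and MAX_NR_GENS taken as 4.

#include <stdbool.h>
#include <stdio.h>

#define MAX_NR_GENS	4
#define SEQ_MASK	((1UL << 20) - 1)	/* stand-in for the shadow bit budget */

static unsigned long abs_diff(unsigned long a, unsigned long b)
{
	return a > b ? a - b : b - a;
}

static bool test_recent(unsigned long token_seq, unsigned long max_seq)
{
	return abs_diff(max_seq & SEQ_MASK, token_seq & SEQ_MASK) < MAX_NR_GENS;
}

int main(void)
{
	printf("evicted at 100, now 102: %s\n", test_recent(100, 102) ? "recent" : "stale");
	printf("evicted at 100, now 110: %s\n", test_recent(100, 110) ? "recent" : "stale");
	return 0;
}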
static void lru_gen_refault(struct folio *folio, void *shadow)
@@ -290,7 +293,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
rcu_read_lock();
- recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset);
+ recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset);
if (lruvec != folio_lruvec(folio))
goto unlock;
@@ -302,24 +305,20 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
lrugen = &lruvec->lrugen;
hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
- /* see the comment in folio_lru_refs() */
- refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
- tier = lru_tier_from_refs(refs);
+ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
+ tier = lru_tier_from_refs(refs, workingset);
atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
- mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
- /*
- * Count the following two cases as stalls:
- * 1. For pages accessed through page tables, hotter pages pushed out
- * hot pages which refaulted immediately.
- * 2. For pages accessed multiple times through file descriptors,
- * they would have been protected by sort_folio().
- */
- if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) {
- set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset));
+ /* see folio_add_lru() where folio_set_active() will be called */
+ if (lru_gen_in_fault())
+ mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
+
+ if (workingset) {
+ folio_set_workingset(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
- }
+ } else
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
unlock:
rcu_read_unlock();
}
@@ -331,7 +330,7 @@ static void *lru_gen_eviction(struct folio *folio)
return NULL;
}
-static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
+static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
unsigned long *token, bool *workingset)
{
return false;
@@ -428,17 +427,16 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset,
struct pglist_data *pgdat;
unsigned long eviction;
- rcu_read_lock();
-
if (lru_gen_enabled()) {
- bool recent = lru_gen_test_recent(shadow, file,
- &eviction_lruvec, &eviction, workingset);
+ bool recent;
+ rcu_read_lock();
+ recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset);
rcu_read_unlock();
return recent;
}
-
+ rcu_read_lock();
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
eviction <<= bucket_order;
@@ -459,14 +457,12 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset,
* configurations instead.
*/
eviction_memcg = mem_cgroup_from_id(memcgid);
- if (!mem_cgroup_disabled() &&
- (!eviction_memcg || !mem_cgroup_tryget(eviction_memcg))) {
- rcu_read_unlock();
- return false;
- }
-
+ if (!mem_cgroup_tryget(eviction_memcg))
+ eviction_memcg = NULL;
rcu_read_unlock();
+ if (!mem_cgroup_disabled() && !eviction_memcg)
+ return false;
/*
* Flush stats (and potentially sleep) outside the RCU read section.
*
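
The reshuffling above follows a general pattern: pin the memcg with a tryget while still under RCU, drop the RCU read lock, and only then decide whether to bail out or go on to work that may sleep (the stats flush). A toy, self-contained illustration of that shape, using placeholder lock and refcount helpers rather than the kernel's RCU or memcg APIs:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for rcu_read_lock()/rcu_read_unlock(); no real RCU here. */
static void read_side_lock(void)   { }
static void read_side_unlock(void) { }

struct obj {
	atomic_int refcount;		/* 0 means the object is being torn down */
};

/* Take a reference only if the object is still live (cf. a tryget helper). */
static bool obj_tryget(struct obj *o)
{
	int old = atomic_load(&o->refcount);

	while (old > 0)
		if (atomic_compare_exchange_weak(&o->refcount, &old, old + 1))
			return true;
	return false;
}

/* Pin under the read lock, drop the lock, and only then do sleepable work. */
static struct obj *lookup_and_pin(struct obj *candidate)
{
	struct obj *o = candidate;

	read_side_lock();
	if (o && !obj_tryget(o))
		o = NULL;
	read_side_unlock();		/* safe to sleep from here on */

	return o;
}

int main(void)
{
	struct obj live = { .refcount = 1 };

	printf("%s\n", lookup_and_pin(&live) ? "pinned" : "gone");
	return 0;
}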
@@ -544,6 +540,8 @@ void workingset_refault(struct folio *folio, void *shadow)
bool workingset;
long nr;
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
if (lru_gen_enabled()) {
lru_gen_refault(folio, shadow);
return;
@@ -558,7 +556,6 @@ void workingset_refault(struct folio *folio, void *shadow)
* is actually experiencing the refault event. Make sure the folio is
* locked to guarantee folio_memcg() stability throughout.
*/
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
nr = folio_nr_pages(folio);
memcg = folio_memcg(folio);
pgdat = folio_pgdat(folio);
@@ -615,7 +612,6 @@ struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
- struct address_space *mapping;
struct page *page = virt_to_page(node);
/*
@@ -626,8 +622,7 @@ void workingset_update_node(struct xa_node *node)
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
- mapping = container_of(node->array, struct address_space, i_pages);
- lockdep_assert_held(&mapping->i_pages.xa_lock);
+ lockdep_assert_held(&node->array->xa_lock);
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
diff --git a/mm/z3fold.c b/mm/z3fold.c
deleted file mode 100644
index 379d24b4fef9..000000000000
--- a/mm/z3fold.c
+++ /dev/null
@@ -1,1447 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * z3fold.c
- *
- * Author: Vitaly Wool <vitaly.wool@konsulko.com>
- * Copyright (C) 2016, Sony Mobile Communications Inc.
- *
- * This implementation is based on zbud written by Seth Jennings.
- *
- * z3fold is a special-purpose allocator for storing compressed pages. It
- * can store up to three compressed pages per page, which improves the
- * compression ratio of zbud while retaining its main concepts (e.g. always
- * storing an integral number of objects per page) and simplicity.
- * It still has simple and deterministic reclaim properties that make it
- * preferable to a higher-density approach (with no requirement on an integral
- * number of objects per page) when reclaim is used.
- *
- * As in zbud, pages are divided into "chunks". The size of the chunks is
- * fixed at compile time and is determined by NCHUNKS_ORDER below.
- *
- * z3fold doesn't export any API and is meant to be used via zpool API.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/atomic.h>
-#include <linux/sched.h>
-#include <linux/cpumask.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/page-flags.h>
-#include <linux/migrate.h>
-#include <linux/node.h>
-#include <linux/compaction.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/workqueue.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/zpool.h>
-#include <linux/kmemleak.h>
-
-/*
- * NCHUNKS_ORDER determines the internal allocation granularity, effectively
- * adjusting internal fragmentation. It also determines the number of
- * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
- * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
- * at the beginning of an allocated page are occupied by the z3fold header, so
- * NCHUNKS works out to 63 (or 62 when CONFIG_DEBUG_SPINLOCK=y), which is the
- * maximum number of free chunks in a z3fold page; there are correspondingly
- * 63 (or 62) freelists per pool.
- */
-#define NCHUNKS_ORDER 6
-
-#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
-#define CHUNK_SIZE (1 << CHUNK_SHIFT)
-#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
-#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
-#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
-#define NCHUNKS (TOTAL_CHUNKS - ZHDR_CHUNKS)
-
-#define BUDDY_MASK (0x3)
-#define BUDDY_SHIFT 2
-#define SLOTS_ALIGN (0x40)
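
For reference, the chunk arithmetic the comment above describes works out as follows on a 4 KiB page; PAGE_SHIFT = 12 and a one-chunk header are assumptions made purely for illustration:

/* Assumes PAGE_SHIFT = 12 and a z3fold header that rounds up to one chunk. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define NCHUNKS_ORDER	6
#define CHUNK_SHIFT	(PAGE_SHIFT - NCHUNKS_ORDER)	/* 6 */
#define CHUNK_SIZE	(1UL << CHUNK_SHIFT)		/* 64 bytes */
#define ZHDR_CHUNKS	1				/* assumed: header fits in one chunk */
#define TOTAL_CHUNKS	(PAGE_SIZE >> CHUNK_SHIFT)	/* 64 */
#define NCHUNKS		(TOTAL_CHUNKS - ZHDR_CHUNKS)	/* 63 usable chunks */

int main(void)
{
	printf("chunk size: %lu bytes, usable chunks: %lu\n",
	       CHUNK_SIZE, (unsigned long)NCHUNKS);
	return 0;
}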
-
-/*****************
- * Structures
-*****************/
-struct z3fold_pool;
-
-enum buddy {
- HEADLESS = 0,
- FIRST,
- MIDDLE,
- LAST,
- BUDDIES_MAX = LAST
-};
-
-struct z3fold_buddy_slots {
- /*
- * we are using BUDDY_MASK in handle_to_buddy etc. so there should
- * be enough slots to hold all possible variants
- */
- unsigned long slot[BUDDY_MASK + 1];
- unsigned long pool; /* back link */
- rwlock_t lock;
-};
-#define HANDLE_FLAG_MASK (0x03)
-
-/*
- * struct z3fold_header - z3fold page metadata occupying first chunks of each
- * z3fold page, except for HEADLESS pages
- * @buddy: links the z3fold page into the relevant list in the
- * pool
- * @page_lock: per-page lock
- * @refcount: reference count for the z3fold page
- * @work: work_struct for page layout optimization
- * @slots: pointer to the structure holding buddy slots
- * @pool: pointer to the containing pool
- * @cpu: CPU which this page "belongs" to
- * @first_chunks: the size of the first buddy in chunks, 0 if free
- * @middle_chunks: the size of the middle buddy in chunks, 0 if free
- * @last_chunks: the size of the last buddy in chunks, 0 if free
- * @first_num: the starting number (for the first handle)
- * @mapped_count: the number of objects currently mapped
- */
-struct z3fold_header {
- struct list_head buddy;
- spinlock_t page_lock;
- struct kref refcount;
- struct work_struct work;
- struct z3fold_buddy_slots *slots;
- struct z3fold_pool *pool;
- short cpu;
- unsigned short first_chunks;
- unsigned short middle_chunks;
- unsigned short last_chunks;
- unsigned short start_middle;
- unsigned short first_num:2;
- unsigned short mapped_count:2;
- unsigned short foreign_handles:2;
-};
-
-/**
- * struct z3fold_pool - stores metadata for each z3fold pool
- * @name: pool name
- * @lock: protects pool unbuddied lists
- * @stale_lock: protects pool stale page list
- * @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2-
- * buddies; the list each z3fold page is added to depends on
- * the size of its free region.
- * @stale: list of pages marked for freeing
- * @pages_nr: number of z3fold pages in the pool.
- * @c_handle: cache for z3fold_buddy_slots allocation
- * @compact_wq: workqueue for page layout background optimization
- * @release_wq: workqueue for safe page release
- * @work: work_struct for safe page release
- *
- * This structure is allocated at pool creation time and maintains metadata
- * pertaining to a particular z3fold pool.
- */
-struct z3fold_pool {
- const char *name;
- spinlock_t lock;
- spinlock_t stale_lock;
- struct list_head __percpu *unbuddied;
- struct list_head stale;
- atomic64_t pages_nr;
- struct kmem_cache *c_handle;
- struct workqueue_struct *compact_wq;
- struct workqueue_struct *release_wq;
- struct work_struct work;
-};
-
-/*
- * Internal z3fold page flags
- */
-enum z3fold_page_flags {
- PAGE_HEADLESS = 0,
- MIDDLE_CHUNK_MAPPED,
- NEEDS_COMPACTING,
- PAGE_STALE,
- PAGE_CLAIMED, /* by either reclaim or free */
- PAGE_MIGRATED, /* page is migrated and soon to be released */
-};
-
-/*
- * handle flags, go under HANDLE_FLAG_MASK
- */
-enum z3fold_handle_flags {
- HANDLES_NOFREE = 0,
-};
-
-/*
- * Forward declarations
- */
-static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool);
-static void compact_page_work(struct work_struct *w);
-
-/*****************
- * Helpers
-*****************/
-
-/* Converts an allocation size in bytes to size in z3fold chunks */
-static int size_to_chunks(size_t size)
-{
- return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
-}
-
-#define for_each_unbuddied_list(_iter, _begin) \
- for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
-
-static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
- gfp_t gfp)
-{
- struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle,
- gfp);
-
- if (slots) {
- /* It will be freed separately in free_handle(). */
- kmemleak_not_leak(slots);
- slots->pool = (unsigned long)pool;
- rwlock_init(&slots->lock);
- }
-
- return slots;
-}
-
-static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
-{
- return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
-}
-
-static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
-{
- return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
-}
-
-/* Lock a z3fold page */
-static inline void z3fold_page_lock(struct z3fold_header *zhdr)
-{
- spin_lock(&zhdr->page_lock);
-}
-
-/* Try to lock a z3fold page */
-static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
-{
- return spin_trylock(&zhdr->page_lock);
-}
-
-/* Unlock a z3fold page */
-static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
-{
- spin_unlock(&zhdr->page_lock);
-}
-
-/* return locked z3fold page if it's not headless */
-static inline struct z3fold_header *get_z3fold_header(unsigned long handle)
-{
- struct z3fold_buddy_slots *slots;
- struct z3fold_header *zhdr;
- int locked = 0;
-
- if (!(handle & (1 << PAGE_HEADLESS))) {
- slots = handle_to_slots(handle);
- do {
- unsigned long addr;
-
- read_lock(&slots->lock);
- addr = *(unsigned long *)handle;
- zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
- locked = z3fold_page_trylock(zhdr);
- read_unlock(&slots->lock);
- if (locked) {
- struct page *page = virt_to_page(zhdr);
-
- if (!test_bit(PAGE_MIGRATED, &page->private))
- break;
- z3fold_page_unlock(zhdr);
- }
- cpu_relax();
- } while (true);
- } else {
- zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
- }
-
- return zhdr;
-}
-
-static inline void put_z3fold_header(struct z3fold_header *zhdr)
-{
- struct page *page = virt_to_page(zhdr);
-
- if (!test_bit(PAGE_HEADLESS, &page->private))
- z3fold_page_unlock(zhdr);
-}
-
-static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
-{
- struct z3fold_buddy_slots *slots;
- int i;
- bool is_free;
-
- if (WARN_ON(*(unsigned long *)handle == 0))
- return;
-
- slots = handle_to_slots(handle);
- write_lock(&slots->lock);
- *(unsigned long *)handle = 0;
-
- if (test_bit(HANDLES_NOFREE, &slots->pool)) {
- write_unlock(&slots->lock);
- return; /* simple case, nothing else to do */
- }
-
- if (zhdr->slots != slots)
- zhdr->foreign_handles--;
-
- is_free = true;
- for (i = 0; i <= BUDDY_MASK; i++) {
- if (slots->slot[i]) {
- is_free = false;
- break;
- }
- }
- write_unlock(&slots->lock);
-
- if (is_free) {
- struct z3fold_pool *pool = slots_to_pool(slots);
-
- if (zhdr->slots == slots)
- zhdr->slots = NULL;
- kmem_cache_free(pool->c_handle, slots);
- }
-}
-
-/* Initializes the z3fold header of a newly allocated z3fold page */
-static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
- struct z3fold_pool *pool, gfp_t gfp)
-{
- struct z3fold_header *zhdr = page_address(page);
- struct z3fold_buddy_slots *slots;
-
- clear_bit(PAGE_HEADLESS, &page->private);
- clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
- clear_bit(NEEDS_COMPACTING, &page->private);
- clear_bit(PAGE_STALE, &page->private);
- clear_bit(PAGE_CLAIMED, &page->private);
- clear_bit(PAGE_MIGRATED, &page->private);
- if (headless)
- return zhdr;
-
- slots = alloc_slots(pool, gfp);
- if (!slots)
- return NULL;
-
- memset(zhdr, 0, sizeof(*zhdr));
- spin_lock_init(&zhdr->page_lock);
- kref_init(&zhdr->refcount);
- zhdr->cpu = -1;
- zhdr->slots = slots;
- zhdr->pool = pool;
- INIT_LIST_HEAD(&zhdr->buddy);
- INIT_WORK(&zhdr->work, compact_page_work);
- return zhdr;
-}
-
-/* Resets the struct page fields and frees the page */
-static void free_z3fold_page(struct page *page, bool headless)
-{
- if (!headless) {
- lock_page(page);
- __ClearPageMovable(page);
- unlock_page(page);
- }
- __free_page(page);
-}
-
-/* Helper function to build the index */
-static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
-{
- return (bud + zhdr->first_num) & BUDDY_MASK;
-}
-
-/*
- * Encodes the handle of a particular buddy within a z3fold page.
- * Zhdr->page_lock should be held as this function accesses first_num
- * if bud != HEADLESS.
- */
-static unsigned long __encode_handle(struct z3fold_header *zhdr,
- struct z3fold_buddy_slots *slots,
- enum buddy bud)
-{
- unsigned long h = (unsigned long)zhdr;
- int idx = 0;
-
- /*
- * For a headless page, its handle is its pointer with the extra
- * PAGE_HEADLESS bit set
- */
- if (bud == HEADLESS)
- return h | (1 << PAGE_HEADLESS);
-
- /* otherwise, return pointer to encoded handle */
- idx = __idx(zhdr, bud);
- h += idx;
- if (bud == LAST)
- h |= (zhdr->last_chunks << BUDDY_SHIFT);
-
- write_lock(&slots->lock);
- slots->slot[idx] = h;
- write_unlock(&slots->lock);
- return (unsigned long)&slots->slot[idx];
-}
-
-static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
-{
- return __encode_handle(zhdr, zhdr->slots, bud);
-}
-
-/* only for LAST bud, returns zero otherwise */
-static unsigned short handle_to_chunks(unsigned long handle)
-{
- struct z3fold_buddy_slots *slots = handle_to_slots(handle);
- unsigned long addr;
-
- read_lock(&slots->lock);
- addr = *(unsigned long *)handle;
- read_unlock(&slots->lock);
- return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
-}
-
-/*
- * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
- * but that doesn't matter, because the masking will result in the
- * correct buddy number.
- */
-static enum buddy handle_to_buddy(unsigned long handle)
-{
- struct z3fold_header *zhdr;
- struct z3fold_buddy_slots *slots = handle_to_slots(handle);
- unsigned long addr;
-
- read_lock(&slots->lock);
- WARN_ON(handle & (1 << PAGE_HEADLESS));
- addr = *(unsigned long *)handle;
- read_unlock(&slots->lock);
- zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
- return (addr - zhdr->first_num) & BUDDY_MASK;
-}
-
-static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
-{
- return zhdr->pool;
-}
-
-static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
-{
- struct page *page = virt_to_page(zhdr);
- struct z3fold_pool *pool = zhdr_to_pool(zhdr);
-
- WARN_ON(!list_empty(&zhdr->buddy));
- set_bit(PAGE_STALE, &page->private);
- clear_bit(NEEDS_COMPACTING, &page->private);
- spin_lock(&pool->lock);
- spin_unlock(&pool->lock);
-
- if (locked)
- z3fold_page_unlock(zhdr);
-
- spin_lock(&pool->stale_lock);
- list_add(&zhdr->buddy, &pool->stale);
- queue_work(pool->release_wq, &pool->work);
- spin_unlock(&pool->stale_lock);
-
- atomic64_dec(&pool->pages_nr);
-}
-
-static void release_z3fold_page_locked(struct kref *ref)
-{
- struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
- refcount);
- WARN_ON(z3fold_page_trylock(zhdr));
- __release_z3fold_page(zhdr, true);
-}
-
-static void release_z3fold_page_locked_list(struct kref *ref)
-{
- struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
- refcount);
- struct z3fold_pool *pool = zhdr_to_pool(zhdr);
-
- spin_lock(&pool->lock);
- list_del_init(&zhdr->buddy);
- spin_unlock(&pool->lock);
-
- WARN_ON(z3fold_page_trylock(zhdr));
- __release_z3fold_page(zhdr, true);
-}
-
-static inline int put_z3fold_locked(struct z3fold_header *zhdr)
-{
- return kref_put(&zhdr->refcount, release_z3fold_page_locked);
-}
-
-static inline int put_z3fold_locked_list(struct z3fold_header *zhdr)
-{
- return kref_put(&zhdr->refcount, release_z3fold_page_locked_list);
-}
-
-static void free_pages_work(struct work_struct *w)
-{
- struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
-
- spin_lock(&pool->stale_lock);
- while (!list_empty(&pool->stale)) {
- struct z3fold_header *zhdr = list_first_entry(&pool->stale,
- struct z3fold_header, buddy);
- struct page *page = virt_to_page(zhdr);
-
- list_del(&zhdr->buddy);
- if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
- continue;
- spin_unlock(&pool->stale_lock);
- cancel_work_sync(&zhdr->work);
- free_z3fold_page(page, false);
- cond_resched();
- spin_lock(&pool->stale_lock);
- }
- spin_unlock(&pool->stale_lock);
-}
-
-/*
- * Returns the number of free chunks in a z3fold page.
- * NB: can't be used with HEADLESS pages.
- */
-static int num_free_chunks(struct z3fold_header *zhdr)
-{
- int nfree;
- /*
- * If there is a middle object, pick up the bigger free space
- * either before or after it. Otherwise just subtract the number
- * of chunks occupied by the first and the last objects.
- */
- if (zhdr->middle_chunks != 0) {
- int nfree_before = zhdr->first_chunks ?
- 0 : zhdr->start_middle - ZHDR_CHUNKS;
- int nfree_after = zhdr->last_chunks ?
- 0 : TOTAL_CHUNKS -
- (zhdr->start_middle + zhdr->middle_chunks);
- nfree = max(nfree_before, nfree_after);
- } else
- nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
- return nfree;
-}
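
A quick numeric check of the rule above, with illustrative values (TOTAL_CHUNKS = 64 and ZHDR_CHUNKS = 1 assumed):

/*
 * first_chunks = 0, start_middle = 20, middle_chunks = 10, last_chunks = 5:
 *   nfree_before = 20 - 1 = 19   (gap between the header and the middle object)
 *   nfree_after  = 0             (the last buddy is in use)
 *   nfree        = max(19, 0) = 19
 * No middle object, first_chunks = 8, last_chunks = 5:
 *   nfree = NCHUNKS - 8 - 5 = 63 - 8 - 5 = 50
 */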
-
-/* Add to the appropriate unbuddied list */
-static inline void add_to_unbuddied(struct z3fold_pool *pool,
- struct z3fold_header *zhdr)
-{
- if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
- zhdr->middle_chunks == 0) {
- struct list_head *unbuddied;
- int freechunks = num_free_chunks(zhdr);
-
- migrate_disable();
- unbuddied = this_cpu_ptr(pool->unbuddied);
- spin_lock(&pool->lock);
- list_add(&zhdr->buddy, &unbuddied[freechunks]);
- spin_unlock(&pool->lock);
- zhdr->cpu = smp_processor_id();
- migrate_enable();
- }
-}
-
-static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks)
-{
- enum buddy bud = HEADLESS;
-
- if (zhdr->middle_chunks) {
- if (!zhdr->first_chunks &&
- chunks <= zhdr->start_middle - ZHDR_CHUNKS)
- bud = FIRST;
- else if (!zhdr->last_chunks)
- bud = LAST;
- } else {
- if (!zhdr->first_chunks)
- bud = FIRST;
- else if (!zhdr->last_chunks)
- bud = LAST;
- else
- bud = MIDDLE;
- }
-
- return bud;
-}
-
-static inline void *mchunk_memmove(struct z3fold_header *zhdr,
- unsigned short dst_chunk)
-{
- void *beg = zhdr;
- return memmove(beg + (dst_chunk << CHUNK_SHIFT),
- beg + (zhdr->start_middle << CHUNK_SHIFT),
- zhdr->middle_chunks << CHUNK_SHIFT);
-}
-
-static inline bool buddy_single(struct z3fold_header *zhdr)
-{
- return !((zhdr->first_chunks && zhdr->middle_chunks) ||
- (zhdr->first_chunks && zhdr->last_chunks) ||
- (zhdr->middle_chunks && zhdr->last_chunks));
-}
-
-static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
-{
- struct z3fold_pool *pool = zhdr_to_pool(zhdr);
- void *p = zhdr;
- unsigned long old_handle = 0;
- size_t sz = 0;
- struct z3fold_header *new_zhdr = NULL;
- int first_idx = __idx(zhdr, FIRST);
- int middle_idx = __idx(zhdr, MIDDLE);
- int last_idx = __idx(zhdr, LAST);
- unsigned short *moved_chunks = NULL;
-
- /*
- * No need to protect slots here -- all the slots are "local" and
- * the page lock is already taken
- */
- if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) {
- p += ZHDR_SIZE_ALIGNED;
- sz = zhdr->first_chunks << CHUNK_SHIFT;
- old_handle = (unsigned long)&zhdr->slots->slot[first_idx];
- moved_chunks = &zhdr->first_chunks;
- } else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) {
- p += zhdr->start_middle << CHUNK_SHIFT;
- sz = zhdr->middle_chunks << CHUNK_SHIFT;
- old_handle = (unsigned long)&zhdr->slots->slot[middle_idx];
- moved_chunks = &zhdr->middle_chunks;
- } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) {
- p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
- sz = zhdr->last_chunks << CHUNK_SHIFT;
- old_handle = (unsigned long)&zhdr->slots->slot[last_idx];
- moved_chunks = &zhdr->last_chunks;
- }
-
- if (sz > 0) {
- enum buddy new_bud = HEADLESS;
- short chunks = size_to_chunks(sz);
- void *q;
-
- new_zhdr = __z3fold_alloc(pool, sz, false);
- if (!new_zhdr)
- return NULL;
-
- if (WARN_ON(new_zhdr == zhdr))
- goto out_fail;
-
- new_bud = get_free_buddy(new_zhdr, chunks);
- q = new_zhdr;
- switch (new_bud) {
- case FIRST:
- new_zhdr->first_chunks = chunks;
- q += ZHDR_SIZE_ALIGNED;
- break;
- case MIDDLE:
- new_zhdr->middle_chunks = chunks;
- new_zhdr->start_middle =
- new_zhdr->first_chunks + ZHDR_CHUNKS;
- q += new_zhdr->start_middle << CHUNK_SHIFT;
- break;
- case LAST:
- new_zhdr->last_chunks = chunks;
- q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT);
- break;
- default:
- goto out_fail;
- }
- new_zhdr->foreign_handles++;
- memcpy(q, p, sz);
- write_lock(&zhdr->slots->lock);
- *(unsigned long *)old_handle = (unsigned long)new_zhdr +
- __idx(new_zhdr, new_bud);
- if (new_bud == LAST)
- *(unsigned long *)old_handle |=
- (new_zhdr->last_chunks << BUDDY_SHIFT);
- write_unlock(&zhdr->slots->lock);
- add_to_unbuddied(pool, new_zhdr);
- z3fold_page_unlock(new_zhdr);
-
- *moved_chunks = 0;
- }
-
- return new_zhdr;
-
-out_fail:
- if (new_zhdr && !put_z3fold_locked(new_zhdr)) {
- add_to_unbuddied(pool, new_zhdr);
- z3fold_page_unlock(new_zhdr);
- }
- return NULL;
-
-}
-
-#define BIG_CHUNK_GAP 3
-/* Has to be called with lock held */
-static int z3fold_compact_page(struct z3fold_header *zhdr)
-{
- struct page *page = virt_to_page(zhdr);
-
- if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
- return 0; /* can't move middle chunk, it's used */
-
- if (unlikely(PageIsolated(page)))
- return 0;
-
- if (zhdr->middle_chunks == 0)
- return 0; /* nothing to compact */
-
- if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
- /* move to the beginning */
- mchunk_memmove(zhdr, ZHDR_CHUNKS);
- zhdr->first_chunks = zhdr->middle_chunks;
- zhdr->middle_chunks = 0;
- zhdr->start_middle = 0;
- zhdr->first_num++;
- return 1;
- }
-
- /*
- * moving data is expensive, so let's only do that if
- * there's substantial gain (at least BIG_CHUNK_GAP chunks)
- */
- if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
- zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
- BIG_CHUNK_GAP) {
- mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS);
- zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
- return 1;
- } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
- TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle
- + zhdr->middle_chunks) >=
- BIG_CHUNK_GAP) {
- unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks -
- zhdr->middle_chunks;
- mchunk_memmove(zhdr, new_start);
- zhdr->start_middle = new_start;
- return 1;
- }
-
- return 0;
-}
-
-static void do_compact_page(struct z3fold_header *zhdr, bool locked)
-{
- struct z3fold_pool *pool = zhdr_to_pool(zhdr);
- struct page *page;
-
- page = virt_to_page(zhdr);
- if (locked)
- WARN_ON(z3fold_page_trylock(zhdr));
- else
- z3fold_page_lock(zhdr);
- if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
- z3fold_page_unlock(zhdr);
- return;
- }
- spin_lock(&pool->lock);
- list_del_init(&zhdr->buddy);
- spin_unlock(&pool->lock);
-
- if (put_z3fold_locked(zhdr))
- return;
-
- if (test_bit(PAGE_STALE, &page->private) ||
- test_and_set_bit(PAGE_CLAIMED, &page->private)) {
- z3fold_page_unlock(zhdr);
- return;
- }
-
- if (!zhdr->foreign_handles && buddy_single(zhdr) &&
- zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
- if (!put_z3fold_locked(zhdr)) {
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
- }
- return;
- }
-
- z3fold_compact_page(zhdr);
- add_to_unbuddied(pool, zhdr);
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
-}
-
-static void compact_page_work(struct work_struct *w)
-{
- struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
- work);
-
- do_compact_page(zhdr, false);
-}
-
-/* returns _locked_ z3fold page header or NULL */
-static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
- size_t size, bool can_sleep)
-{
- struct z3fold_header *zhdr = NULL;
- struct page *page;
- struct list_head *unbuddied;
- int chunks = size_to_chunks(size), i;
-
-lookup:
- migrate_disable();
- /* First, try to find an unbuddied z3fold page. */
- unbuddied = this_cpu_ptr(pool->unbuddied);
- for_each_unbuddied_list(i, chunks) {
- struct list_head *l = &unbuddied[i];
-
- zhdr = list_first_entry_or_null(READ_ONCE(l),
- struct z3fold_header, buddy);
-
- if (!zhdr)
- continue;
-
- /* Re-check under lock. */
- spin_lock(&pool->lock);
- if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
- struct z3fold_header, buddy)) ||
- !z3fold_page_trylock(zhdr)) {
- spin_unlock(&pool->lock);
- zhdr = NULL;
- migrate_enable();
- if (can_sleep)
- cond_resched();
- goto lookup;
- }
- list_del_init(&zhdr->buddy);
- zhdr->cpu = -1;
- spin_unlock(&pool->lock);
-
- page = virt_to_page(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private) ||
- test_bit(PAGE_CLAIMED, &page->private)) {
- z3fold_page_unlock(zhdr);
- zhdr = NULL;
- migrate_enable();
- if (can_sleep)
- cond_resched();
- goto lookup;
- }
-
- /*
- * this page could not be removed from its unbuddied
- * list while the pool lock was held, and we have since
- * taken the page lock, so kref_put() could not have been
- * called before we got here; it's safe to just call
- * kref_get()
- */
- kref_get(&zhdr->refcount);
- break;
- }
- migrate_enable();
-
- if (!zhdr) {
- int cpu;
-
- /* look for _exact_ match on other cpus' lists */
- for_each_online_cpu(cpu) {
- struct list_head *l;
-
- unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
- spin_lock(&pool->lock);
- l = &unbuddied[chunks];
-
- zhdr = list_first_entry_or_null(READ_ONCE(l),
- struct z3fold_header, buddy);
-
- if (!zhdr || !z3fold_page_trylock(zhdr)) {
- spin_unlock(&pool->lock);
- zhdr = NULL;
- continue;
- }
- list_del_init(&zhdr->buddy);
- zhdr->cpu = -1;
- spin_unlock(&pool->lock);
-
- page = virt_to_page(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private) ||
- test_bit(PAGE_CLAIMED, &page->private)) {
- z3fold_page_unlock(zhdr);
- zhdr = NULL;
- if (can_sleep)
- cond_resched();
- continue;
- }
- kref_get(&zhdr->refcount);
- break;
- }
- }
-
- if (zhdr && !zhdr->slots) {
- zhdr->slots = alloc_slots(pool, GFP_ATOMIC);
- if (!zhdr->slots)
- goto out_fail;
- }
- return zhdr;
-
-out_fail:
- if (!put_z3fold_locked(zhdr)) {
- add_to_unbuddied(pool, zhdr);
- z3fold_page_unlock(zhdr);
- }
- return NULL;
-}
-
-/*
- * API Functions
- */
-
-/**
- * z3fold_create_pool() - create a new z3fold pool
- * @name: pool name
- * @gfp: gfp flags when allocating the z3fold pool structure
- *
- * Return: pointer to the new z3fold pool or NULL if the metadata allocation
- * failed.
- */
-static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp)
-{
- struct z3fold_pool *pool = NULL;
- int i, cpu;
-
- pool = kzalloc(sizeof(struct z3fold_pool), gfp);
- if (!pool)
- goto out;
- pool->c_handle = kmem_cache_create("z3fold_handle",
- sizeof(struct z3fold_buddy_slots),
- SLOTS_ALIGN, 0, NULL);
- if (!pool->c_handle)
- goto out_c;
- spin_lock_init(&pool->lock);
- spin_lock_init(&pool->stale_lock);
- pool->unbuddied = __alloc_percpu(sizeof(struct list_head) * NCHUNKS,
- __alignof__(struct list_head));
- if (!pool->unbuddied)
- goto out_pool;
- for_each_possible_cpu(cpu) {
- struct list_head *unbuddied =
- per_cpu_ptr(pool->unbuddied, cpu);
- for_each_unbuddied_list(i, 0)
- INIT_LIST_HEAD(&unbuddied[i]);
- }
- INIT_LIST_HEAD(&pool->stale);
- atomic64_set(&pool->pages_nr, 0);
- pool->name = name;
- pool->compact_wq = create_singlethread_workqueue(pool->name);
- if (!pool->compact_wq)
- goto out_unbuddied;
- pool->release_wq = create_singlethread_workqueue(pool->name);
- if (!pool->release_wq)
- goto out_wq;
- INIT_WORK(&pool->work, free_pages_work);
- return pool;
-
-out_wq:
- destroy_workqueue(pool->compact_wq);
-out_unbuddied:
- free_percpu(pool->unbuddied);
-out_pool:
- kmem_cache_destroy(pool->c_handle);
-out_c:
- kfree(pool);
-out:
- return NULL;
-}
-
-/**
- * z3fold_destroy_pool() - destroys an existing z3fold pool
- * @pool: the z3fold pool to be destroyed
- *
- * The pool should be emptied before this function is called.
- */
-static void z3fold_destroy_pool(struct z3fold_pool *pool)
-{
- kmem_cache_destroy(pool->c_handle);
-
- /*
- * We need to destroy pool->compact_wq before pool->release_wq,
- * as any pending work on pool->compact_wq will call
- * queue_work(pool->release_wq, &pool->work).
- *
- * There are still outstanding pages until both workqueues are drained,
- * so we cannot unregister migration until then.
- */
-
- destroy_workqueue(pool->compact_wq);
- destroy_workqueue(pool->release_wq);
- free_percpu(pool->unbuddied);
- kfree(pool);
-}
-
-static const struct movable_operations z3fold_mops;
-
-/**
- * z3fold_alloc() - allocates a region of a given size
- * @pool: z3fold pool from which to allocate
- * @size: size in bytes of the desired allocation
- * @gfp: gfp flags used if the pool needs to grow
- * @handle: handle of the new allocation
- *
- * This function will attempt to find a free region in the pool large enough to
- * satisfy the allocation request. A search of the unbuddied lists is
- * performed first. If no suitable free region is found, then a new page is
- * allocated and added to the pool to satisfy the request.
- *
- * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
- * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
- * a new page.
- */
-static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- int chunks = size_to_chunks(size);
- struct z3fold_header *zhdr = NULL;
- struct page *page = NULL;
- enum buddy bud;
- bool can_sleep = gfpflags_allow_blocking(gfp);
-
- if (!size || (gfp & __GFP_HIGHMEM))
- return -EINVAL;
-
- if (size > PAGE_SIZE)
- return -ENOSPC;
-
- if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
- bud = HEADLESS;
- else {
-retry:
- zhdr = __z3fold_alloc(pool, size, can_sleep);
- if (zhdr) {
- bud = get_free_buddy(zhdr, chunks);
- if (bud == HEADLESS) {
- if (!put_z3fold_locked(zhdr))
- z3fold_page_unlock(zhdr);
- pr_err("No free chunks in unbuddied\n");
- WARN_ON(1);
- goto retry;
- }
- page = virt_to_page(zhdr);
- goto found;
- }
- bud = FIRST;
- }
-
- page = alloc_page(gfp);
- if (!page)
- return -ENOMEM;
-
- zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
- if (!zhdr) {
- __free_page(page);
- return -ENOMEM;
- }
- atomic64_inc(&pool->pages_nr);
-
- if (bud == HEADLESS) {
- set_bit(PAGE_HEADLESS, &page->private);
- goto headless;
- }
- if (can_sleep) {
- lock_page(page);
- __SetPageMovable(page, &z3fold_mops);
- unlock_page(page);
- } else {
- WARN_ON(!trylock_page(page));
- __SetPageMovable(page, &z3fold_mops);
- unlock_page(page);
- }
- z3fold_page_lock(zhdr);
-
-found:
- if (bud == FIRST)
- zhdr->first_chunks = chunks;
- else if (bud == LAST)
- zhdr->last_chunks = chunks;
- else {
- zhdr->middle_chunks = chunks;
- zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
- }
- add_to_unbuddied(pool, zhdr);
-
-headless:
- spin_lock(&pool->lock);
- *handle = encode_handle(zhdr, bud);
- spin_unlock(&pool->lock);
- if (bud != HEADLESS)
- z3fold_page_unlock(zhdr);
-
- return 0;
-}
-
-/**
- * z3fold_free() - frees the allocation associated with the given handle
- * @pool: pool in which the allocation resided
- * @handle: handle associated with the allocation returned by z3fold_alloc()
- *
- * In the case that the z3fold page in which the allocation resides is under
- * reclaim, as indicated by the PAGE_CLAIMED flag being set, this function
- * only sets the first|middle|last_chunks to 0. The page is actually freed
- * once all buddies are evicted (see z3fold_reclaim_page() below).
- */
-static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
-{
- struct z3fold_header *zhdr;
- struct page *page;
- enum buddy bud;
- bool page_claimed;
-
- zhdr = get_z3fold_header(handle);
- page = virt_to_page(zhdr);
- page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
-
- if (test_bit(PAGE_HEADLESS, &page->private)) {
- /* if a headless page is under reclaim, just leave.
- * NB: we use test_and_set_bit for a reason: if the bit
- * has not been set before, we release this page
- * immediately so we don't care about its value any more.
- */
- if (!page_claimed) {
- put_z3fold_header(zhdr);
- free_z3fold_page(page, true);
- atomic64_dec(&pool->pages_nr);
- }
- return;
- }
-
- /* Non-headless case */
- bud = handle_to_buddy(handle);
-
- switch (bud) {
- case FIRST:
- zhdr->first_chunks = 0;
- break;
- case MIDDLE:
- zhdr->middle_chunks = 0;
- break;
- case LAST:
- zhdr->last_chunks = 0;
- break;
- default:
- pr_err("%s: unknown bud %d\n", __func__, bud);
- WARN_ON(1);
- put_z3fold_header(zhdr);
- return;
- }
-
- if (!page_claimed)
- free_handle(handle, zhdr);
- if (put_z3fold_locked_list(zhdr))
- return;
- if (page_claimed) {
- /* the page has not been claimed by us */
- put_z3fold_header(zhdr);
- return;
- }
- if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
- clear_bit(PAGE_CLAIMED, &page->private);
- put_z3fold_header(zhdr);
- return;
- }
- if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
- zhdr->cpu = -1;
- kref_get(&zhdr->refcount);
- clear_bit(PAGE_CLAIMED, &page->private);
- do_compact_page(zhdr, true);
- return;
- }
- kref_get(&zhdr->refcount);
- clear_bit(PAGE_CLAIMED, &page->private);
- queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
- put_z3fold_header(zhdr);
-}
-
-/**
- * z3fold_map() - maps the allocation associated with the given handle
- * @pool: pool in which the allocation resides
- * @handle: handle associated with the allocation to be mapped
- *
- * Extracts the buddy number from handle and constructs the pointer to the
- * correct starting chunk within the page.
- *
- * Returns: a pointer to the mapped allocation
- */
-static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
-{
- struct z3fold_header *zhdr;
- struct page *page;
- void *addr;
- enum buddy buddy;
-
- zhdr = get_z3fold_header(handle);
- addr = zhdr;
- page = virt_to_page(zhdr);
-
- if (test_bit(PAGE_HEADLESS, &page->private))
- goto out;
-
- buddy = handle_to_buddy(handle);
- switch (buddy) {
- case FIRST:
- addr += ZHDR_SIZE_ALIGNED;
- break;
- case MIDDLE:
- addr += zhdr->start_middle << CHUNK_SHIFT;
- set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
- break;
- case LAST:
- addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT);
- break;
- default:
- pr_err("unknown buddy id %d\n", buddy);
- WARN_ON(1);
- addr = NULL;
- break;
- }
-
- if (addr)
- zhdr->mapped_count++;
-out:
- put_z3fold_header(zhdr);
- return addr;
-}
-
-/**
- * z3fold_unmap() - unmaps the allocation associated with the given handle
- * @pool: pool in which the allocation resides
- * @handle: handle associated with the allocation to be unmapped
- */
-static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
-{
- struct z3fold_header *zhdr;
- struct page *page;
- enum buddy buddy;
-
- zhdr = get_z3fold_header(handle);
- page = virt_to_page(zhdr);
-
- if (test_bit(PAGE_HEADLESS, &page->private))
- return;
-
- buddy = handle_to_buddy(handle);
- if (buddy == MIDDLE)
- clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
- zhdr->mapped_count--;
- put_z3fold_header(zhdr);
-}
-
-/**
- * z3fold_get_pool_pages() - gets the z3fold pool size in pages
- * @pool: pool whose size is being queried
- *
- * Returns: size in pages of the given pool.
- */
-static u64 z3fold_get_pool_pages(struct z3fold_pool *pool)
-{
- return atomic64_read(&pool->pages_nr);
-}
-
-static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
-{
- struct z3fold_header *zhdr;
- struct z3fold_pool *pool;
-
- VM_BUG_ON_PAGE(PageIsolated(page), page);
-
- if (test_bit(PAGE_HEADLESS, &page->private))
- return false;
-
- zhdr = page_address(page);
- z3fold_page_lock(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private) ||
- test_bit(PAGE_STALE, &page->private))
- goto out;
-
- if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
- goto out;
-
- if (test_and_set_bit(PAGE_CLAIMED, &page->private))
- goto out;
- pool = zhdr_to_pool(zhdr);
- spin_lock(&pool->lock);
- if (!list_empty(&zhdr->buddy))
- list_del_init(&zhdr->buddy);
- spin_unlock(&pool->lock);
-
- kref_get(&zhdr->refcount);
- z3fold_page_unlock(zhdr);
- return true;
-
-out:
- z3fold_page_unlock(zhdr);
- return false;
-}
-
-static int z3fold_page_migrate(struct page *newpage, struct page *page,
- enum migrate_mode mode)
-{
- struct z3fold_header *zhdr, *new_zhdr;
- struct z3fold_pool *pool;
-
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
- VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
- VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-
- zhdr = page_address(page);
- pool = zhdr_to_pool(zhdr);
-
- if (!z3fold_page_trylock(zhdr))
- return -EAGAIN;
- if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
- return -EBUSY;
- }
- if (work_pending(&zhdr->work)) {
- z3fold_page_unlock(zhdr);
- return -EAGAIN;
- }
- new_zhdr = page_address(newpage);
- memcpy(new_zhdr, zhdr, PAGE_SIZE);
- newpage->private = page->private;
- set_bit(PAGE_MIGRATED, &page->private);
- z3fold_page_unlock(zhdr);
- spin_lock_init(&new_zhdr->page_lock);
- INIT_WORK(&new_zhdr->work, compact_page_work);
- /*
- * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
- * so we only have to reinitialize it.
- */
- INIT_LIST_HEAD(&new_zhdr->buddy);
- __ClearPageMovable(page);
-
- get_page(newpage);
- z3fold_page_lock(new_zhdr);
- if (new_zhdr->first_chunks)
- encode_handle(new_zhdr, FIRST);
- if (new_zhdr->last_chunks)
- encode_handle(new_zhdr, LAST);
- if (new_zhdr->middle_chunks)
- encode_handle(new_zhdr, MIDDLE);
- set_bit(NEEDS_COMPACTING, &newpage->private);
- new_zhdr->cpu = smp_processor_id();
- __SetPageMovable(newpage, &z3fold_mops);
- z3fold_page_unlock(new_zhdr);
-
- queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
-
- /* PAGE_CLAIMED and PAGE_MIGRATED are cleared now. */
- page->private = 0;
- put_page(page);
- return 0;
-}
-
-static void z3fold_page_putback(struct page *page)
-{
- struct z3fold_header *zhdr;
- struct z3fold_pool *pool;
-
- zhdr = page_address(page);
- pool = zhdr_to_pool(zhdr);
-
- z3fold_page_lock(zhdr);
- if (!list_empty(&zhdr->buddy))
- list_del_init(&zhdr->buddy);
- INIT_LIST_HEAD(&page->lru);
- if (put_z3fold_locked(zhdr))
- return;
- if (list_empty(&zhdr->buddy))
- add_to_unbuddied(pool, zhdr);
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
-}
-
-static const struct movable_operations z3fold_mops = {
- .isolate_page = z3fold_page_isolate,
- .migrate_page = z3fold_page_migrate,
- .putback_page = z3fold_page_putback,
-};
-
-/*****************
- * zpool
- ****************/
-
-static void *z3fold_zpool_create(const char *name, gfp_t gfp)
-{
- return z3fold_create_pool(name, gfp);
-}
-
-static void z3fold_zpool_destroy(void *pool)
-{
- z3fold_destroy_pool(pool);
-}
-
-static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- return z3fold_alloc(pool, size, gfp, handle);
-}
-static void z3fold_zpool_free(void *pool, unsigned long handle)
-{
- z3fold_free(pool, handle);
-}
-
-static void *z3fold_zpool_map(void *pool, unsigned long handle,
- enum zpool_mapmode mm)
-{
- return z3fold_map(pool, handle);
-}
-static void z3fold_zpool_unmap(void *pool, unsigned long handle)
-{
- z3fold_unmap(pool, handle);
-}
-
-static u64 z3fold_zpool_total_pages(void *pool)
-{
- return z3fold_get_pool_pages(pool);
-}
-
-static struct zpool_driver z3fold_zpool_driver = {
- .type = "z3fold",
- .sleep_mapped = true,
- .owner = THIS_MODULE,
- .create = z3fold_zpool_create,
- .destroy = z3fold_zpool_destroy,
- .malloc = z3fold_zpool_malloc,
- .free = z3fold_zpool_free,
- .map = z3fold_zpool_map,
- .unmap = z3fold_zpool_unmap,
- .total_pages = z3fold_zpool_total_pages,
-};
-
-MODULE_ALIAS("zpool-z3fold");
-
-static int __init init_z3fold(void)
-{
- /*
- * Make sure the z3fold header is not larger than the page size and
- * that space remains for its buddies.
- */
- BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE);
- zpool_register_driver(&z3fold_zpool_driver);
-
- return 0;
-}
-
-static void __exit exit_z3fold(void)
-{
- zpool_unregister_driver(&z3fold_zpool_driver);
-}
-
-module_init(init_z3fold);
-module_exit(exit_z3fold);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
-MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");
diff --git a/mm/zbud.c b/mm/zbud.c
deleted file mode 100644
index e9836fff9438..000000000000
--- a/mm/zbud.c
+++ /dev/null
@@ -1,455 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * zbud.c
- *
- * Copyright (C) 2013, Seth Jennings, IBM
- *
- * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
- *
- * zbud is a special-purpose allocator for storing compressed pages. Contrary
- * to what its name may suggest, zbud is not a buddy allocator, but rather an
- * allocator that "buddies" two compressed pages together in a single memory
- * page.
- *
- * While this design limits storage density, it has simple and deterministic
- * reclaim properties that make it preferable to a higher density approach when
- * reclaim will be used.
- *
- * zbud works by storing compressed pages, or "zpages", together in pairs in a
- * single memory page called a "zbud page". The first buddy is "left
- * justified" at the beginning of the zbud page, and the last buddy is "right
- * justified" at the end of the zbud page. The benefit is that if either
- * buddy is freed, the freed buddy space, coalesced with whatever slack space
- * existed between the buddies, results in the largest possible free region
- * within the zbud page.
- *
- * zbud also provides an attractive lower bound on density. The ratio of zpages
- * to zbud pages can not be less than 1. This ensures that zbud can never "do
- * harm" by using more pages to store zpages than the uncompressed zpages would
- * have used on their own.
- *
- * zbud pages are divided into "chunks". The size of the chunks is fixed at
- * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages
- * into chunks allows organizing unbuddied zbud pages into a manageable number
- * of unbuddied lists according to the number of free chunks available in the
- * zbud page.
- *
- * The zbud API differs from that of conventional allocators in that the
- * allocation function, zbud_alloc(), returns an opaque handle to the user,
- * not a dereferenceable pointer. The user must map the handle using
- * zbud_map() in order to get a usable pointer by which to access the
- * allocation data and unmap the handle with zbud_unmap() when operations
- * on the allocation data are complete.
- */
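
The handle protocol described above boils down to alloc, map, use, unmap, free. A hedged sketch of a caller follows, using the file's own (static, zpool-facing) functions purely for illustration; the helper name zbud_store() is hypothetical:

/* Illustration only: in-tree these helpers are static and reached via zpool. */
static int zbud_store(struct zbud_pool *pool, const void *src, size_t len,
		      unsigned long *handle)
{
	void *buf;
	int ret;

	ret = zbud_alloc(pool, len, GFP_KERNEL, handle);	/* 0 on success */
	if (ret)
		return ret;

	buf = zbud_map(pool, *handle);		/* opaque handle -> usable pointer */
	memcpy(buf, src, len);			/* work on the mapped data */
	zbud_unmap(pool, *handle);		/* pointer must not be used past here */

	return 0;				/* later: zbud_free(pool, *handle) */
}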
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/atomic.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/preempt.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/zpool.h>
-
-/*****************
- * Structures
-*****************/
-/*
- * NCHUNKS_ORDER determines the internal allocation granularity, effectively
- * adjusting internal fragmentation. It also determines the number of
- * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
- * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk
- * in each allocated page is occupied by the zbud header, NCHUNKS works out to
- * 63, which is the maximum number of free chunks in a zbud page; there are
- * correspondingly 63 freelists per pool.
- */
-#define NCHUNKS_ORDER 6
-
-#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
-#define CHUNK_SIZE (1 << CHUNK_SHIFT)
-#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
-#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
-
-struct zbud_pool;
-
-/**
- * struct zbud_pool - stores metadata for each zbud pool
- * @lock: protects all pool fields and first|last_chunk fields of any
- * zbud page in the pool
- * @unbuddied: array of lists tracking zbud pages that only contain one buddy;
- * the lists each zbud page is added to depends on the size of
- * its free region.
- * @buddied: list tracking the zbud pages that contain two buddies;
- * these zbud pages are full
- * @pages_nr: number of zbud pages in the pool.
- *
- * This structure is allocated at pool creation time and maintains metadata
- * pertaining to a particular zbud pool.
- */
-struct zbud_pool {
- spinlock_t lock;
- union {
- /*
- * Reuse unbuddied[0] as buddied on the grounds that
- * unbuddied[0] is unused.
- */
- struct list_head buddied;
- struct list_head unbuddied[NCHUNKS];
- };
- u64 pages_nr;
-};
-
-/*
- * struct zbud_header - zbud page metadata occupying the first chunk of each
- * zbud page.
- * @buddy: links the zbud page into the unbuddied/buddied lists in the pool
- * @first_chunks: the size of the first buddy in chunks, 0 if free
- * @last_chunks: the size of the last buddy in chunks, 0 if free
- */
-struct zbud_header {
- struct list_head buddy;
- unsigned int first_chunks;
- unsigned int last_chunks;
-};
-
-/*****************
- * Helpers
-*****************/
-/* Just to make the code easier to read */
-enum buddy {
- FIRST,
- LAST
-};
-
-/* Converts an allocation size in bytes to size in zbud chunks */
-static int size_to_chunks(size_t size)
-{
- return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
-}
-
-#define for_each_unbuddied_list(_iter, _begin) \
- for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
-
-/* Initializes the zbud header of a newly allocated zbud page */
-static struct zbud_header *init_zbud_page(struct page *page)
-{
- struct zbud_header *zhdr = page_address(page);
- zhdr->first_chunks = 0;
- zhdr->last_chunks = 0;
- INIT_LIST_HEAD(&zhdr->buddy);
- return zhdr;
-}
-
-/* Resets the struct page fields and frees the page */
-static void free_zbud_page(struct zbud_header *zhdr)
-{
- __free_page(virt_to_page(zhdr));
-}
-
-/*
- * Encodes the handle of a particular buddy within a zbud page
- * Pool lock should be held as this function accesses first|last_chunks
- */
-static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
-{
- unsigned long handle;
-
- /*
- * For now, the encoded handle is actually just the pointer to the data
- * but this might not always be the case. A little information hiding.
- * Add CHUNK_SIZE to the handle if it is the first allocation to jump
- * over the zbud header in the first chunk.
- */
- handle = (unsigned long)zhdr;
- if (bud == FIRST)
- /* skip over zbud header */
- handle += ZHDR_SIZE_ALIGNED;
- else /* bud == LAST */
- handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
- return handle;
-}
-
-/* Returns the zbud page where a given handle is stored */
-static struct zbud_header *handle_to_zbud_header(unsigned long handle)
-{
- return (struct zbud_header *)(handle & PAGE_MASK);
-}
-
-/* Returns the number of free chunks in a zbud page */
-static int num_free_chunks(struct zbud_header *zhdr)
-{
- /*
- * Rather than branch for different situations, just use the fact that
- * free buddies have a length of zero to simplify everything.
- */
- return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
-}
-
-/*****************
- * API Functions
-*****************/
-/**
- * zbud_create_pool() - create a new zbud pool
- * @gfp: gfp flags when allocating the zbud pool structure
- *
- * Return: pointer to the new zbud pool or NULL if the metadata allocation
- * failed.
- */
-static struct zbud_pool *zbud_create_pool(gfp_t gfp)
-{
- struct zbud_pool *pool;
- int i;
-
- pool = kzalloc(sizeof(struct zbud_pool), gfp);
- if (!pool)
- return NULL;
- spin_lock_init(&pool->lock);
- for_each_unbuddied_list(i, 0)
- INIT_LIST_HEAD(&pool->unbuddied[i]);
- INIT_LIST_HEAD(&pool->buddied);
- pool->pages_nr = 0;
- return pool;
-}
-
-/**
- * zbud_destroy_pool() - destroys an existing zbud pool
- * @pool: the zbud pool to be destroyed
- *
- * The pool should be emptied before this function is called.
- */
-static void zbud_destroy_pool(struct zbud_pool *pool)
-{
- kfree(pool);
-}
-
-/**
- * zbud_alloc() - allocates a region of a given size
- * @pool: zbud pool from which to allocate
- * @size: size in bytes of the desired allocation
- * @gfp: gfp flags used if the pool needs to grow
- * @handle: handle of the new allocation
- *
- * This function will attempt to find a free region in the pool large enough to
- * satisfy the allocation request. A search of the unbuddied lists is
- * performed first. If no suitable free region is found, then a new page is
- * allocated and added to the pool to satisfy the request.
- *
- * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
- * as zbud pool pages.
- *
- * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
- * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
- * a new page.
- */
-static int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- int chunks, i, freechunks;
- struct zbud_header *zhdr = NULL;
- enum buddy bud;
- struct page *page;
-
- if (!size || (gfp & __GFP_HIGHMEM))
- return -EINVAL;
- if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
- return -ENOSPC;
- chunks = size_to_chunks(size);
- spin_lock(&pool->lock);
-
- /* First, try to find an unbuddied zbud page. */
- for_each_unbuddied_list(i, chunks) {
- if (!list_empty(&pool->unbuddied[i])) {
- zhdr = list_first_entry(&pool->unbuddied[i],
- struct zbud_header, buddy);
- list_del(&zhdr->buddy);
- if (zhdr->first_chunks == 0)
- bud = FIRST;
- else
- bud = LAST;
- goto found;
- }
- }
-
- /* Couldn't find unbuddied zbud page, create new one */
- spin_unlock(&pool->lock);
- page = alloc_page(gfp);
- if (!page)
- return -ENOMEM;
- spin_lock(&pool->lock);
- pool->pages_nr++;
- zhdr = init_zbud_page(page);
- bud = FIRST;
-
-found:
- if (bud == FIRST)
- zhdr->first_chunks = chunks;
- else
- zhdr->last_chunks = chunks;
-
- if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) {
- /* Add to unbuddied list */
- freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
- } else {
- /* Add to buddied list */
- list_add(&zhdr->buddy, &pool->buddied);
- }
-
- *handle = encode_handle(zhdr, bud);
- spin_unlock(&pool->lock);
-
- return 0;
-}
-
-/**
- * zbud_free() - frees the allocation associated with the given handle
- * @pool: pool in which the allocation resided
- * @handle: handle associated with the allocation returned by zbud_alloc()
- */
-static void zbud_free(struct zbud_pool *pool, unsigned long handle)
-{
- struct zbud_header *zhdr;
- int freechunks;
-
- spin_lock(&pool->lock);
- zhdr = handle_to_zbud_header(handle);
-
- /* If first buddy, handle will be page aligned */
- if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
- zhdr->last_chunks = 0;
- else
- zhdr->first_chunks = 0;
-
- /* Remove from existing buddy list */
- list_del(&zhdr->buddy);
-
- if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
- /* zbud page is empty, free */
- free_zbud_page(zhdr);
- pool->pages_nr--;
- } else {
- /* Add to unbuddied list */
- freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
- }
-
- spin_unlock(&pool->lock);
-}
-
-/**
- * zbud_map() - maps the allocation associated with the given handle
- * @pool: pool in which the allocation resides
- * @handle: handle associated with the allocation to be mapped
- *
- * While trivial for zbud, the mapping functions for other allocators
- * implementing this allocation API could have more complex information encoded
- * in the handle and could create temporary mappings to make the data
- * accessible to the user.
- *
- * Returns: a pointer to the mapped allocation
- */
-static void *zbud_map(struct zbud_pool *pool, unsigned long handle)
-{
- return (void *)(handle);
-}
-
-/**
- * zbud_unmap() - unmaps the allocation associated with the given handle
- * @pool: pool in which the allocation resides
- * @handle: handle associated with the allocation to be unmapped
- */
-static void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
-{
-}
-
-/**
- * zbud_get_pool_pages() - gets the zbud pool size in pages
- * @pool: pool whose size is being queried
- *
- * Returns: size in pages of the given pool. The pool lock need not be
- * taken to access pages_nr.
- */
-static u64 zbud_get_pool_pages(struct zbud_pool *pool)
-{
- return pool->pages_nr;
-}
-
-/*****************
- * zpool
- ****************/
-
-static void *zbud_zpool_create(const char *name, gfp_t gfp)
-{
- return zbud_create_pool(gfp);
-}
-
-static void zbud_zpool_destroy(void *pool)
-{
- zbud_destroy_pool(pool);
-}
-
-static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- return zbud_alloc(pool, size, gfp, handle);
-}
-static void zbud_zpool_free(void *pool, unsigned long handle)
-{
- zbud_free(pool, handle);
-}
-
-static void *zbud_zpool_map(void *pool, unsigned long handle,
- enum zpool_mapmode mm)
-{
- return zbud_map(pool, handle);
-}
-static void zbud_zpool_unmap(void *pool, unsigned long handle)
-{
- zbud_unmap(pool, handle);
-}
-
-static u64 zbud_zpool_total_pages(void *pool)
-{
- return zbud_get_pool_pages(pool);
-}
-
-static struct zpool_driver zbud_zpool_driver = {
- .type = "zbud",
- .sleep_mapped = true,
- .owner = THIS_MODULE,
- .create = zbud_zpool_create,
- .destroy = zbud_zpool_destroy,
- .malloc = zbud_zpool_malloc,
- .free = zbud_zpool_free,
- .map = zbud_zpool_map,
- .unmap = zbud_zpool_unmap,
- .total_pages = zbud_zpool_total_pages,
-};
-
-MODULE_ALIAS("zpool-zbud");
-
-static int __init init_zbud(void)
-{
- /* Make sure the zbud header will fit in one chunk */
- BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
- pr_info("loaded\n");
-
- zpool_register_driver(&zbud_zpool_driver);
-
- return 0;
-}
-
-static void __exit exit_zbud(void)
-{
- zpool_unregister_driver(&zbud_zpool_driver);
- pr_info("unloaded\n");
-}
-
-module_init(init_zbud);
-module_exit(exit_zbud);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
-MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");
diff --git a/mm/zpdesc.h b/mm/zpdesc.h
new file mode 100644
index 000000000000..d3df316e5bb7
--- /dev/null
+++ b/mm/zpdesc.h
@@ -0,0 +1,185 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* zpdesc.h: zswap.zpool memory descriptor
+ *
+ * Written by Alex Shi <alexs@kernel.org>
+ * Hyeonggon Yoo <42.hyeyoo@gmail.com>
+ */
+#ifndef __MM_ZPDESC_H__
+#define __MM_ZPDESC_H__
+
+#include <linux/migrate.h>
+#include <linux/pagemap.h>
+
+/*
+ * struct zpdesc - Memory descriptor for zpool memory.
+ * @flags: Page flags, mostly unused by zsmalloc.
+ * @lru: Indirectly used by page migration.
+ * @movable_ops: Used by page migration.
+ * @next: Next zpdesc in a zspage in zsmalloc zpool.
+ * @handle: For huge zspage in zsmalloc zpool.
+ * @zspage: Points to the zspage this zpdesc is a part of.
+ * @first_obj_offset: First object offset in zsmalloc zpool.
+ * @_refcount: The number of references to this zpdesc.
+ *
+ * This struct overlays struct page for now. Do not modify without a good
+ * understanding of the issues. In particular, do not expand into the overlap
+ * with memcg_data.
+ *
+ * Page flags used:
+ * * PG_private identifies the first component page.
+ * * PG_locked is used by page migration code.
+ */
+struct zpdesc {
+ unsigned long flags;
+ struct list_head lru;
+ unsigned long movable_ops;
+ union {
+ struct zpdesc *next;
+ unsigned long handle;
+ };
+ struct zspage *zspage;
+ /*
+ * Only the lower 24 bits are available for offset, limiting a page
+ * to 16 MiB. The upper 8 bits are reserved for PGTY_zsmalloc.
+ *
+ * Do not access this field directly.
+ * Instead, use {get,set}_first_obj_offset() helpers.
+ */
+ unsigned int first_obj_offset;
+ atomic_t _refcount;
+};
+#define ZPDESC_MATCH(pg, zp) \
+ static_assert(offsetof(struct page, pg) == offsetof(struct zpdesc, zp))
+
+ZPDESC_MATCH(flags, flags);
+ZPDESC_MATCH(lru, lru);
+ZPDESC_MATCH(mapping, movable_ops);
+ZPDESC_MATCH(__folio_index, next);
+ZPDESC_MATCH(__folio_index, handle);
+ZPDESC_MATCH(private, zspage);
+ZPDESC_MATCH(page_type, first_obj_offset);
+ZPDESC_MATCH(_refcount, _refcount);
+#undef ZPDESC_MATCH
+static_assert(sizeof(struct zpdesc) <= sizeof(struct page));
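
Each ZPDESC_MATCH() line above pins the overlay: the zpdesc field must sit at exactly the same offset as the struct page field it reuses, and the final sizeof() assertion keeps the whole descriptor within struct page. For instance, the first assertion expands roughly to the following (shown for illustration, not as additional code):

/* ZPDESC_MATCH(flags, flags) becomes approximately: */
static_assert(offsetof(struct page, flags) == offsetof(struct zpdesc, flags));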
+
+/**
+ * zpdesc_page - The first struct page allocated for a zpdesc
+ * @zp: The zpdesc.
+ *
+ * A convenience wrapper for converting zpdesc to the first struct page of the
+ * underlying folio, to communicate with code not yet converted to folio or
+ * struct zpdesc.
+ *
+ */
+#define zpdesc_page(zp) (_Generic((zp), \
+ const struct zpdesc *: (const struct page *)(zp), \
+ struct zpdesc *: (struct page *)(zp)))
+
+/**
+ * zpdesc_folio - The folio allocated for a zpdesc
+ * @zp: The zpdesc.
+ *
+ * Zpdescs are descriptors for zpool memory. The zpool memory itself is
+ * allocated as folios that contain the zpool objects, and zpdesc uses specific
+ * fields in the first struct page of the folio - those fields are now accessed
+ * by struct zpdesc.
+ *
+ * It is occasionally necessary to convert back to a folio in order to
+ * communicate with the rest of the mm. Please use this helper function
+ * instead of casting yourself, as the implementation may change in the future.
+ */
+#define zpdesc_folio(zp) (_Generic((zp), \
+ const struct zpdesc *: (const struct folio *)(zp), \
+ struct zpdesc *: (struct folio *)(zp)))
+/**
+ * page_zpdesc - Converts from first struct page to zpdesc.
+ * @p: The first (either head of compound or single) page of zpdesc.
+ *
+ * A temporary wrapper to convert struct page to struct zpdesc in situations
+ * where we know the page is the compound head, or a single order-0 page.
+ *
+ * Long-term ideally everything would work with struct zpdesc directly or go
+ * through folio to struct zpdesc.
+ *
+ * Return: The zpdesc which contains this page
+ */
+#define page_zpdesc(p) (_Generic((p), \
+ const struct page *: (const struct zpdesc *)(p), \
+ struct page *: (struct zpdesc *)(p)))
+
+static inline void zpdesc_lock(struct zpdesc *zpdesc)
+{
+ folio_lock(zpdesc_folio(zpdesc));
+}
+
+static inline bool zpdesc_trylock(struct zpdesc *zpdesc)
+{
+ return folio_trylock(zpdesc_folio(zpdesc));
+}
+
+static inline void zpdesc_unlock(struct zpdesc *zpdesc)
+{
+ folio_unlock(zpdesc_folio(zpdesc));
+}
+
+static inline void zpdesc_wait_locked(struct zpdesc *zpdesc)
+{
+ folio_wait_locked(zpdesc_folio(zpdesc));
+}
+
+static inline void zpdesc_get(struct zpdesc *zpdesc)
+{
+ folio_get(zpdesc_folio(zpdesc));
+}
+
+static inline void zpdesc_put(struct zpdesc *zpdesc)
+{
+ folio_put(zpdesc_folio(zpdesc));
+}
+
+static inline void *kmap_local_zpdesc(struct zpdesc *zpdesc)
+{
+ return kmap_local_page(zpdesc_page(zpdesc));
+}
+
+static inline unsigned long zpdesc_pfn(struct zpdesc *zpdesc)
+{
+ return page_to_pfn(zpdesc_page(zpdesc));
+}
+
+static inline struct zpdesc *pfn_zpdesc(unsigned long pfn)
+{
+ return page_zpdesc(pfn_to_page(pfn));
+}
+
+static inline void __zpdesc_set_movable(struct zpdesc *zpdesc,
+ const struct movable_operations *mops)
+{
+ __SetPageMovable(zpdesc_page(zpdesc), mops);
+}
+
+static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc)
+{
+ __SetPageZsmalloc(zpdesc_page(zpdesc));
+}
+
+static inline void __zpdesc_clear_zsmalloc(struct zpdesc *zpdesc)
+{
+ __ClearPageZsmalloc(zpdesc_page(zpdesc));
+}
+
+static inline bool zpdesc_is_isolated(struct zpdesc *zpdesc)
+{
+ return PageIsolated(zpdesc_page(zpdesc));
+}
+
+static inline struct zone *zpdesc_zone(struct zpdesc *zpdesc)
+{
+ return page_zone(zpdesc_page(zpdesc));
+}
+
+static inline bool zpdesc_is_locked(struct zpdesc *zpdesc)
+{
+ return folio_test_locked(zpdesc_folio(zpdesc));
+}
+#endif
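
The ZPDESC_MATCH block above is the entire safety net for the overlay: if struct page and struct zpdesc ever disagree on a field offset, the build breaks instead of memory getting corrupted at run time. The same compile-time technique can be reproduced outside the kernel; the sketch below is a hypothetical, self-contained illustration (the mock types page_mock and desc_mock are invented and do not reflect the real struct page layout) that builds with any C11 compiler.

#include <assert.h>
#include <stddef.h>

/* Stand-in for struct page: only the fields the descriptor reuses. */
struct page_mock {
    unsigned long flags;
    void *lru_next, *lru_prev;          /* struct list_head */
    void *mapping;
    unsigned long __folio_index;
    unsigned long private;
    unsigned int page_type;
    int _refcount;
};

/* Stand-in for struct zpdesc: same layout, more meaningful names. */
struct desc_mock {
    unsigned long flags;
    void *lru_next, *lru_prev;
    unsigned long movable_ops;
    union {
        struct desc_mock *next;
        unsigned long handle;
    };
    void *zspage;
    unsigned int first_obj_offset;
    int _refcount;
};

/* Same trick as ZPDESC_MATCH: fail the build if the layouts drift apart. */
#define DESC_MATCH(pg, zp) \
    static_assert(offsetof(struct page_mock, pg) == \
                  offsetof(struct desc_mock, zp), #pg " vs " #zp)

DESC_MATCH(flags, flags);
DESC_MATCH(lru_next, lru_next);
DESC_MATCH(mapping, movable_ops);
DESC_MATCH(__folio_index, next);
DESC_MATCH(private, zspage);
DESC_MATCH(page_type, first_obj_offset);
DESC_MATCH(_refcount, _refcount);
#undef DESC_MATCH

static_assert(sizeof(struct desc_mock) <= sizeof(struct page_mock),
              "descriptor must not outgrow the page descriptor");

Keeping the asserts next to the struct definition means any later reshuffling of the fields is caught by the compiler rather than by a debugging session.
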
diff --git a/mm/zpool.c b/mm/zpool.c
index b9fda1fa857d..0a71d03369f1 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -95,7 +95,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
/**
* zpool_has_pool() - Check if the pool driver is available
- * @type: The type of the zpool to check (e.g. zbud, zsmalloc)
+ * @type: The type of the zpool to check (e.g. zsmalloc)
*
* This checks if the @type pool driver is available. This will try to load
* the requested module, if needed, but there is no guarantee the module will
@@ -130,7 +130,7 @@ EXPORT_SYMBOL(zpool_has_pool);
/**
* zpool_create_pool() - Create a new zpool
- * @type: The type of the zpool to create (e.g. zbud, zsmalloc)
+ * @type: The type of the zpool to create (e.g. zsmalloc)
* @name: The name of the zpool (e.g. zram0, zswap)
* @gfp: The GFP flags to use when allocating the pool.
*
@@ -221,41 +221,27 @@ const char *zpool_get_type(struct zpool *zpool)
}
/**
- * zpool_malloc_support_movable() - Check if the zpool supports
- * allocating movable memory
- * @zpool: The zpool to check
- *
- * This returns if the zpool supports allocating movable memory.
- *
- * Implementations must guarantee this to be thread-safe.
- *
- * Returns: true if the zpool supports allocating movable memory, false if not
- */
-bool zpool_malloc_support_movable(struct zpool *zpool)
-{
- return zpool->driver->malloc_support_movable;
-}
-
-/**
* zpool_malloc() - Allocate memory
* @zpool: The zpool to allocate from.
* @size: The amount of memory to allocate.
* @gfp: The GFP flags to use when allocating memory.
* @handle: Pointer to the handle to set
+ * @nid: The preferred node id.
*
* This allocates the requested amount of memory from the pool.
* The gfp flags will be used when allocating memory, if the
* implementation supports it. The provided @handle will be
- * set to the allocated object handle.
+ * set to the allocated object handle. The allocation will
+ * prefer the NUMA node specified by @nid.
*
* Implementations must guarantee this to be thread-safe.
*
* Returns: 0 on success, negative value on error.
*/
int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp,
- unsigned long *handle)
+ unsigned long *handle, const int nid)
{
- return zpool->driver->malloc(zpool->pool, size, gfp, handle);
+ return zpool->driver->malloc(zpool->pool, size, gfp, handle, nid);
}
/**
@@ -278,46 +264,51 @@ void zpool_free(struct zpool *zpool, unsigned long handle)
}
/**
- * zpool_map_handle() - Map a previously allocated handle into memory
+ * zpool_obj_read_begin() - Start reading from a previously allocated handle.
* @zpool: The zpool that the handle was allocated from
- * @handle: The handle to map
- * @mapmode: How the memory should be mapped
+ * @handle: The handle to read from
+ * @local_copy: A local buffer to use if needed.
*
- * This maps a previously allocated handle into memory. The @mapmode
- * param indicates to the implementation how the memory will be
- * used, i.e. read-only, write-only, read-write. If the
- * implementation does not support it, the memory will be treated
- * as read-write.
+ * This starts a read operation of a previously allocated handle. The passed
+ * @local_copy buffer may be used if needed, by copying the object memory into it.
+ * zpool_obj_read_end() MUST be called after the read is completed to undo any
+ * actions taken (e.g. release locks).
*
- * This may hold locks, disable interrupts, and/or preemption,
- * and the zpool_unmap_handle() must be called to undo those
- * actions. The code that uses the mapped handle should complete
- * its operations on the mapped handle memory quickly and unmap
- * as soon as possible. As the implementation may use per-cpu
- * data, multiple handles should not be mapped concurrently on
- * any cpu.
+ * Returns: A pointer to the handle memory to be read. If @local_copy is
+ * used, the returned pointer is @local_copy.
+ */
+void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle,
+ void *local_copy)
+{
+ return zpool->driver->obj_read_begin(zpool->pool, handle, local_copy);
+}
+
+/**
+ * zpool_obj_read_end() - Finish reading from a previously allocated handle.
+ * @zpool: The zpool that the handle was allocated from
+ * @handle: The handle to read from
+ * @handle_mem: The pointer returned by zpool_obj_read_begin()
*
- * Returns: A pointer to the handle's mapped memory area.
+ * Finishes a read operation previously started by zpool_obj_read_begin().
*/
-void *zpool_map_handle(struct zpool *zpool, unsigned long handle,
- enum zpool_mapmode mapmode)
+void zpool_obj_read_end(struct zpool *zpool, unsigned long handle,
+ void *handle_mem)
{
- return zpool->driver->map(zpool->pool, handle, mapmode);
+ zpool->driver->obj_read_end(zpool->pool, handle, handle_mem);
}
/**
- * zpool_unmap_handle() - Unmap a previously mapped handle
+ * zpool_obj_write() - Write to a previously allocated handle.
* @zpool: The zpool that the handle was allocated from
- * @handle: The handle to unmap
+ * @handle: The handle to write to
+ * @handle_mem: The memory to copy into the handle.
+ * @mem_len: The length of memory to be written.
*
- * This unmaps a previously mapped handle. Any locks or other
- * actions that the implementation took in zpool_map_handle()
- * will be undone here. The memory area returned from
- * zpool_map_handle() should no longer be used after this.
*/
-void zpool_unmap_handle(struct zpool *zpool, unsigned long handle)
+void zpool_obj_write(struct zpool *zpool, unsigned long handle,
+ void *handle_mem, size_t mem_len)
{
- zpool->driver->unmap(zpool->pool, handle);
+ zpool->driver->obj_write(zpool->pool, handle, handle_mem, mem_len);
}
/**
@@ -333,23 +324,5 @@ u64 zpool_get_total_pages(struct zpool *zpool)
return zpool->driver->total_pages(zpool->pool);
}
-/**
- * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped.
- * @zpool: The zpool to test
- *
- * Some allocators enter non-preemptible context in ->map() callback (e.g.
- * disable pagefaults) and exit that context in ->unmap(), which limits what
- * we can do with the mapped object. For instance, we cannot wait for
- * asynchronous crypto API to decompress such an object or take mutexes
- * since those will call into the scheduler. This function tells us whether
- * we use such an allocator.
- *
- * Returns: true if zpool can sleep; false otherwise.
- */
-bool zpool_can_sleep_mapped(struct zpool *zpool)
-{
- return zpool->driver->sleep_mapped;
-}
-
MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
MODULE_DESCRIPTION("Common API for compressed memory storage");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 64b66a4d3e6e..999b513c7fdf 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -13,30 +13,12 @@
* Released under the terms of GNU General Public License Version 2.0
*/
-/*
- * Following is how we use various fields and flags of underlying
- * struct page(s) to form a zspage.
- *
- * Usage of struct page fields:
- * page->private: points to zspage
- * page->index: links together all component pages of a zspage
- * For the huge page, this is always 0, so we use this field
- * to store handle.
- * page->page_type: PGTY_zsmalloc, lower 24 bits locate the first object
- * offset in a subpage of a zspage
- *
- * Usage of struct page flags:
- * PG_private: identifies the first component page
- * PG_owner_priv_1: identifies the huge component page
- *
- */
-
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
/*
* lock ordering:
* page_lock
- * pool->migrate_lock
+ * pool->lock
* class->lock
* zspage->lock
*/
@@ -44,17 +26,10 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
-#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
-#include <linux/pgtable.h>
-#include <asm/tlbflush.h>
-#include <linux/cpumask.h>
-#include <linux/cpu.h>
-#include <linux/vmalloc.h>
-#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/sprintf.h>
#include <linux/shrinker.h>
@@ -62,11 +37,9 @@
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
-#include <linux/migrate.h>
-#include <linux/wait.h>
-#include <linux/pagemap.h>
#include <linux/fs.h>
-#include <linux/local_lock.h>
+#include <linux/workqueue.h>
+#include "zpdesc.h"
#define ZSPAGE_MAGIC 0x58
@@ -240,11 +213,49 @@ struct zs_pool {
#ifdef CONFIG_COMPACTION
struct work_struct free_work;
#endif
- /* protect page/zspage migration */
- rwlock_t migrate_lock;
+ /* protect zspage migration/compaction */
+ rwlock_t lock;
atomic_t compaction_in_progress;
};
+static inline void zpdesc_set_first(struct zpdesc *zpdesc)
+{
+ SetPagePrivate(zpdesc_page(zpdesc));
+}
+
+static inline void zpdesc_inc_zone_page_state(struct zpdesc *zpdesc)
+{
+ inc_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES);
+}
+
+static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc)
+{
+ dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES);
+}
+
+static inline struct zpdesc *alloc_zpdesc(gfp_t gfp, const int nid)
+{
+ struct page *page = alloc_pages_node(nid, gfp, 0);
+
+ return page_zpdesc(page);
+}
+
+static inline void free_zpdesc(struct zpdesc *zpdesc)
+{
+ struct page *page = zpdesc_page(zpdesc);
+
+ __free_page(page);
+}
+
+#define ZS_PAGE_UNLOCKED 0
+#define ZS_PAGE_WRLOCKED -1
+
+struct zspage_lock {
+ spinlock_t lock;
+ int cnt;
+ struct lockdep_map dep_map;
+};
+
struct zspage {
struct {
unsigned int huge:HUGE_BITS;
@@ -254,18 +265,89 @@ struct zspage {
};
unsigned int inuse;
unsigned int freeobj;
- struct page *first_page;
+ struct zpdesc *first_zpdesc;
struct list_head list; /* fullness list */
struct zs_pool *pool;
- rwlock_t lock;
+ struct zspage_lock zsl;
};
-struct mapping_area {
- local_lock_t lock;
- char *vm_buf; /* copy buffer for objects that span pages */
- char *vm_addr; /* address of kmap_local_page()'ed pages */
- enum zs_mapmode vm_mm; /* mapping mode */
-};
+static void zspage_lock_init(struct zspage *zspage)
+{
+ static struct lock_class_key __key;
+ struct zspage_lock *zsl = &zspage->zsl;
+
+ lockdep_init_map(&zsl->dep_map, "zspage->lock", &__key, 0);
+ spin_lock_init(&zsl->lock);
+ zsl->cnt = ZS_PAGE_UNLOCKED;
+}
+
+/*
+ * The zspage lock can be held from atomic contexts, but it needs to remain
+ * preemptible when held for reading because it remains held outside of those
+ * atomic contexts; otherwise we would unnecessarily lose preemptibility.
+ *
+ * To achieve this, the following rules are enforced on readers and writers:
+ *
+ * - Writers are blocked by both writers and readers, while readers are only
+ * blocked by writers (i.e. normal rwlock semantics).
+ *
+ * - Writers are always atomic (to allow readers to spin waiting for them).
+ *
+ * - Writers always use trylock (as the lock may be held by sleeping readers).
+ *
+ * - Readers may spin on the lock (as they can only wait for atomic writers).
+ *
+ * - Readers may sleep while holding the lock (as writes only use trylock).
+ */
+static void zspage_read_lock(struct zspage *zspage)
+{
+ struct zspage_lock *zsl = &zspage->zsl;
+
+ rwsem_acquire_read(&zsl->dep_map, 0, 0, _RET_IP_);
+
+ spin_lock(&zsl->lock);
+ zsl->cnt++;
+ spin_unlock(&zsl->lock);
+
+ lock_acquired(&zsl->dep_map, _RET_IP_);
+}
+
+static void zspage_read_unlock(struct zspage *zspage)
+{
+ struct zspage_lock *zsl = &zspage->zsl;
+
+ rwsem_release(&zsl->dep_map, _RET_IP_);
+
+ spin_lock(&zsl->lock);
+ zsl->cnt--;
+ spin_unlock(&zsl->lock);
+}
+
+static __must_check bool zspage_write_trylock(struct zspage *zspage)
+{
+ struct zspage_lock *zsl = &zspage->zsl;
+
+ spin_lock(&zsl->lock);
+ if (zsl->cnt == ZS_PAGE_UNLOCKED) {
+ zsl->cnt = ZS_PAGE_WRLOCKED;
+ rwsem_acquire(&zsl->dep_map, 0, 1, _RET_IP_);
+ lock_acquired(&zsl->dep_map, _RET_IP_);
+ return true;
+ }
+
+ spin_unlock(&zsl->lock);
+ return false;
+}
+
+static void zspage_write_unlock(struct zspage *zspage)
+{
+ struct zspage_lock *zsl = &zspage->zsl;
+
+ rwsem_release(&zsl->dep_map, _RET_IP_);
+
+ zsl->cnt = ZS_PAGE_UNLOCKED;
+ spin_unlock(&zsl->lock);
+}
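
The rules in the comment above boil down to a reader count protected by a small internal lock, with ZS_PAGE_WRLOCKED (-1) standing for an active writer. A userspace analogue, with a pthread mutex standing in for the internal spinlock and without the lockdep annotations, might look like the sketch below; zspage_lock_sim and the *_sim helpers are invented names for illustration.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define UNLOCKED 0
#define WRLOCKED -1

struct zspage_lock_sim {
    pthread_mutex_t lock;   /* stands in for the internal spinlock */
    int cnt;                /* >0: readers, 0: free, -1: writer */
};

static void read_lock_sim(struct zspage_lock_sim *l)
{
    pthread_mutex_lock(&l->lock);
    l->cnt++;
    pthread_mutex_unlock(&l->lock);
}

static void read_unlock_sim(struct zspage_lock_sim *l)
{
    pthread_mutex_lock(&l->lock);
    l->cnt--;
    pthread_mutex_unlock(&l->lock);
}

static bool write_trylock_sim(struct zspage_lock_sim *l)
{
    pthread_mutex_lock(&l->lock);
    if (l->cnt == UNLOCKED) {
        l->cnt = WRLOCKED;
        return true;        /* keep the internal lock held, like the kernel */
    }
    pthread_mutex_unlock(&l->lock);
    return false;
}

static void write_unlock_sim(struct zspage_lock_sim *l)
{
    l->cnt = UNLOCKED;
    pthread_mutex_unlock(&l->lock);
}

int main(void)
{
    struct zspage_lock_sim l = { .lock = PTHREAD_MUTEX_INITIALIZER,
                                 .cnt = UNLOCKED };

    read_lock_sim(&l);
    printf("write_trylock with a reader held: %d\n", write_trylock_sim(&l));
    read_unlock_sim(&l);
    printf("write_trylock when free: %d\n", write_trylock_sim(&l));
    write_unlock_sim(&l);
    return 0;
}

Because write_trylock_sim() keeps the internal lock held on success, a reader arriving while a writer is active simply waits on that lock, which mirrors the rule that readers may spin on (always atomic) writers.
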
/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
static void SetZsHugePage(struct zspage *zspage)
@@ -278,12 +360,6 @@ static bool ZsHugePage(struct zspage *zspage)
return zspage->huge;
}
-static void migrate_lock_init(struct zspage *zspage);
-static void migrate_read_lock(struct zspage *zspage);
-static void migrate_read_unlock(struct zspage *zspage);
-static void migrate_write_lock(struct zspage *zspage);
-static void migrate_write_unlock(struct zspage *zspage);
-
#ifdef CONFIG_COMPACTION
static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
@@ -376,9 +452,9 @@ static void zs_zpool_destroy(void *pool)
}
static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
+ unsigned long *handle, const int nid)
{
- *handle = zs_malloc(pool, size, gfp);
+ *handle = zs_malloc(pool, size, gfp, nid);
if (IS_ERR_VALUE(*handle))
return PTR_ERR((void *)*handle);
@@ -389,29 +465,22 @@ static void zs_zpool_free(void *pool, unsigned long handle)
zs_free(pool, handle);
}
-static void *zs_zpool_map(void *pool, unsigned long handle,
- enum zpool_mapmode mm)
+static void *zs_zpool_obj_read_begin(void *pool, unsigned long handle,
+ void *local_copy)
{
- enum zs_mapmode zs_mm;
-
- switch (mm) {
- case ZPOOL_MM_RO:
- zs_mm = ZS_MM_RO;
- break;
- case ZPOOL_MM_WO:
- zs_mm = ZS_MM_WO;
- break;
- case ZPOOL_MM_RW:
- default:
- zs_mm = ZS_MM_RW;
- break;
- }
+ return zs_obj_read_begin(pool, handle, local_copy);
+}
- return zs_map_object(pool, handle, zs_mm);
+static void zs_zpool_obj_read_end(void *pool, unsigned long handle,
+ void *handle_mem)
+{
+ zs_obj_read_end(pool, handle, handle_mem);
}
-static void zs_zpool_unmap(void *pool, unsigned long handle)
+
+static void zs_zpool_obj_write(void *pool, unsigned long handle,
+ void *handle_mem, size_t mem_len)
{
- zs_unmap_object(pool, handle);
+ zs_obj_write(pool, handle, handle_mem, mem_len);
}
static u64 zs_zpool_total_pages(void *pool)
@@ -424,25 +493,20 @@ static struct zpool_driver zs_zpool_driver = {
.owner = THIS_MODULE,
.create = zs_zpool_create,
.destroy = zs_zpool_destroy,
- .malloc_support_movable = true,
.malloc = zs_zpool_malloc,
.free = zs_zpool_free,
- .map = zs_zpool_map,
- .unmap = zs_zpool_unmap,
+ .obj_read_begin = zs_zpool_obj_read_begin,
+ .obj_read_end = zs_zpool_obj_read_end,
+ .obj_write = zs_zpool_obj_write,
.total_pages = zs_zpool_total_pages,
};
MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */
-/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
-static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = {
- .lock = INIT_LOCAL_LOCK(lock),
-};
-
-static __maybe_unused int is_first_page(struct page *page)
+static inline bool __maybe_unused is_first_zpdesc(struct zpdesc *zpdesc)
{
- return PagePrivate(page);
+ return PagePrivate(zpdesc_page(zpdesc));
}
/* Protected by class->lock */
@@ -451,36 +515,35 @@ static inline int get_zspage_inuse(struct zspage *zspage)
return zspage->inuse;
}
-
static inline void mod_zspage_inuse(struct zspage *zspage, int val)
{
zspage->inuse += val;
}
-static inline struct page *get_first_page(struct zspage *zspage)
+static struct zpdesc *get_first_zpdesc(struct zspage *zspage)
{
- struct page *first_page = zspage->first_page;
+ struct zpdesc *first_zpdesc = zspage->first_zpdesc;
- VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
- return first_page;
+ VM_BUG_ON_PAGE(!is_first_zpdesc(first_zpdesc), zpdesc_page(first_zpdesc));
+ return first_zpdesc;
}
#define FIRST_OBJ_PAGE_TYPE_MASK 0xffffff
-static inline unsigned int get_first_obj_offset(struct page *page)
+static inline unsigned int get_first_obj_offset(struct zpdesc *zpdesc)
{
- VM_WARN_ON_ONCE(!PageZsmalloc(page));
- return page->page_type & FIRST_OBJ_PAGE_TYPE_MASK;
+ VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc)));
+ return zpdesc->first_obj_offset & FIRST_OBJ_PAGE_TYPE_MASK;
}
-static inline void set_first_obj_offset(struct page *page, unsigned int offset)
+static inline void set_first_obj_offset(struct zpdesc *zpdesc, unsigned int offset)
{
/* With 24 bits available, we can support offsets into 16 MiB pages. */
BUILD_BUG_ON(PAGE_SIZE > SZ_16M);
- VM_WARN_ON_ONCE(!PageZsmalloc(page));
+ VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc)));
VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK);
- page->page_type &= ~FIRST_OBJ_PAGE_TYPE_MASK;
- page->page_type |= offset & FIRST_OBJ_PAGE_TYPE_MASK;
+ zpdesc->first_obj_offset &= ~FIRST_OBJ_PAGE_TYPE_MASK;
+ zpdesc->first_obj_offset |= offset & FIRST_OBJ_PAGE_TYPE_MASK;
}
static inline unsigned int get_freeobj(struct zspage *zspage)
@@ -733,52 +796,52 @@ out:
return newfg;
}
-static struct zspage *get_zspage(struct page *page)
+static struct zspage *get_zspage(struct zpdesc *zpdesc)
{
- struct zspage *zspage = (struct zspage *)page_private(page);
+ struct zspage *zspage = zpdesc->zspage;
BUG_ON(zspage->magic != ZSPAGE_MAGIC);
return zspage;
}
-static struct page *get_next_page(struct page *page)
+static struct zpdesc *get_next_zpdesc(struct zpdesc *zpdesc)
{
- struct zspage *zspage = get_zspage(page);
+ struct zspage *zspage = get_zspage(zpdesc);
if (unlikely(ZsHugePage(zspage)))
return NULL;
- return (struct page *)page->index;
+ return zpdesc->next;
}
/**
- * obj_to_location - get (<page>, <obj_idx>) from encoded object value
+ * obj_to_location - get (<zpdesc>, <obj_idx>) from encoded object value
* @obj: the encoded object value
- * @page: page object resides in zspage
+ * @zpdesc: zpdesc object resides in zspage
* @obj_idx: object index
*/
-static void obj_to_location(unsigned long obj, struct page **page,
+static void obj_to_location(unsigned long obj, struct zpdesc **zpdesc,
unsigned int *obj_idx)
{
- *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+ *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS);
*obj_idx = (obj & OBJ_INDEX_MASK);
}
-static void obj_to_page(unsigned long obj, struct page **page)
+static void obj_to_zpdesc(unsigned long obj, struct zpdesc **zpdesc)
{
- *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+ *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS);
}
/**
- * location_to_obj - get obj value encoded from (<page>, <obj_idx>)
- * @page: page object resides in zspage
+ * location_to_obj - get obj value encoded from (<zpdesc>, <obj_idx>)
+ * @zpdesc: zpdesc object resides in zspage
* @obj_idx: object index
*/
-static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
+static unsigned long location_to_obj(struct zpdesc *zpdesc, unsigned int obj_idx)
{
unsigned long obj;
- obj = page_to_pfn(page) << OBJ_INDEX_BITS;
+ obj = zpdesc_pfn(zpdesc) << OBJ_INDEX_BITS;
obj |= obj_idx & OBJ_INDEX_MASK;
return obj;
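
location_to_obj() and obj_to_location() are plain bit packing: the zpdesc's PFN occupies the high bits and the object index the low OBJ_INDEX_BITS, so a single unsigned long round-trips both values. A minimal standalone sketch of that arithmetic (the bit width below is illustrative, not the kernel's actual OBJ_INDEX_BITS):

#include <assert.h>
#include <stdio.h>

#define OBJ_INDEX_BITS 11UL                     /* illustrative only */
#define OBJ_INDEX_MASK ((1UL << OBJ_INDEX_BITS) - 1)

static unsigned long encode(unsigned long pfn, unsigned int obj_idx)
{
    return (pfn << OBJ_INDEX_BITS) | (obj_idx & OBJ_INDEX_MASK);
}

static void decode(unsigned long obj, unsigned long *pfn, unsigned int *obj_idx)
{
    *pfn = obj >> OBJ_INDEX_BITS;
    *obj_idx = obj & OBJ_INDEX_MASK;
}

int main(void)
{
    unsigned long pfn, obj = encode(0x12345, 42);
    unsigned int idx;

    decode(obj, &pfn, &idx);
    assert(pfn == 0x12345 && idx == 42);
    printf("obj=%#lx -> pfn=%#lx idx=%u\n", obj, pfn, idx);
    return 0;
}
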
@@ -789,15 +852,15 @@ static unsigned long handle_to_obj(unsigned long handle)
return *(unsigned long *)handle;
}
-static inline bool obj_allocated(struct page *page, void *obj,
+static inline bool obj_allocated(struct zpdesc *zpdesc, void *obj,
unsigned long *phandle)
{
unsigned long handle;
- struct zspage *zspage = get_zspage(page);
+ struct zspage *zspage = get_zspage(zpdesc);
if (unlikely(ZsHugePage(zspage))) {
- VM_BUG_ON_PAGE(!is_first_page(page), page);
- handle = page->index;
+ VM_BUG_ON_PAGE(!is_first_zpdesc(zpdesc), zpdesc_page(zpdesc));
+ handle = zpdesc->handle;
} else
handle = *(unsigned long *)obj;
@@ -809,22 +872,24 @@ static inline bool obj_allocated(struct page *page, void *obj,
return true;
}
-static void reset_page(struct page *page)
+static void reset_zpdesc(struct zpdesc *zpdesc)
{
+ struct page *page = zpdesc_page(zpdesc);
+
__ClearPageMovable(page);
ClearPagePrivate(page);
- set_page_private(page, 0);
- page->index = 0;
+ zpdesc->zspage = NULL;
+ zpdesc->next = NULL;
__ClearPageZsmalloc(page);
}
static int trylock_zspage(struct zspage *zspage)
{
- struct page *cursor, *fail;
+ struct zpdesc *cursor, *fail;
- for (cursor = get_first_page(zspage); cursor != NULL; cursor =
- get_next_page(cursor)) {
- if (!trylock_page(cursor)) {
+ for (cursor = get_first_zpdesc(zspage); cursor != NULL; cursor =
+ get_next_zpdesc(cursor)) {
+ if (!zpdesc_trylock(cursor)) {
fail = cursor;
goto unlock;
}
@@ -832,9 +897,9 @@ static int trylock_zspage(struct zspage *zspage)
return 1;
unlock:
- for (cursor = get_first_page(zspage); cursor != fail; cursor =
- get_next_page(cursor))
- unlock_page(cursor);
+ for (cursor = get_first_zpdesc(zspage); cursor != fail; cursor =
+ get_next_zpdesc(cursor))
+ zpdesc_unlock(cursor);
return 0;
}
@@ -842,23 +907,23 @@ unlock:
static void __free_zspage(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
- struct page *page, *next;
+ struct zpdesc *zpdesc, *next;
assert_spin_locked(&class->lock);
VM_BUG_ON(get_zspage_inuse(zspage));
VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0);
- next = page = get_first_page(zspage);
+ next = zpdesc = get_first_zpdesc(zspage);
do {
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- next = get_next_page(page);
- reset_page(page);
- unlock_page(page);
- dec_zone_page_state(page, NR_ZSPAGES);
- put_page(page);
- page = next;
- } while (page != NULL);
+ VM_BUG_ON_PAGE(!zpdesc_is_locked(zpdesc), zpdesc_page(zpdesc));
+ next = get_next_zpdesc(zpdesc);
+ reset_zpdesc(zpdesc);
+ zpdesc_unlock(zpdesc);
+ zpdesc_dec_zone_page_state(zpdesc);
+ zpdesc_put(zpdesc);
+ zpdesc = next;
+ } while (zpdesc != NULL);
cache_free_zspage(pool, zspage);
@@ -891,16 +956,16 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
{
unsigned int freeobj = 1;
unsigned long off = 0;
- struct page *page = get_first_page(zspage);
+ struct zpdesc *zpdesc = get_first_zpdesc(zspage);
- while (page) {
- struct page *next_page;
+ while (zpdesc) {
+ struct zpdesc *next_zpdesc;
struct link_free *link;
void *vaddr;
- set_first_obj_offset(page, off);
+ set_first_obj_offset(zpdesc, off);
- vaddr = kmap_local_page(page);
+ vaddr = kmap_local_zpdesc(zpdesc);
link = (struct link_free *)vaddr + off / sizeof(*link);
while ((off += class->size) < PAGE_SIZE) {
@@ -913,8 +978,8 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
* page, which must point to the first object on the next
* page (if present)
*/
- next_page = get_next_page(page);
- if (next_page) {
+ next_zpdesc = get_next_zpdesc(zpdesc);
+ if (next_zpdesc) {
link->next = freeobj++ << OBJ_TAG_BITS;
} else {
/*
@@ -924,7 +989,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
link->next = -1UL << OBJ_TAG_BITS;
}
kunmap_local(vaddr);
- page = next_page;
+ zpdesc = next_zpdesc;
off %= PAGE_SIZE;
}
@@ -932,35 +997,35 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
}
static void create_page_chain(struct size_class *class, struct zspage *zspage,
- struct page *pages[])
+ struct zpdesc *zpdescs[])
{
int i;
- struct page *page;
- struct page *prev_page = NULL;
- int nr_pages = class->pages_per_zspage;
+ struct zpdesc *zpdesc;
+ struct zpdesc *prev_zpdesc = NULL;
+ int nr_zpdescs = class->pages_per_zspage;
/*
* Allocate individual pages and link them together as:
- * 1. all pages are linked together using page->index
- * 2. each sub-page point to zspage using page->private
+ * 1. all pages are linked together using zpdesc->next
+ * 2. each sub-page points to the zspage using zpdesc->zspage
*
- * we set PG_private to identify the first page (i.e. no other sub-page
+ * we set PG_private to identify the first zpdesc (i.e. no other zpdesc
* has this flag set).
*/
- for (i = 0; i < nr_pages; i++) {
- page = pages[i];
- set_page_private(page, (unsigned long)zspage);
- page->index = 0;
+ for (i = 0; i < nr_zpdescs; i++) {
+ zpdesc = zpdescs[i];
+ zpdesc->zspage = zspage;
+ zpdesc->next = NULL;
if (i == 0) {
- zspage->first_page = page;
- SetPagePrivate(page);
+ zspage->first_zpdesc = zpdesc;
+ zpdesc_set_first(zpdesc);
if (unlikely(class->objs_per_zspage == 1 &&
class->pages_per_zspage == 1))
SetZsHugePage(zspage);
} else {
- prev_page->index = (unsigned long)page;
+ prev_zpdesc->next = zpdesc;
}
- prev_page = page;
+ prev_zpdesc = zpdesc;
}
}
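
The linking rules spelled out in the comment inside create_page_chain() (every descriptor gets a back-pointer to its zspage, descriptors are chained through ->next, and only the first one is flagged) can be exercised in a tiny standalone sketch; the *_sim types below are invented for illustration and use a plain bool where the kernel uses PG_private.

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct zspage_sim;

struct desc_sim {
    struct zspage_sim *zspage;  /* back-pointer, like zpdesc->zspage */
    struct desc_sim *next;      /* chain, like zpdesc->next */
    bool first;                 /* stands in for PG_private */
};

struct zspage_sim {
    struct desc_sim *first_desc;
};

static void create_chain(struct zspage_sim *zspage,
                         struct desc_sim *descs, int nr)
{
    struct desc_sim *prev = NULL;

    for (int i = 0; i < nr; i++) {
        descs[i].zspage = zspage;
        descs[i].next = NULL;
        descs[i].first = false;
        if (i == 0) {
            zspage->first_desc = &descs[0];
            descs[0].first = true;
        } else {
            prev->next = &descs[i];
        }
        prev = &descs[i];
    }
}

int main(void)
{
    struct zspage_sim zspage;
    struct desc_sim descs[3];

    create_chain(&zspage, descs, 3);
    assert(zspage.first_desc == &descs[0] && descs[0].first);
    assert(descs[0].next == &descs[1] && descs[1].next == &descs[2]);
    assert(descs[2].next == NULL && descs[2].zspage == &zspage);
    return 0;
}
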
@@ -968,42 +1033,42 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage,
* Allocate a zspage for the given size class
*/
static struct zspage *alloc_zspage(struct zs_pool *pool,
- struct size_class *class,
- gfp_t gfp)
+ struct size_class *class,
+ gfp_t gfp, const int nid)
{
int i;
- struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE];
+ struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE];
struct zspage *zspage = cache_alloc_zspage(pool, gfp);
if (!zspage)
return NULL;
zspage->magic = ZSPAGE_MAGIC;
- migrate_lock_init(zspage);
+ zspage->pool = pool;
+ zspage->class = class->index;
+ zspage_lock_init(zspage);
for (i = 0; i < class->pages_per_zspage; i++) {
- struct page *page;
+ struct zpdesc *zpdesc;
- page = alloc_page(gfp);
- if (!page) {
+ zpdesc = alloc_zpdesc(gfp, nid);
+ if (!zpdesc) {
while (--i >= 0) {
- dec_zone_page_state(pages[i], NR_ZSPAGES);
- __ClearPageZsmalloc(pages[i]);
- __free_page(pages[i]);
+ zpdesc_dec_zone_page_state(zpdescs[i]);
+ __zpdesc_clear_zsmalloc(zpdescs[i]);
+ free_zpdesc(zpdescs[i]);
}
cache_free_zspage(pool, zspage);
return NULL;
}
- __SetPageZsmalloc(page);
+ __zpdesc_set_zsmalloc(zpdesc);
- inc_zone_page_state(page, NR_ZSPAGES);
- pages[i] = page;
+ zpdesc_inc_zone_page_state(zpdesc);
+ zpdescs[i] = zpdesc;
}
- create_page_chain(class, zspage, pages);
+ create_page_chain(class, zspage, zpdescs);
init_zspage(class, zspage);
- zspage->pool = pool;
- zspage->class = class->index;
return zspage;
}
@@ -1023,93 +1088,6 @@ static struct zspage *find_get_zspage(struct size_class *class)
return zspage;
}
-static inline int __zs_cpu_up(struct mapping_area *area)
-{
- /*
- * Make sure we don't leak memory if a cpu UP notification
- * and zs_init() race and both call zs_cpu_up() on the same cpu
- */
- if (area->vm_buf)
- return 0;
- area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
- if (!area->vm_buf)
- return -ENOMEM;
- return 0;
-}
-
-static inline void __zs_cpu_down(struct mapping_area *area)
-{
- kfree(area->vm_buf);
- area->vm_buf = NULL;
-}
-
-static void *__zs_map_object(struct mapping_area *area,
- struct page *pages[2], int off, int size)
-{
- size_t sizes[2];
- char *buf = area->vm_buf;
-
- /* disable page faults to match kmap_local_page() return conditions */
- pagefault_disable();
-
- /* no read fastpath */
- if (area->vm_mm == ZS_MM_WO)
- goto out;
-
- sizes[0] = PAGE_SIZE - off;
- sizes[1] = size - sizes[0];
-
- /* copy object to per-cpu buffer */
- memcpy_from_page(buf, pages[0], off, sizes[0]);
- memcpy_from_page(buf + sizes[0], pages[1], 0, sizes[1]);
-out:
- return area->vm_buf;
-}
-
-static void __zs_unmap_object(struct mapping_area *area,
- struct page *pages[2], int off, int size)
-{
- size_t sizes[2];
- char *buf;
-
- /* no write fastpath */
- if (area->vm_mm == ZS_MM_RO)
- goto out;
-
- buf = area->vm_buf;
- buf = buf + ZS_HANDLE_SIZE;
- size -= ZS_HANDLE_SIZE;
- off += ZS_HANDLE_SIZE;
-
- sizes[0] = PAGE_SIZE - off;
- sizes[1] = size - sizes[0];
-
- /* copy per-cpu buffer to object */
- memcpy_to_page(pages[0], off, buf, sizes[0]);
- memcpy_to_page(pages[1], 0, buf + sizes[0], sizes[1]);
-
-out:
- /* enable page faults to match kunmap_local() return conditions */
- pagefault_enable();
-}
-
-static int zs_cpu_prepare(unsigned int cpu)
-{
- struct mapping_area *area;
-
- area = &per_cpu(zs_map_area, cpu);
- return __zs_cpu_up(area);
-}
-
-static int zs_cpu_dead(unsigned int cpu)
-{
- struct mapping_area *area;
-
- area = &per_cpu(zs_map_area, cpu);
- __zs_cpu_down(area);
- return 0;
-}
-
static bool can_merge(struct size_class *prev, int pages_per_zspage,
int objs_per_zspage)
{
@@ -1157,116 +1135,130 @@ unsigned long zs_get_total_pages(struct zs_pool *pool)
}
EXPORT_SYMBOL_GPL(zs_get_total_pages);
-/**
- * zs_map_object - get address of allocated object from handle.
- * @pool: pool from which the object was allocated
- * @handle: handle returned from zs_malloc
- * @mm: mapping mode to use
- *
- * Before using an object allocated from zs_malloc, it must be mapped using
- * this function. When done with the object, it must be unmapped using
- * zs_unmap_object.
- *
- * Only one object can be mapped per cpu at a time. There is no protection
- * against nested mappings.
- *
- * This function returns with preemption and page faults disabled.
- */
-void *zs_map_object(struct zs_pool *pool, unsigned long handle,
- enum zs_mapmode mm)
+void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle,
+ void *local_copy)
{
struct zspage *zspage;
- struct page *page;
+ struct zpdesc *zpdesc;
unsigned long obj, off;
unsigned int obj_idx;
-
struct size_class *class;
- struct mapping_area *area;
- struct page *pages[2];
- void *ret;
-
- /*
- * Because we use per-cpu mapping areas shared among the
- * pools/users, we can't allow mapping in interrupt context
- * because it can corrupt another users mappings.
- */
- BUG_ON(in_interrupt());
+ void *addr;
- /* It guarantees it can get zspage from handle safely */
- read_lock(&pool->migrate_lock);
+ /* Guarantee we can get zspage from handle safely */
+ read_lock(&pool->lock);
obj = handle_to_obj(handle);
- obj_to_location(obj, &page, &obj_idx);
- zspage = get_zspage(page);
+ obj_to_location(obj, &zpdesc, &obj_idx);
+ zspage = get_zspage(zpdesc);
- /*
- * migration cannot move any zpages in this zspage. Here, class->lock
- * is too heavy since callers would take some time until they calls
- * zs_unmap_object API so delegate the locking from class to zspage
- * which is smaller granularity.
- */
- migrate_read_lock(zspage);
- read_unlock(&pool->migrate_lock);
+ /* Make sure migration doesn't move any pages in this zspage */
+ zspage_read_lock(zspage);
+ read_unlock(&pool->lock);
class = zspage_class(pool, zspage);
off = offset_in_page(class->size * obj_idx);
- local_lock(&zs_map_area.lock);
- area = this_cpu_ptr(&zs_map_area);
- area->vm_mm = mm;
if (off + class->size <= PAGE_SIZE) {
/* this object is contained entirely within a page */
- area->vm_addr = kmap_local_page(page);
- ret = area->vm_addr + off;
- goto out;
+ addr = kmap_local_zpdesc(zpdesc);
+ addr += off;
+ } else {
+ size_t sizes[2];
+
+ /* this object spans two pages */
+ sizes[0] = PAGE_SIZE - off;
+ sizes[1] = class->size - sizes[0];
+ addr = local_copy;
+
+ memcpy_from_page(addr, zpdesc_page(zpdesc),
+ off, sizes[0]);
+ zpdesc = get_next_zpdesc(zpdesc);
+ memcpy_from_page(addr + sizes[0],
+ zpdesc_page(zpdesc),
+ 0, sizes[1]);
}
- /* this object spans two pages */
- pages[0] = page;
- pages[1] = get_next_page(page);
- BUG_ON(!pages[1]);
-
- ret = __zs_map_object(area, pages, off, class->size);
-out:
- if (likely(!ZsHugePage(zspage)))
- ret += ZS_HANDLE_SIZE;
+ if (!ZsHugePage(zspage))
+ addr += ZS_HANDLE_SIZE;
- return ret;
+ return addr;
}
-EXPORT_SYMBOL_GPL(zs_map_object);
+EXPORT_SYMBOL_GPL(zs_obj_read_begin);
-void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
+void zs_obj_read_end(struct zs_pool *pool, unsigned long handle,
+ void *handle_mem)
{
struct zspage *zspage;
- struct page *page;
+ struct zpdesc *zpdesc;
unsigned long obj, off;
unsigned int obj_idx;
-
struct size_class *class;
- struct mapping_area *area;
obj = handle_to_obj(handle);
- obj_to_location(obj, &page, &obj_idx);
- zspage = get_zspage(page);
+ obj_to_location(obj, &zpdesc, &obj_idx);
+ zspage = get_zspage(zpdesc);
class = zspage_class(pool, zspage);
off = offset_in_page(class->size * obj_idx);
- area = this_cpu_ptr(&zs_map_area);
- if (off + class->size <= PAGE_SIZE)
- kunmap_local(area->vm_addr);
- else {
- struct page *pages[2];
+ if (off + class->size <= PAGE_SIZE) {
+ if (!ZsHugePage(zspage))
+ off += ZS_HANDLE_SIZE;
+ handle_mem -= off;
+ kunmap_local(handle_mem);
+ }
+
+ zspage_read_unlock(zspage);
+}
+EXPORT_SYMBOL_GPL(zs_obj_read_end);
+
+void zs_obj_write(struct zs_pool *pool, unsigned long handle,
+ void *handle_mem, size_t mem_len)
+{
+ struct zspage *zspage;
+ struct zpdesc *zpdesc;
+ unsigned long obj, off;
+ unsigned int obj_idx;
+ struct size_class *class;
+
+ /* Guarantee we can get zspage from handle safely */
+ read_lock(&pool->lock);
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &zpdesc, &obj_idx);
+ zspage = get_zspage(zpdesc);
+
+ /* Make sure migration doesn't move any pages in this zspage */
+ zspage_read_lock(zspage);
+ read_unlock(&pool->lock);
+
+ class = zspage_class(pool, zspage);
+ off = offset_in_page(class->size * obj_idx);
- pages[0] = page;
- pages[1] = get_next_page(page);
- BUG_ON(!pages[1]);
+ if (!ZsHugePage(zspage))
+ off += ZS_HANDLE_SIZE;
- __zs_unmap_object(area, pages, off, class->size);
+ if (off + mem_len <= PAGE_SIZE) {
+ /* this object is contained entirely within a page */
+ void *dst = kmap_local_zpdesc(zpdesc);
+
+ memcpy(dst + off, handle_mem, mem_len);
+ kunmap_local(dst);
+ } else {
+ /* this object spans two pages */
+ size_t sizes[2];
+
+ sizes[0] = PAGE_SIZE - off;
+ sizes[1] = mem_len - sizes[0];
+
+ memcpy_to_page(zpdesc_page(zpdesc), off,
+ handle_mem, sizes[0]);
+ zpdesc = get_next_zpdesc(zpdesc);
+ memcpy_to_page(zpdesc_page(zpdesc), 0,
+ handle_mem + sizes[0], sizes[1]);
}
- local_unlock(&zs_map_area.lock);
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
}
-EXPORT_SYMBOL_GPL(zs_unmap_object);
+EXPORT_SYMBOL_GPL(zs_obj_write);
/**
* zs_huge_class_size() - Returns the size (in bytes) of the first huge
@@ -1290,12 +1282,12 @@ EXPORT_SYMBOL_GPL(zs_huge_class_size);
static unsigned long obj_malloc(struct zs_pool *pool,
struct zspage *zspage, unsigned long handle)
{
- int i, nr_page, offset;
+ int i, nr_zpdesc, offset;
unsigned long obj;
struct link_free *link;
struct size_class *class;
- struct page *m_page;
+ struct zpdesc *m_zpdesc;
unsigned long m_offset;
void *vaddr;
@@ -1303,27 +1295,26 @@ static unsigned long obj_malloc(struct zs_pool *pool,
obj = get_freeobj(zspage);
offset = obj * class->size;
- nr_page = offset >> PAGE_SHIFT;
+ nr_zpdesc = offset >> PAGE_SHIFT;
m_offset = offset_in_page(offset);
- m_page = get_first_page(zspage);
+ m_zpdesc = get_first_zpdesc(zspage);
- for (i = 0; i < nr_page; i++)
- m_page = get_next_page(m_page);
+ for (i = 0; i < nr_zpdesc; i++)
+ m_zpdesc = get_next_zpdesc(m_zpdesc);
- vaddr = kmap_local_page(m_page);
+ vaddr = kmap_local_zpdesc(m_zpdesc);
link = (struct link_free *)vaddr + m_offset / sizeof(*link);
set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
if (likely(!ZsHugePage(zspage)))
/* record handle in the header of allocated chunk */
link->handle = handle | OBJ_ALLOCATED_TAG;
else
- /* record handle to page->index */
- zspage->first_page->index = handle | OBJ_ALLOCATED_TAG;
+ zspage->first_zpdesc->handle = handle | OBJ_ALLOCATED_TAG;
kunmap_local(vaddr);
mod_zspage_inuse(zspage, 1);
- obj = location_to_obj(m_page, obj);
+ obj = location_to_obj(m_zpdesc, obj);
record_obj(handle, obj);
return obj;
@@ -1335,12 +1326,14 @@ static unsigned long obj_malloc(struct zs_pool *pool,
* @pool: pool to allocate from
* @size: size of block to allocate
* @gfp: gfp flags when allocating object
+ * @nid: The preferred node id on which to allocate a new zspage (if needed)
*
* On success, handle to the allocated object is returned,
* otherwise an ERR_PTR().
* Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
*/
-unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
+ const int nid)
{
unsigned long handle;
struct size_class *class;
@@ -1375,7 +1368,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
spin_unlock(&class->lock);
- zspage = alloc_zspage(pool, class, gfp);
+ zspage = alloc_zspage(pool, class, gfp, nid);
if (!zspage) {
cache_free_handle(pool, handle);
return (unsigned long)ERR_PTR(-ENOMEM);
@@ -1402,23 +1395,24 @@ static void obj_free(int class_size, unsigned long obj)
{
struct link_free *link;
struct zspage *zspage;
- struct page *f_page;
+ struct zpdesc *f_zpdesc;
unsigned long f_offset;
unsigned int f_objidx;
void *vaddr;
- obj_to_location(obj, &f_page, &f_objidx);
+
+ obj_to_location(obj, &f_zpdesc, &f_objidx);
f_offset = offset_in_page(class_size * f_objidx);
- zspage = get_zspage(f_page);
+ zspage = get_zspage(f_zpdesc);
- vaddr = kmap_local_page(f_page);
+ vaddr = kmap_local_zpdesc(f_zpdesc);
link = (struct link_free *)(vaddr + f_offset);
/* Insert this object in containing zspage's freelist */
if (likely(!ZsHugePage(zspage)))
link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
else
- f_page->index = 0;
+ f_zpdesc->handle = 0;
set_freeobj(zspage, f_objidx);
kunmap_local(vaddr);
@@ -1428,7 +1422,7 @@ static void obj_free(int class_size, unsigned long obj)
void zs_free(struct zs_pool *pool, unsigned long handle)
{
struct zspage *zspage;
- struct page *f_page;
+ struct zpdesc *f_zpdesc;
unsigned long obj;
struct size_class *class;
int fullness;
@@ -1437,16 +1431,16 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
return;
/*
- * The pool->migrate_lock protects the race with zpage's migration
+ * The pool->lock protects the race with zpage's migration
* so it's safe to get the page from handle.
*/
- read_lock(&pool->migrate_lock);
+ read_lock(&pool->lock);
obj = handle_to_obj(handle);
- obj_to_page(obj, &f_page);
- zspage = get_zspage(f_page);
+ obj_to_zpdesc(obj, &f_zpdesc);
+ zspage = get_zspage(f_zpdesc);
class = zspage_class(pool, zspage);
spin_lock(&class->lock);
- read_unlock(&pool->migrate_lock);
+ read_unlock(&pool->lock);
class_stat_sub(class, ZS_OBJS_INUSE, 1);
obj_free(class->size, obj);
@@ -1463,7 +1457,7 @@ EXPORT_SYMBOL_GPL(zs_free);
static void zs_object_copy(struct size_class *class, unsigned long dst,
unsigned long src)
{
- struct page *s_page, *d_page;
+ struct zpdesc *s_zpdesc, *d_zpdesc;
unsigned int s_objidx, d_objidx;
unsigned long s_off, d_off;
void *s_addr, *d_addr;
@@ -1472,8 +1466,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
s_size = d_size = class->size;
- obj_to_location(src, &s_page, &s_objidx);
- obj_to_location(dst, &d_page, &d_objidx);
+ obj_to_location(src, &s_zpdesc, &s_objidx);
+ obj_to_location(dst, &d_zpdesc, &d_objidx);
s_off = offset_in_page(class->size * s_objidx);
d_off = offset_in_page(class->size * d_objidx);
@@ -1484,8 +1478,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
if (d_off + class->size > PAGE_SIZE)
d_size = PAGE_SIZE - d_off;
- s_addr = kmap_local_page(s_page);
- d_addr = kmap_local_page(d_page);
+ s_addr = kmap_local_zpdesc(s_zpdesc);
+ d_addr = kmap_local_zpdesc(d_zpdesc);
while (1) {
size = min(s_size, d_size);
@@ -1510,17 +1504,17 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
if (s_off >= PAGE_SIZE) {
kunmap_local(d_addr);
kunmap_local(s_addr);
- s_page = get_next_page(s_page);
- s_addr = kmap_local_page(s_page);
- d_addr = kmap_local_page(d_page);
+ s_zpdesc = get_next_zpdesc(s_zpdesc);
+ s_addr = kmap_local_zpdesc(s_zpdesc);
+ d_addr = kmap_local_zpdesc(d_zpdesc);
s_size = class->size - written;
s_off = 0;
}
if (d_off >= PAGE_SIZE) {
kunmap_local(d_addr);
- d_page = get_next_page(d_page);
- d_addr = kmap_local_page(d_page);
+ d_zpdesc = get_next_zpdesc(d_zpdesc);
+ d_addr = kmap_local_zpdesc(d_zpdesc);
d_size = class->size - written;
d_off = 0;
}
@@ -1535,18 +1529,18 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
* return handle.
*/
static unsigned long find_alloced_obj(struct size_class *class,
- struct page *page, int *obj_idx)
+ struct zpdesc *zpdesc, int *obj_idx)
{
unsigned int offset;
int index = *obj_idx;
unsigned long handle = 0;
- void *addr = kmap_local_page(page);
+ void *addr = kmap_local_zpdesc(zpdesc);
- offset = get_first_obj_offset(page);
+ offset = get_first_obj_offset(zpdesc);
offset += class->size * index;
while (offset < PAGE_SIZE) {
- if (obj_allocated(page, addr + offset, &handle))
+ if (obj_allocated(zpdesc, addr + offset, &handle))
break;
offset += class->size;
@@ -1566,14 +1560,14 @@ static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage,
unsigned long used_obj, free_obj;
unsigned long handle;
int obj_idx = 0;
- struct page *s_page = get_first_page(src_zspage);
+ struct zpdesc *s_zpdesc = get_first_zpdesc(src_zspage);
struct size_class *class = pool->size_class[src_zspage->class];
while (1) {
- handle = find_alloced_obj(class, s_page, &obj_idx);
+ handle = find_alloced_obj(class, s_zpdesc, &obj_idx);
if (!handle) {
- s_page = get_next_page(s_page);
- if (!s_page)
+ s_zpdesc = get_next_zpdesc(s_zpdesc);
+ if (!s_zpdesc)
break;
obj_idx = 0;
continue;
@@ -1653,93 +1647,70 @@ static int putback_zspage(struct size_class *class, struct zspage *zspage)
*/
static void lock_zspage(struct zspage *zspage)
{
- struct page *curr_page, *page;
+ struct zpdesc *curr_zpdesc, *zpdesc;
/*
* Pages we haven't locked yet can be migrated off the list while we're
* trying to lock them, so we need to be careful and only attempt to
- * lock each page under migrate_read_lock(). Otherwise, the page we lock
+ * lock each page under zspage_read_lock(). Otherwise, the page we lock
* may no longer belong to the zspage. This means that we may wait for
* the wrong page to unlock, so we must take a reference to the page
- * prior to waiting for it to unlock outside migrate_read_lock().
+ * prior to waiting for it to unlock outside zspage_read_lock().
*/
while (1) {
- migrate_read_lock(zspage);
- page = get_first_page(zspage);
- if (trylock_page(page))
+ zspage_read_lock(zspage);
+ zpdesc = get_first_zpdesc(zspage);
+ if (zpdesc_trylock(zpdesc))
break;
- get_page(page);
- migrate_read_unlock(zspage);
- wait_on_page_locked(page);
- put_page(page);
+ zpdesc_get(zpdesc);
+ zspage_read_unlock(zspage);
+ zpdesc_wait_locked(zpdesc);
+ zpdesc_put(zpdesc);
}
- curr_page = page;
- while ((page = get_next_page(curr_page))) {
- if (trylock_page(page)) {
- curr_page = page;
+ curr_zpdesc = zpdesc;
+ while ((zpdesc = get_next_zpdesc(curr_zpdesc))) {
+ if (zpdesc_trylock(zpdesc)) {
+ curr_zpdesc = zpdesc;
} else {
- get_page(page);
- migrate_read_unlock(zspage);
- wait_on_page_locked(page);
- put_page(page);
- migrate_read_lock(zspage);
+ zpdesc_get(zpdesc);
+ zspage_read_unlock(zspage);
+ zpdesc_wait_locked(zpdesc);
+ zpdesc_put(zpdesc);
+ zspage_read_lock(zspage);
}
}
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
}
#endif /* CONFIG_COMPACTION */
-static void migrate_lock_init(struct zspage *zspage)
-{
- rwlock_init(&zspage->lock);
-}
-
-static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock)
-{
- read_lock(&zspage->lock);
-}
-
-static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
-{
- read_unlock(&zspage->lock);
-}
-
-static void migrate_write_lock(struct zspage *zspage)
-{
- write_lock(&zspage->lock);
-}
-
-static void migrate_write_unlock(struct zspage *zspage)
-{
- write_unlock(&zspage->lock);
-}
-
#ifdef CONFIG_COMPACTION
static const struct movable_operations zsmalloc_mops;
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
- struct page *newpage, struct page *oldpage)
+ struct zpdesc *newzpdesc, struct zpdesc *oldzpdesc)
{
- struct page *page;
- struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, };
+ struct zpdesc *zpdesc;
+ struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, };
+ unsigned int first_obj_offset;
int idx = 0;
- page = get_first_page(zspage);
+ zpdesc = get_first_zpdesc(zspage);
do {
- if (page == oldpage)
- pages[idx] = newpage;
+ if (zpdesc == oldzpdesc)
+ zpdescs[idx] = newzpdesc;
else
- pages[idx] = page;
+ zpdescs[idx] = zpdesc;
idx++;
- } while ((page = get_next_page(page)) != NULL);
+ } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL);
- create_page_chain(class, zspage, pages);
- set_first_obj_offset(newpage, get_first_obj_offset(oldpage));
+ create_page_chain(class, zspage, zpdescs);
+ first_obj_offset = get_first_obj_offset(oldzpdesc);
+ set_first_obj_offset(newzpdesc, first_obj_offset);
if (unlikely(ZsHugePage(zspage)))
- newpage->index = oldpage->index;
- __SetPageMovable(newpage, &zsmalloc_mops);
+ newzpdesc->handle = oldzpdesc->handle;
+ __zpdesc_set_movable(newzpdesc, &zsmalloc_mops);
}
static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
@@ -1759,76 +1730,81 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
struct zs_pool *pool;
struct size_class *class;
struct zspage *zspage;
- struct page *dummy;
+ struct zpdesc *dummy;
+ struct zpdesc *newzpdesc = page_zpdesc(newpage);
+ struct zpdesc *zpdesc = page_zpdesc(page);
void *s_addr, *d_addr, *addr;
unsigned int offset;
unsigned long handle;
unsigned long old_obj, new_obj;
unsigned int obj_idx;
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
-
- /* We're committed, tell the world that this is a Zsmalloc page. */
- __SetPageZsmalloc(newpage);
+ VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc));
/* The page is locked, so this pointer must remain valid */
- zspage = get_zspage(page);
+ zspage = get_zspage(zpdesc);
pool = zspage->pool;
/*
* The pool migrate_lock protects the race between zpage migration
* and zs_free.
*/
- write_lock(&pool->migrate_lock);
+ write_lock(&pool->lock);
class = zspage_class(pool, zspage);
/*
* the class lock protects zpage alloc/free in the zspage.
*/
spin_lock(&class->lock);
- /* the migrate_write_lock protects zpage access via zs_map_object */
- migrate_write_lock(zspage);
+ /* the zspage write_lock protects zpage access via zs_obj_read/write() */
+ if (!zspage_write_trylock(zspage)) {
+ spin_unlock(&class->lock);
+ write_unlock(&pool->lock);
+ return -EINVAL;
+ }
- offset = get_first_obj_offset(page);
- s_addr = kmap_local_page(page);
+ /* We're committed, tell the world that this is a Zsmalloc page. */
+ __zpdesc_set_zsmalloc(newzpdesc);
+
+ offset = get_first_obj_offset(zpdesc);
+ s_addr = kmap_local_zpdesc(zpdesc);
/*
* Here, any user cannot access all objects in the zspage so let's move.
*/
- d_addr = kmap_local_page(newpage);
+ d_addr = kmap_local_zpdesc(newzpdesc);
copy_page(d_addr, s_addr);
kunmap_local(d_addr);
for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE;
addr += class->size) {
- if (obj_allocated(page, addr, &handle)) {
+ if (obj_allocated(zpdesc, addr, &handle)) {
old_obj = handle_to_obj(handle);
obj_to_location(old_obj, &dummy, &obj_idx);
- new_obj = (unsigned long)location_to_obj(newpage,
- obj_idx);
+ new_obj = (unsigned long)location_to_obj(newzpdesc, obj_idx);
record_obj(handle, new_obj);
}
}
kunmap_local(s_addr);
- replace_sub_page(class, zspage, newpage, page);
+ replace_sub_page(class, zspage, newzpdesc, zpdesc);
/*
* Since we complete the data copy and set up new zspage structure,
* it's okay to release migration_lock.
*/
- write_unlock(&pool->migrate_lock);
+ write_unlock(&pool->lock);
spin_unlock(&class->lock);
- migrate_write_unlock(zspage);
+ zspage_write_unlock(zspage);
- get_page(newpage);
- if (page_zone(newpage) != page_zone(page)) {
- dec_zone_page_state(page, NR_ZSPAGES);
- inc_zone_page_state(newpage, NR_ZSPAGES);
+ zpdesc_get(newzpdesc);
+ if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) {
+ zpdesc_dec_zone_page_state(zpdesc);
+ zpdesc_inc_zone_page_state(newzpdesc);
}
- reset_page(page);
- put_page(page);
+ reset_zpdesc(zpdesc);
+ zpdesc_put(zpdesc);
return MIGRATEPAGE_SUCCESS;
}
@@ -1897,13 +1873,13 @@ static void init_deferred_free(struct zs_pool *pool)
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
{
- struct page *page = get_first_page(zspage);
+ struct zpdesc *zpdesc = get_first_zpdesc(zspage);
do {
- WARN_ON(!trylock_page(page));
- __SetPageMovable(page, &zsmalloc_mops);
- unlock_page(page);
- } while ((page = get_next_page(page)) != NULL);
+ WARN_ON(!zpdesc_trylock(zpdesc));
+ __zpdesc_set_movable(zpdesc, &zsmalloc_mops);
+ zpdesc_unlock(zpdesc);
+ } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL);
}
#else
static inline void zs_flush_migration(struct zs_pool *pool) { }
@@ -1940,7 +1916,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
* protect the race between zpage migration and zs_free
* as well as zpage allocation/free
*/
- write_lock(&pool->migrate_lock);
+ write_lock(&pool->lock);
spin_lock(&class->lock);
while (zs_can_compact(class)) {
int fg;
@@ -1955,9 +1931,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
if (!src_zspage)
break;
- migrate_write_lock(src_zspage);
+ if (!zspage_write_trylock(src_zspage))
+ break;
+
migrate_zspage(pool, src_zspage, dst_zspage);
- migrate_write_unlock(src_zspage);
+ zspage_write_unlock(src_zspage);
fg = putback_zspage(class, src_zspage);
if (fg == ZS_INUSE_RATIO_0) {
@@ -1967,14 +1945,14 @@ static unsigned long __zs_compact(struct zs_pool *pool,
src_zspage = NULL;
if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100
- || rwlock_is_contended(&pool->migrate_lock)) {
+ || rwlock_is_contended(&pool->lock)) {
putback_zspage(class, dst_zspage);
dst_zspage = NULL;
spin_unlock(&class->lock);
- write_unlock(&pool->migrate_lock);
+ write_unlock(&pool->lock);
cond_resched();
- write_lock(&pool->migrate_lock);
+ write_lock(&pool->lock);
spin_lock(&class->lock);
}
}
@@ -1986,7 +1964,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
putback_zspage(class, dst_zspage);
spin_unlock(&class->lock);
- write_unlock(&pool->migrate_lock);
+ write_unlock(&pool->lock);
return pages_freed;
}
@@ -1998,10 +1976,10 @@ unsigned long zs_compact(struct zs_pool *pool)
unsigned long pages_freed = 0;
/*
- * Pool compaction is performed under pool->migrate_lock so it is basically
+ * Pool compaction is performed under pool->lock so it is basically
* single-threaded. Having more than one thread in __zs_compact()
- * will increase pool->migrate_lock contention, which will impact other
- * zsmalloc operations that need pool->migrate_lock.
+ * will increase pool->lock contention, which will impact other
+ * zsmalloc operations that need pool->lock.
*/
if (atomic_xchg(&pool->compaction_in_progress, 1))
return 0;
@@ -2123,7 +2101,7 @@ struct zs_pool *zs_create_pool(const char *name)
return NULL;
init_deferred_free(pool);
- rwlock_init(&pool->migrate_lock);
+ rwlock_init(&pool->lock);
atomic_set(&pool->compaction_in_progress, 0);
pool->name = kstrdup(name, GFP_KERNEL);
@@ -2262,23 +2240,11 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool);
static int __init zs_init(void)
{
- int ret;
-
- ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare",
- zs_cpu_prepare, zs_cpu_dead);
- if (ret)
- goto out;
-
#ifdef CONFIG_ZPOOL
zpool_register_driver(&zs_zpool_driver);
#endif
-
zs_stat_init();
-
return 0;
-
-out:
- return ret;
}
static void __exit zs_exit(void)
@@ -2286,8 +2252,6 @@ static void __exit zs_exit(void)
#ifdef CONFIG_ZPOOL
zpool_unregister_driver(&zs_zpool_driver);
#endif
- cpuhp_remove_state(CPUHP_MM_ZS_PREPARE);
-
zs_stat_exit();
}
diff --git a/mm/zswap.c b/mm/zswap.c
index f6316b66fb23..455e9425c5f5 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -43,7 +43,7 @@
* statistics
**********************************/
/* The number of compressed pages currently stored in zswap */
-atomic_long_t zswap_stored_pages = ATOMIC_INIT(0);
+atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0);
/*
* The statistics below are not protected from concurrent access for
@@ -62,6 +62,8 @@ static u64 zswap_reject_reclaim_fail;
static u64 zswap_reject_compress_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
+/* Load or writeback failed due to decompression failure */
+static u64 zswap_decompress_fail;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
@@ -251,7 +253,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
struct zswap_pool *pool;
char name[38]; /* 'zswap' + 32 char (max) num + \0 */
gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
- int ret;
+ int ret, cpu;
if (!zswap_has_pool) {
/* if either are unset, pool initialization failed, and we
@@ -285,6 +287,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
goto error;
}
+ for_each_possible_cpu(cpu)
+ mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex);
+
ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
&pool->node);
if (ret)
@@ -817,36 +822,41 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
- struct crypto_acomp *acomp;
- struct acomp_req *req;
+ struct crypto_acomp *acomp = NULL;
+ struct acomp_req *req = NULL;
+ u8 *buffer = NULL;
int ret;
- mutex_init(&acomp_ctx->mutex);
-
- acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!acomp_ctx->buffer)
- return -ENOMEM;
+ buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ if (!buffer) {
+ ret = -ENOMEM;
+ goto fail;
+ }
acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
if (IS_ERR(acomp)) {
pr_err("could not alloc crypto acomp %s : %ld\n",
pool->tfm_name, PTR_ERR(acomp));
ret = PTR_ERR(acomp);
- goto acomp_fail;
+ goto fail;
}
- acomp_ctx->acomp = acomp;
- acomp_ctx->is_sleepable = acomp_is_async(acomp);
- req = acomp_request_alloc(acomp_ctx->acomp);
+ req = acomp_request_alloc(acomp);
if (!req) {
pr_err("could not alloc crypto acomp_request %s\n",
pool->tfm_name);
ret = -ENOMEM;
- goto req_fail;
+ goto fail;
}
- acomp_ctx->req = req;
+ /*
+ * Take the mutex only after the allocations are complete, otherwise we
+ * may recurse into zswap through reclaim and attempt to take the mutex
+ * again, resulting in a deadlock.
+ */
+ mutex_lock(&acomp_ctx->mutex);
crypto_init_wait(&acomp_ctx->wait);
+
/*
* if the backend of acomp is async zip, crypto_req_done() will wakeup
* crypto_wait_req(); if the backend of acomp is scomp, the callback
@@ -855,12 +865,17 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &acomp_ctx->wait);
+ acomp_ctx->buffer = buffer;
+ acomp_ctx->acomp = acomp;
+ acomp_ctx->is_sleepable = acomp_is_async(acomp);
+ acomp_ctx->req = req;
+ mutex_unlock(&acomp_ctx->mutex);
return 0;
-req_fail:
- crypto_free_acomp(acomp_ctx->acomp);
-acomp_fail:
- kfree(acomp_ctx->buffer);
+fail:
+ if (acomp)
+ crypto_free_acomp(acomp);
+ kfree(buffer);
return ret;
}
@@ -868,18 +883,60 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ struct acomp_req *req;
+ struct crypto_acomp *acomp;
+ u8 *buffer;
- if (!IS_ERR_OR_NULL(acomp_ctx)) {
- if (!IS_ERR_OR_NULL(acomp_ctx->req))
- acomp_request_free(acomp_ctx->req);
- if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
- crypto_free_acomp(acomp_ctx->acomp);
- kfree(acomp_ctx->buffer);
- }
+ if (IS_ERR_OR_NULL(acomp_ctx))
+ return 0;
+
+ mutex_lock(&acomp_ctx->mutex);
+ req = acomp_ctx->req;
+ acomp = acomp_ctx->acomp;
+ buffer = acomp_ctx->buffer;
+ acomp_ctx->req = NULL;
+ acomp_ctx->acomp = NULL;
+ acomp_ctx->buffer = NULL;
+ mutex_unlock(&acomp_ctx->mutex);
+
+ /*
+ * Do the actual freeing after releasing the mutex to avoid subtle
+ * locking dependencies causing deadlocks.
+ */
+ if (!IS_ERR_OR_NULL(req))
+ acomp_request_free(req);
+ if (!IS_ERR_OR_NULL(acomp))
+ crypto_free_acomp(acomp);
+ kfree(buffer);
return 0;
}
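
Conversely, zswap_cpu_comp_dead() above detaches the per-CPU resources while holding the mutex and frees them only after unlocking, so the free paths cannot create lock dependencies against the compression paths. The same pattern in miniature, reusing the hypothetical struct ctx from the previous sketch:

	/* Sketch only: detach under the lock, free with the lock dropped. */
	static void teardown_ctx(struct ctx *c)
	{
		void *buf;

		mutex_lock(&c->mutex);
		buf = c->buffer;	/* detach */
		c->buffer = NULL;	/* lockers see NULL and retry elsewhere */
		mutex_unlock(&c->mutex);

		kfree(buf);		/* no locks held */
	}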
+static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool)
+{
+ struct crypto_acomp_ctx *acomp_ctx;
+
+ for (;;) {
+ acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+ mutex_lock(&acomp_ctx->mutex);
+ if (likely(acomp_ctx->req))
+ return acomp_ctx;
+ /*
+ * It is possible that we were migrated to a different CPU after
+ * getting the per-CPU ctx but before the mutex was acquired. If
+ * the old CPU got offlined, zswap_cpu_comp_dead() could have
+ * already freed ctx->req (among other things) and set it to
+ * NULL. Just try again on the new CPU that we ended up on.
+ */
+ mutex_unlock(&acomp_ctx->mutex);
+ }
+}
+
+static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
+{
+ mutex_unlock(&acomp_ctx->mutex);
+}
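
The helper pair above hides the migration race described in the comment: raw_cpu_ptr() may hand back the context of a CPU we have already left, so the mutex is taken and acomp_ctx->req re-checked before the context is used. A hedged sketch of the expected calling pattern (use_ctx() is a hypothetical placeholder; zswap_compress() below is the real user):

	/* Sketch only: bracket all use of the per-CPU context with get/put. */
	static bool do_acomp_work(struct zswap_pool *pool)
	{
		struct crypto_acomp_ctx *acomp_ctx;
		bool ok;

		acomp_ctx = acomp_ctx_get_cpu_lock(pool); /* returns with mutex held */
		/* ->req, ->acomp and ->buffer stay valid until the unlock below */
		ok = use_ctx(acomp_ctx);
		acomp_ctx_put_unlock(acomp_ctx);
		return ok;
	}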
+
static bool zswap_compress(struct page *page, struct zswap_entry *entry,
struct zswap_pool *pool)
{
@@ -889,14 +946,10 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
unsigned int dlen = PAGE_SIZE;
unsigned long handle;
struct zpool *zpool;
- char *buf;
gfp_t gfp;
u8 *dst;
- acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
-
- mutex_lock(&acomp_ctx->mutex);
-
+ acomp_ctx = acomp_ctx_get_cpu_lock(pool);
dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
sg_set_page(&input, page, PAGE_SIZE, 0);
@@ -927,17 +980,12 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
goto unlock;
zpool = pool->zpool;
- gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
- if (zpool_malloc_support_movable(zpool))
- gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
- alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle);
+ gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
+ alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle, page_to_nid(page));
if (alloc_ret)
goto unlock;
- buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
- memcpy(buf, dst, dlen);
- zpool_unmap_handle(zpool, handle);
-
+ zpool_obj_write(zpool, handle, dst, dlen);
entry->handle = handle;
entry->length = dlen;
@@ -949,47 +997,53 @@ unlock:
else if (alloc_ret)
zswap_reject_alloc_fail++;
- mutex_unlock(&acomp_ctx->mutex);
+ acomp_ctx_put_unlock(acomp_ctx);
return comp_ret == 0 && alloc_ret == 0;
}
-static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
+static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
{
struct zpool *zpool = entry->pool->zpool;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
- u8 *src;
+ int decomp_ret, dlen;
+ u8 *src, *obj;
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
- mutex_lock(&acomp_ctx->mutex);
+ acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
+ obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);
- src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
/*
- * If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer
- * to do crypto_acomp_decompress() which might sleep. In such cases, we must
- * resort to copying the buffer to a temporary one.
- * Meanwhile, zpool_map_handle() might return a non-linearly mapped buffer,
- * such as a kmap address of high memory or even ever a vmap address.
- * However, sg_init_one is only equipped to handle linearly mapped low memory.
- * In such cases, we also must copy the buffer to a temporary and lowmem one.
+ * zpool_obj_read_begin() might return a kmap address of highmem when
+ * acomp_ctx->buffer is not used. However, sg_init_one() does not
+ * handle highmem addresses, so copy the object to acomp_ctx->buffer.
*/
- if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) ||
- !virt_addr_valid(src)) {
- memcpy(acomp_ctx->buffer, src, entry->length);
+ if (virt_addr_valid(obj)) {
+ src = obj;
+ } else {
+ WARN_ON_ONCE(obj == acomp_ctx->buffer);
+ memcpy(acomp_ctx->buffer, obj, entry->length);
src = acomp_ctx->buffer;
- zpool_unmap_handle(zpool, entry->handle);
}
sg_init_one(&input, src, entry->length);
sg_init_table(&output, 1);
sg_set_folio(&output, folio, PAGE_SIZE, 0);
acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
- BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
- BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
- mutex_unlock(&acomp_ctx->mutex);
+ decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+ dlen = acomp_ctx->req->dlen;
- if (src != acomp_ctx->buffer)
- zpool_unmap_handle(zpool, entry->handle);
+ zpool_obj_read_end(zpool, entry->handle, obj);
+ acomp_ctx_put_unlock(acomp_ctx);
+
+ if (!decomp_ret && dlen == PAGE_SIZE)
+ return true;
+
+ zswap_decompress_fail++;
+ pr_alert_ratelimited("Decompression error from zswap (%d:%lu %s %u->%d)\n",
+ swp_type(entry->swpentry),
+ swp_offset(entry->swpentry),
+ entry->pool->tfm_name, entry->length, dlen);
+ return false;
}
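
As the comment in zswap_decompress() above notes, zpool_obj_read_begin() may return a kmap address of a highmem page, which sg_init_one() cannot take, so the object is bounced through the per-CPU buffer in that case. A minimal sketch of that decision, with a hypothetical helper name:

	/* Sketch only: hand sg_init_one() a lowmem linear address. */
	static const u8 *linearize_obj(const u8 *obj, u8 *bounce, size_t len)
	{
		if (virt_addr_valid(obj))
			return obj;		/* already linear lowmem */
		memcpy(bounce, obj, len);	/* kmap'd highmem: copy it down */
		return bounce;
	}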
/*********************************
@@ -1015,14 +1069,21 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
struct folio *folio;
struct mempolicy *mpol;
bool folio_was_allocated;
+ struct swap_info_struct *si;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
};
+ int ret = 0;
/* try to allocate swap cache folio */
+ si = get_swap_device(swpentry);
+ if (!si)
+ return -EEXIST;
+
mpol = get_task_policy(current);
folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
- NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
+ NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
+ put_swap_device(si);
if (!folio)
return -ENOMEM;
@@ -1034,8 +1095,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
* and freed when invalidated by the concurrent shrinker anyway.
*/
if (!folio_was_allocated) {
- folio_put(folio);
- return -EEXIST;
+ ret = -EEXIST;
+ goto out;
}
/*
@@ -1048,14 +1109,17 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
* be dereferenced.
*/
tree = swap_zswap_tree(swpentry);
- if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) {
- delete_from_swap_cache(folio);
- folio_unlock(folio);
- folio_put(folio);
- return -ENOMEM;
+ if (entry != xa_load(tree, offset)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (!zswap_decompress(entry, folio)) {
+ ret = -EIO;
+ goto out;
}
- zswap_decompress(entry, folio);
+ xa_erase(tree, offset);
count_vm_event(ZSWPWB);
if (entry->objcg)
@@ -1071,9 +1135,14 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
/* start writeback */
__swap_writepage(folio, &wbc);
- folio_put(folio);
- return 0;
+out:
+ if (ret && ret != -EEXIST) {
+ delete_from_swap_cache(folio);
+ folio_unlock(folio);
+ }
+ folio_put(folio);
+ return ret;
}
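
zswap_writeback_entry() above now pins the swap device before the swapcache lookup and funnels every post-allocation failure through a single out: label; only errors other than -EEXIST (where the folio already belonged to someone else) delete the folio from the swap cache and unlock it. A small sketch of that single-exit shape, with hypothetical step names standing in for the real checks:

	/* Sketch only: consolidated error unwinding as in the hunk above. */
	static int writeback_like(void)
	{
		int ret = 0;

		if (!folio_was_ours()) {	/* hypothetical check */
			ret = -EEXIST;		/* someone else owns it; just drop our ref */
			goto out;
		}
		if (!entry_still_valid()) {	/* hypothetical check */
			ret = -ENOMEM;
			goto out;
		}
		/* ... decompress and start writeback ... */
	out:
		if (ret && ret != -EEXIST)
			drop_from_swapcache();	/* hypothetical: delete + unlock */
		put_our_ref();			/* hypothetical: folio_put() in the real code */
		return ret;
	}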
/*********************************
@@ -1156,7 +1225,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
/*
* It's safe to drop the lock here because we return either
- * LRU_REMOVED_RETRY or LRU_RETRY.
+ * LRU_REMOVED_RETRY, LRU_RETRY or LRU_STOP.
*/
spin_unlock(&l->lock);
@@ -1409,9 +1478,9 @@ resched:
* main API
**********************************/
-static ssize_t zswap_store_page(struct page *page,
- struct obj_cgroup *objcg,
- struct zswap_pool *pool)
+static bool zswap_store_page(struct page *page,
+ struct obj_cgroup *objcg,
+ struct zswap_pool *pool)
{
swp_entry_t page_swpentry = page_swap_entry(page);
struct zswap_entry *entry, *old;
@@ -1420,7 +1489,7 @@ static ssize_t zswap_store_page(struct page *page,
entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
if (!entry) {
zswap_reject_kmemcache_fail++;
- return -EINVAL;
+ return false;
}
if (!zswap_compress(page, entry, pool))
@@ -1447,13 +1516,17 @@ static ssize_t zswap_store_page(struct page *page,
/*
* The entry is successfully compressed and stored in the tree, there is
- * no further possibility of failure. Grab refs to the pool and objcg.
- * These refs will be dropped by zswap_entry_free() when the entry is
- * removed from the tree.
+ * no further possibility of failure. Grab refs to the pool and objcg,
+ * charge zswap memory, and increment zswap_stored_pages.
+ * The opposite actions will be performed by zswap_entry_free()
+ * when the entry is removed from the tree.
*/
zswap_pool_get(pool);
- if (objcg)
+ if (objcg) {
obj_cgroup_get(objcg);
+ obj_cgroup_charge_zswap(objcg, entry->length);
+ }
+ atomic_long_inc(&zswap_stored_pages);
/*
* We finish initializing the entry while it's already in xarray.
@@ -1474,13 +1547,13 @@ static ssize_t zswap_store_page(struct page *page,
zswap_lru_add(&zswap_list_lru, entry);
}
- return entry->length;
+ return true;
store_failed:
zpool_free(pool->zpool, entry->handle);
compress_failed:
zswap_entry_cache_free(entry);
- return -EINVAL;
+ return false;
}
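
The comment in zswap_store_page() above ties the per-page bookkeeping (pool ref, objcg ref and charge, zswap_stored_pages) to its undo in zswap_entry_free(). zswap_entry_free() itself is not part of this diff; the sketch below only illustrates the expected symmetric teardown, not the actual function:

	/* Sketch only: the inverse of the bookkeeping done at store time. */
	static void entry_free_bookkeeping(struct zswap_entry *entry)
	{
		atomic_long_dec(&zswap_stored_pages);
		if (entry->objcg) {
			obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
			obj_cgroup_put(entry->objcg);
		}
		zswap_pool_put(entry->pool);
	}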
bool zswap_store(struct folio *folio)
@@ -1490,7 +1563,6 @@ bool zswap_store(struct folio *folio)
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg = NULL;
struct zswap_pool *pool;
- size_t compressed_bytes = 0;
bool ret = false;
long index;
@@ -1528,20 +1600,14 @@ bool zswap_store(struct folio *folio)
for (index = 0; index < nr_pages; ++index) {
struct page *page = folio_page(folio, index);
- ssize_t bytes;
- bytes = zswap_store_page(page, objcg, pool);
- if (bytes < 0)
+ if (!zswap_store_page(page, objcg, pool))
goto put_pool;
- compressed_bytes += bytes;
}
- if (objcg) {
- obj_cgroup_charge_zswap(objcg, compressed_bytes);
+ if (objcg)
count_objcg_events(objcg, ZSWPOUT, nr_pages);
- }
- atomic_long_add(nr_pages, &zswap_stored_pages);
count_vm_events(ZSWPOUT, nr_pages);
ret = true;
@@ -1576,7 +1642,27 @@ check_old:
return ret;
}
-bool zswap_load(struct folio *folio)
+/**
+ * zswap_load() - load a folio from zswap
+ * @folio: folio to load
+ *
+ * Return: 0 on success, with the folio unlocked and marked up-to-date, or one
+ * of the following error codes:
+ *
+ * -EIO: if the swapped out content was in zswap, but could not be loaded
+ * into the page due to a decompression failure. The folio is unlocked, but
+ * NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page()
+ * will SIGBUS).
+ *
+ * -EINVAL: if the swapped out content was in zswap, but the page belongs
+ * to a large folio, which is not supported by zswap. The folio is unlocked,
+ * but NOT marked up-to-date, so that an IO error is emitted (e.g.
+ * do_swap_page() will SIGBUS).
+ *
+ * -ENOENT: if the swapped out content was not in zswap. The folio remains
+ * locked on return.
+ */
+int zswap_load(struct folio *folio)
{
swp_entry_t swp = folio->swap;
pgoff_t offset = swp_offset(swp);
@@ -1587,18 +1673,32 @@ bool zswap_load(struct folio *folio)
VM_WARN_ON_ONCE(!folio_test_locked(folio));
if (zswap_never_enabled())
- return false;
+ return -ENOENT;
/*
* Large folios should not be swapped in while zswap is being used, as
* they are not properly handled. Zswap does not properly load large
* folios, and a large folio may only be partially in zswap.
- *
- * Return true without marking the folio uptodate so that an IO error is
- * emitted (e.g. do_swap_page() will sigbus).
*/
- if (WARN_ON_ONCE(folio_test_large(folio)))
- return true;
+ if (WARN_ON_ONCE(folio_test_large(folio))) {
+ folio_unlock(folio);
+ return -EINVAL;
+ }
+
+ entry = xa_load(tree, offset);
+ if (!entry)
+ return -ENOENT;
+
+ if (!zswap_decompress(entry, folio)) {
+ folio_unlock(folio);
+ return -EIO;
+ }
+
+ folio_mark_uptodate(folio);
+
+ count_vm_event(ZSWPIN);
+ if (entry->objcg)
+ count_objcg_events(entry->objcg, ZSWPIN, 1);
/*
* When reading into the swapcache, invalidate our entry. The
@@ -1612,27 +1712,14 @@ bool zswap_load(struct folio *folio)
* files, which reads into a private page and may free it if
* the fault fails. We remain the primary owner of the entry.)
*/
- if (swapcache)
- entry = xa_erase(tree, offset);
- else
- entry = xa_load(tree, offset);
-
- if (!entry)
- return false;
-
- zswap_decompress(entry, folio);
-
- count_vm_event(ZSWPIN);
- if (entry->objcg)
- count_objcg_events(entry->objcg, ZSWPIN, 1);
-
if (swapcache) {
- zswap_entry_free(entry);
folio_mark_dirty(folio);
+ xa_erase(tree, offset);
+ zswap_entry_free(entry);
}
- folio_mark_uptodate(folio);
- return true;
+ folio_unlock(folio);
+ return 0;
}
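
The kerneldoc above spells out the new integer contract for zswap_load(): 0 leaves the folio unlocked and uptodate, -ENOENT leaves it locked for the caller to read from the backing device, and -EIO/-EINVAL leave it unlocked but not uptodate so the fault path reports an I/O error. A hedged sketch of a caller dispatching on those values (the real call site lives in the swap read path and is not shown in this diff; read_from_swapfile() is a hypothetical placeholder):

	/* Sketch only: how a swap-in path might consume the new return codes. */
	static void swap_read_folio_sketch(struct folio *folio)
	{
		switch (zswap_load(folio)) {
		case 0:		/* decompressed; unlocked and uptodate */
			return;
		case -ENOENT:	/* not in zswap; still locked, go to the device */
			read_from_swapfile(folio);	/* hypothetical */
			return;
		default:	/* -EIO or -EINVAL: unlocked, !uptodate -> I/O error */
			return;
		}
	}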
void zswap_invalidate(swp_entry_t swp)
@@ -1727,6 +1814,8 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, &zswap_reject_compress_fail);
debugfs_create_u64("reject_compress_poor", 0444,
zswap_debugfs_root, &zswap_reject_compress_poor);
+ debugfs_create_u64("decompress_fail", 0444,
+ zswap_debugfs_root, &zswap_decompress_fail);
debugfs_create_u64("written_back_pages", 0444,
zswap_debugfs_root, &zswap_written_back_pages);
debugfs_create_file("pool_total_size", 0444,